king <- read.table( "http://www.ics.uci.edu/~dgillen/Stat211/Data/KingCounty2001_data.txt", header=TRUE ) king$newpar <- king$parity king$newpar[king$newpar > 3] <- 3 king$newpar <- factor( king$newpar ) king$low <- factor( cut(king$bwt, breaks = c(0, 2500, 10000), labels = c("Yes", "No")) ) king$educcat <- rep( 0, 2500 ) # less than a high school education king$educcat[king$educ == 12] <- 1 # high school education king$educcat[king$educ > 12] <- 2 # more than a high school education king$educcat <- factor( king$educcat, labels = c("Low", "HighSchool", "College") ) ## ##### ##### Characteristics by program participation ##### ## hist( king$bwt ) table( king$firstep ) lapply( split( king, king$firstep ), summary ) ## #### Model 1: No adjustment ## model.1 <- lm( bwt ~ firstep*educcat,data = king ) model.1.null <- lm( bwt ~ educcat, data = king ) ## #### Model 2: Adjustments; age race married smoker wpre newpar ## model.2 <- lm( bwt ~ firstep*educcat + age + race + married + smoker + wpre + newpar, data = king ) model.2.null <- lm( bwt ~ educcat + age + race + married + smoker + wpre + newpar, data = king ) ## #### Model 3: additional adjustment for alcohol use ## model.3 <- lm( bwt ~ firstep*educcat + age + race + married + smoker + drinker + wpre + newpar, data = king ) model.3.null <- lm( bwt ~ educcat + age + race + married + smoker + drinker + wpre + newpar, data = king ) ## #### Model 4: No interaction model (with apriori adjustment) ## model.4 <- lm( bwt ~ firstep + educcat + age + race + married + smoker + wpre + newpar, data = king ) model.4.null <- lm( bwt ~ educcat + age + race + married + smoker + wpre + newpar, data = king ) ## #### Look at the results from the three models getContr() and linContr() are #### functions defined below to compute the lincombinations for within education effects ## contr.names <- list( c( "firstepYes" ), c( "firstepYes", "firstepYes:educcatHighSchool" ), c( "firstepYes", "firstepYes:educcatCollege" ) ) rslt <- rbind( linContr.lm( model.1, contr.names ), linContr.lm( model.2, contr.names ), linContr.lm( model.3, contr.names ) ) contr.names <- list( c( "firstepYes" ) ) linContr.lm( model.4, contr.names ) lrtest( model.1.null, model.1 ) lrtest( model.2.null, model.2 ) lrtest( model.3.null, model.3 ) # round( rbind( c(lrt.stat.1, 1 - pchisq(lrt.stat.1, 3)), c(lrt.stat.2, 1 - pchisq(lrt.stat.2, 3)), c(lrt.stat.3, 1 - pchisq(lrt.stat.3, 3)) ), 3 ) # getContr <- function( fit, contr.names ){ rslt <- NULL for( i in 1:length(contr.names) ){ rslt <- rbind( rslt, is.element( names( fit$coef ), contr.names[[i]] ) ) } rslt <- ifelse( rslt==TRUE, 1, 0 ) return( rslt ) } linContr.lm <- function( fit, contr.names, sig.digs=2 ){ contr <- getContr( fit, contr.names ) est <- round( contr %*% fit$coefficients, sig.digs ) sigma <- summary( fit )$sigma^2 * summary( fit )$cov.unscaled estVar <- diag( contr %*% sigma %*% t(contr) ) ci.low <- round( est - qnorm(.975) * sqrt(estVar), sig.digs ) ci.hi <- round( est + qnorm(.975) * sqrt(estVar), sig.digs ) pVal <- round( 2*( 1- pt(abs(est/sqrt(estVar)), df=fit$df.residual) ), 3 ) rslt <- paste( est, " (", ci.low, ", ", ci.hi, ")\t ", pVal, sep="" ) return(rslt) } ## ##### ##### Diagnostics ##### ## ##### Studentized residuals ## hat.ii <- lm.influence( model.2 )$hat mse <- summary( model.2 )$sigma^2 stresid <- model.2$residuals / sqrt(mse*(1-hat.ii)) plot( 1:2500, stresid, xlab="Observation", ylab="Studentized Residual" ) abline( h=0, col="red", lty=2 ) abline( h=c(-2,2), col="red" ) ## ##### Leverage ## plot( 1:2500, hat.ii, xlab="Observation", ylab="Leverage" ) abline( h=17/2500*c(1:3), col="red", lty=c(1:3) ) king[ hat.ii >.03,][1:5,] ## ##### Residuals vs. Leverage ## plot( stresid^2, hat.ii, xlab="Squared Studentized Reisual", ylab="Leverage" ) abline( h=3*17/2500, col="red" ) abline( v=qchisq(.95, df=1), col="red" ) king[ (hat.ii > 3*17/2500) & (stresid^2 > qchisq(.95, df=1)), ] ## ##### Cook's distance ## cooksD <- cooks.distance( model.2 ) plot(1:2500, cooksD, xlab="Observation", ylab="Cook's Distance" ) king[ cooksD > .015,] ## ##### DF betas ## plot( 1:2500, dfbetas(model.2)[,2], xlab="Observation", ylab="DFbeta - Firstep" ) king[ dfbetas(model.2)[,2] < -.2, ] ## ##### Non-constant variance ## yhat <- fitted( model.2 ) resid <- model.2$residuals plot( yhat, resid, xlab="Fitted Value", ylab="Residual" ) plot( yhat, resid^2, xlab="Fitted Value", ylab="Squared Residual" ) plot( yhat, resid^2, xlab="Fitted Value", ylab="Squared Residual", ylim=c(0,2e+06) ) lines( lowess( I(resid^2) ~ yhat ), col="red", lwd=3 )