# This R code pertains to Sec. 7.3 of E&T.
#
# I have a web page,
#   http://mason.gmu.edu/~csutton/EandTT74.txt,
# which contains the data shown in Table 7.4 on p. 72 of E&T.
# Below, I will read the data into R from this file, and then show
# the data.
# (Note: I don't actually have to put header=FALSE, since that is
# the default, but I'll include it anyway.)
cholo <- read.table("http://mason.gmu.edu/~csutton/EandTT74.txt", header=FALSE)
cholo
#
# It can be noted that the columns/variables do not have nice names
# (the V1 and V2 names are due to R). I can name the
# columns/variables using R's names function.
names(cholo) <- c("x", "y")
cholo
attach(cholo)
#
# Now let's look at a scatter plot (like Fig. 7.5 on p. 73 of E&T).
plot(x, y)
abline(h = 0, col = 8)
#
# Let's consider some OLS regression models, starting with a simple
# 1st-order regression model.
ols1 <- lm( y ~ x )
summary(ols1)
names(ols1)
plot( x, ols1$res )
abline( h = 0, col = 8 )
#
# Now, let's examine a 2nd-order model.
ols2 <- lm( y ~ x + I(x^2) )
summary(ols2)
plot( x, ols2$res )
abline( h = 0, col = 8 )
#
# Now, let's examine a 3rd-order model.
ols3 <- lm( y ~ x + I(x^2) + I(x^3) )
summary(ols3)
plot( x, ols3$res )
abline( h = 0, col = 8 )
# Some may prefer the 3rd-order model over the 2nd-order model.
# (Perhaps when we cover cross-validation we'll revisit these data
# and see which OLS model c-v indicates is best.) But since the
# focus in Ch. 7 of E&T is on comparing a 2nd-order OLS model to a
# loess fit, let's now move on to loess.
#
# Let's take a look at a description of the loess function.
?loess
#
# To match the book I'll override one of the default settings and
# use a local 1st-order fit instead of a local 2nd-order fit. (A
# local 1st-order fit can allow more "freedom" than a global
# 2nd-order fit, and a local 2nd-order fit may overfit the data.)
lo1a <- loess( y ~ x, degree=1 )
summary(lo1a)
plot( x, lo1a$res )
abline( h = 0, col = 8 )
#
# Let's look at a plot similar to Fig. 7.6 on p. 74 of E&T.
plot(x, y)
abline(h = 0, col = 8)
points(x, ols2$fit, type="l", col=4)
points(x, lo1a$fit, type="l", col=2)
legend( 10, 100, legend=c("ols2", "lo1a"), fill=c(4,2) )
#
# The loess fit done here using the default settings to control the
# smoothness is much smoother than the one considered in E&T. In
# order to get results closer to those given in E&T, it'll be
# necessary to override another default. (Note: In practice, one
# could use cross-validation to select a good value for the
# smoothness parameter.) Although it appears as though a smaller
# value may be needed to more closely match the loess fit of E&T,
# it seems as though the loess function won't accept a span value
# less than 0.5. (Note: It may be that E&T's fit is too wiggly, and
# so it may be better not to match it so closely.)
lo1b <- loess( y ~ x, span=0.5, degree=1 )
summary(lo1b)
plot( x, lo1b$res )
abline( h = 0, col = 8 )
#
plot(x, y)
abline(h = 0, col = 8)
points(x, ols2$fit, type="l", col=4)
points(x, lo1b$fit, type="l", col=2)
legend( 10, 100, legend=c("ols2", "lo1b"), fill=c(4,2) )
#
# One can use an alternative R function to do local regression:
# lowess instead of loess. The lowess function allows us to obtain
# a local regression fit that more closely matches the one shown in
# E&T.
lo1c <- lowess(x, y, f=0.3)
names(lo1c)
?lowess
#
# Note: Since the data frame cholo is attached, x and y are the
# original data values. lo1c$x is the same as x (the x values are
# already sorted), and lo1c$y holds the fitted values from the last
# local regression fit.
x; y; lo1c$x; lo1c$y
plot(x, y)
abline(h = 0, col = 8)
points(x, ols2$fit, type="l", col=4)
points(x, lo1b$fit, type="l", col=2)
points(x, lo1c$y, type="l", col=3)
legend( 10, 100, legend=c("ols2", "loess", "lowess"), fill=c(4,2,3) )
#
# The last local regression fit matches the one in E&T closely (if
# not exactly).
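#
# An aside (this check isn't in E&T): since lowess just returns the
# sorted x values along with the corresponding fitted values, a
# fitted value at an x that isn't in the data can be obtained by
# linear interpolation with R's approx function. A minimal sketch,
# assuming the x of interest (80 here) lies within the range of the
# data; ties=mean averages fitted values at duplicated x's, which
# is harmless since lowess assigns equal fitted values to tied x's.
approx(lo1c$x, lo1c$y, xout=80, ties=mean)$y
# This idea comes up again below, where an estimate of E(Y|x=80) is
# needed from the lowess fit.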
#
# Now I'll do some bootstrapping --- to obtain results similar to
# those given in Table 7.5 on p. 80 of E&T, and also some other
# results given on p. 80 of E&T. I'll store the results
# corresponding to Table 7.5 in a matrix.
#
# First I'll create a 2 by 4 matrix with entries initialized to 0.
Table7.5 <- matrix(0, 2, 4)
Table7.5
#
# Now I'll obtain the entries I want for the first row, which are
# estimates of E(Y|x=60) and E(Y|x=100). Since only the 76th x
# value equals 60, the estimates of E(Y|x=60) will be the 76th
# fitted values. We can use any of the last 8 fitted values for the
# estimates of E(Y|x=100).
x[76]
Table7.5[1,1] <- ols2$fit[76]
Table7.5[1,2] <- lo1c$y[76]
Table7.5[1,3] <- ols2$fit[164]
Table7.5[1,4] <- lo1c$y[164]
#
# Estimates of E(Y|x=80) are also needed for the estimates of the
# slope difference, theta (given by (7.29) on p. 80 of E&T). Since
# none of the values of x in the data set equal 80, the estimates
# of E(Y|x=80) will have to be obtained a different way.
anotherx <- c(80)
anothery <- c(0)   # Note: It doesn't matter what is put for anothery.
detach(cholo)   # Note: I don't understand the error message this produces.
moredata <- data.frame( cbind(anotherx, anothery) )
names(moredata) <- c("x", "y")
attach(moredata)
yhat80ols2 <- predict(ols2, newdata=moredata)
yhat80ols2
# Unfortunately, the predict function doesn't work with the object
# lo1c (the output of lowess is just a list, not a fitted model
# object). So to obtain the desired predicted value I'll just
# interpolate linearly between the fitted values at the 94th and
# 95th x values, which bracket 80.
yhat80lo1c <- ( lo1c$y[94] + 2*lo1c$y[95] )/3
yhat80lo1c
# This matches one of the two r(80) values given in (7.28) on p. 79
# of E&T.
#
# Now for the estimates of theta (based on the original data).
thetahat.ols2 <- ( ols2$fit[164] - 2*yhat80ols2 + ols2$fit[76] )/20
thetahat.lo1c <- ( lo1c$y[164] - 2*yhat80lo1c + lo1c$y[76] )/20
thetahat.ols2
thetahat.lo1c
# These match the values of 0.17 given on p. 79 of E&T and 1.59
# given on p. 80 of E&T.
#
# Finally (!!!) we're ready to obtain some estimated standard
# errors via bootstrapping. We'll get them to correspond to the 2nd
# row of Table 7.5 on p. 80 of E&T, and also for the estimator of
# theta based on local regression.
#
# Let's bring back the cholo data frame first.
detach(moredata)
attach(cholo)
#
# Now to add an id variable.
id <- c(1:164)
idcholo <- data.frame( cbind( id, x, y ) )
names(idcholo) <- c("id","x","y")
idcholo
detach(cholo)
attach(idcholo)
#
# Now I'll define a function I've used in previous examples. Each
# row of the returned matrix is one bootstrap sample, drawn with
# replacement, from x.
cdsbootstrap <- function(x, B)
{
  bssamples <- matrix( sample( x, size=B*length(x), replace=T ), nrow=B )
  return(bssamples)
}
#
# Now I'll use this function to create 50 bootstrap samples of id
# numbers.
set.seed(321)
idbss <- cdsbootstrap( id, 50 )
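#
# A quick sanity check of my own (not in E&T): count how many of
# the 50 bootstrap samples contain at least one x value equal to 60
# and at least one equal to 100, by applying a small anonymous
# function to each row of idbss. (Any count below 50 means some
# bootstrap samples lack that x value --- exactly the complication
# discussed next.)
sum( apply( idbss, 1, function(r) any( x[r] == 60 ) ) )
sum( apply( idbss, 1, function(r) any( x[r] == 100 ) ) )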
#
# Now I'll set up vectors to store the various bootstrap replicates.
yhat60ols <- numeric()
yhat60local <- numeric()
yhat100ols <- numeric()
yhat100local <- numeric()
thetahatlocal <- numeric()
#
# A problem with getting the predicted values for the local
# regression fits based on the bootstrap samples is that the 76th x
# value won't always be 60 and the 164th x value won't always be
# 100. In fact, a bootstrap sample may not even contain any x
# values equal to 60. There is a good chance that each bootstrap
# sample will contain at least one x value equal to 100, and so
# getting an estimate of E(Y|x=100) should not be so difficult. But
# getting estimates of E(Y|x=60) and E(Y|x=80) will take some work.
#
# I'll define some functions. The first one scans for an x value
# equal to 100 and returns the corresponding fitted value, with
# 9999 serving as a sentinel meaning "no such x value was found".
yh100 <- function(x, yhat)
{
  yh <- 9999
  for (i in 1:164) if (x[i] == 100) yh <- yhat[i]
  return(yh)
}
# At the end, one should look at all of the bootstrap replicates of
# the estimates of E(Y|x=100) to make sure that none of them equal
# 9999.
#
# The next function uses weighted least squares regression to
# produce a local regression fit at some specified value. (It's
# based on the bottom half of p. 77 of E&T.) The weights are
# tricube weights based on the 49 nearest neighbors of x.star
# (49 is roughly 0.3*164, mirroring the f=0.3 used with lowess).
yhat.x.star <- function(x, y, x.star)
{
  absdiff <- abs( x - x.star )
  oabsdiff <- sort( absdiff )
  threshold <- oabsdiff[49]
  u <- numeric()
  w <- numeric()
  for (i in 1:164) {
    if (abs(x[i] - x.star) >= threshold) u[i] <- 1
    else u[i] <- abs(x[i] - x.star)/threshold
  }
  for (i in 1:164) w[i] <- ( 1 - u[i]^3 )^3
  wls <- lm(y ~ x, weights=w)
  yh <- wls$coef[1] + wls$coef[2]*x.star
  return(yh)
}
# For some reason (unknown to me), this function doesn't give
# fitted values that match those produced by lowess. (Perhaps it's
# because lowess, by default, does iter=3 robustifying iterations,
# whereas this function does a single weighted fit.)
myyhat60 <- yhat.x.star(x, y, 60)
myyhat60
# The lowess function gave a value of about 34.03.
#
# Now I'll use a loop, and do the many operations needed to obtain
# replicates of 5 different estimators each pass through the loop.
for (b in 1:50) {
  xx <- x[idbss[b,]]
  yy <- y[idbss[b,]]
  ols <- lm( yy ~ xx + I(xx^2) )
  locreg <- lowess(xx, yy, f=0.3)
  yhat60ols[b] <- ols$coef[1] + ols$coef[2]*60 + ols$coef[3]*3600
  yhat100ols[b] <- ols$coef[1] + ols$coef[2]*100 + ols$coef[3]*10000
  yhat60 <- yhat.x.star(xx, yy, 60)
  yhat80 <- yhat.x.star(xx, yy, 80)
  # (lowess sorts by x, so its fitted values locreg$y line up with
  # the sorted values locreg$x, not with xx; hence locreg$x is
  # passed to yh100 here)
  yhat100 <- yh100(locreg$x, locreg$y)
  yhat60local[b] <- yhat60
  yhat100local[b] <- yhat100
  thetahatlocal[b] <- (yhat100 - 2*yhat80 + yhat60)/20
}
#
# Now to compute the estimated standard errors.
Table7.5[2,1] <- sd(yhat60ols)
Table7.5[2,2] <- sd(yhat60local)
Table7.5[2,3] <- sd(yhat100ols)
Table7.5[2,4] <- sd(yhat100local)
est.se.thetahat <- sd(thetahatlocal)
#
# Here are results corresponding to Table 7.5 on p. 80 of E&T.
Table7.5
#
# Here's the estimated standard error of the estimator of theta
# based on local regression.
est.se.thetahat
# E&T got 0.61.
#
# Finally --- check to see that none of the bootstrap replicates of
# the local regression estimate of E(Y|x=100) equal 9999 (which
# would indicate a problem).
yhat100local
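#
# A final aside (not in E&T): the 9999 check can also be done
# programmatically, and the bootstrap SE can be turned into a rough
# interval estimate for theta. A minimal sketch, assuming
# approximate normality of the local regression estimator of theta
# (and keeping in mind that B = 50 is small, so the SE estimate
# itself is somewhat noisy):
any( yhat100local == 9999 )   # should be FALSE
thetahat.lo1c + c(-1, 1) * qnorm(0.975) * est.se.thetahat
#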