# This R code pertains to Sec. 7.3 of E&T.
#
# I have a web page,
#   http://mason.gmu.edu/~csutton/EandTT74.txt,
# which contains the data shown in Table 7.4 on p. 72 of E&T.
# Below, I will read the data into R from this file, and then show
# the data.
# (Note: I don't actually have to put header=FALSE, since that is
# the default, but I'll include it anyway.)
cholo <- read.table("http://mason.gmu.edu/~csutton/EandTT74.txt", header=FALSE)
cholo
#
# It can be noted that the columns/variables do not have nice names
# (the V1 and V2 names are due to R). I can name the
# columns/variables using R's names function.
names(cholo) <- c("x", "y")
cholo
attach(cholo)
#
# Now let's look at a scatter plot (like Fig. 7.5 on p. 73 of E&T).
plot(x, y)
abline(h = 0, col = 8)
#
# Let's consider some OLS regression models, starting with a simple
# 1st-order regression model.
ols1 <- lm( y ~ x )
summary(ols1)
names(ols1)
plot( x, ols1$res )
abline( h = 0, col = 8 )
#
# Now, let's examine a 2nd-order model.
ols2 <- lm( y ~ x + I(x^2) )
summary(ols2)
plot( x, ols2$res )
abline( h = 0, col = 8 )
#
# Now, let's examine a 3rd-order model.
ols3 <- lm( y ~ x + I(x^2) + I(x^3) )
summary(ols3)
plot( x, ols3$res )
abline( h = 0, col = 8 )
# Some may prefer the 3rd-order model over the 2nd-order model.
# (Perhaps when we cover cross-validation we'll revisit these data
# and see which OLS model c-v indicates is best.) But since the
# focus in Ch. 7 of E&T is on comparing a 2nd-order OLS model to a
# loess fit, let's now move on to loess.
#
# Let's take a look at a description of the loess function.
?loess
#
# To match the book I'll override one of the default settings and
# use a local 1st-order fit instead of a local 2nd-order fit. (A
# local 1st-order fit can allow more "freedom" than a global
# 2nd-order fit, and a local 2nd-order fit may overfit the data.)
lo1a <- loess( y ~ x, degree=1 )
summary(lo1a)
plot( x, lo1a$res )
abline( h = 0, col = 8 )
#
# Let's look at a plot similar to Fig. 7.6 on p. 74 of E&T.
plot(x, y)
abline(h = 0, col = 8)
points(x, ols2$fit, type="l", col=4)
points(x, lo1a$fit, type="l", col=2)
legend( 10, 100, legend=c("ols2", "lo1a"), fill=c(4,2) )
#
# The loess fit done here using the default settings to control the
# smoothness is much smoother than the one considered in E&T. In
# order to get results closer to those given in E&T, it'll be
# necessary to override another default. (Note: In practice, one
# could use cross-validation to select a good value for the
# smoothness parameter.) Although it appears as though a smaller
# value may be needed to more closely match the loess fit of E&T,
# it seems as though the loess function won't accept a span value
# less than 0.5. (Note: It may be that E&T's fit is too wiggly, and
# so it may be better not to match it so closely.)
lo1b <- loess( y ~ x, span=0.5, degree=1 )
summary(lo1b)
plot( x, lo1b$res )
abline( h = 0, col = 8 )
#
plot(x, y)
abline(h = 0, col = 8)
points(x, ols2$fit, type="l", col=4)
points(x, lo1b$fit, type="l", col=2)
legend( 10, 100, legend=c("ols2", "lo1b"), fill=c(4,2) )
#
# One can use an alternative R function to do local regression:
# lowess instead of loess. The lowess function allows us to obtain
# a local regression fit that more closely matches the one shown in
# E&T.
lo1c <- lowess(x, y, f=0.3)
names(lo1c)
?lowess
#
# Note: Since the data frame cholo is attached, x and y are the
# original data values. lo1c$x is the same as x (the x values are
# already sorted), and lo1c$y holds the fitted values from the last
# local regression fit.
x; y; lo1c$x; lo1c$y
plot(x, y)
abline(h = 0, col = 8)
points(x, ols2$fit, type="l", col=4)
points(x, lo1b$fit, type="l", col=2)
points(x, lo1c$y, type="l", col=3)
legend( 10, 100, legend=c("ols2", "loess", "lowess"), fill=c(4,2,3) )
#
# The last local regression fit matches the one in E&T closely (if
# not exactly).
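#
# An aside (this check isn't in E&T): since lowess just returns the
# sorted x values along with the corresponding fitted values, a
# fitted value at an x that isn't in the data can be obtained by
# linear interpolation with R's approx function. A minimal sketch,
# assuming the x of interest (80 here) lies within the range of the
# data; ties=mean averages fitted values at duplicated x's, which
# is harmless since lowess assigns equal fitted values to tied x's.
approx(lo1c$x, lo1c$y, xout=80, ties=mean)$y
# This idea comes up again below, where an estimate of E(Y|x=80) is
# needed from the lowess fit.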
#
# Now I'll do some bootstrapping --- to obtain results similar to
# those given in Table 7.5 on p. 80 of E&T, and also some other
# results given on p. 80 of E&T. I'll store the results
# corresponding to Table 7.5 in a matrix.
#
# First I'll create a 2 by 4 matrix with entries initialized to 0.
Table7.5 <- matrix(0, 2, 4)
Table7.5
#
# Now I'll obtain the entries I want for the first row, which are
# estimates of E(Y|x=60) and E(Y|x=100). Since only the 76th x
# value equals 60, the estimates of E(Y|x=60) will be the 76th
# fitted values. We can use any of the last 8 fitted values for the
# estimates of E(Y|x=100).
x[76]
Table7.5[1,1] <- ols2$fit[76]
Table7.5[1,2] <- lo1c$y[76]
Table7.5[1,3] <- ols2$fit[164]
Table7.5[1,4] <- lo1c$y[164]
#
# Estimates of E(Y|x=80) are also needed for the estimates of the
# slope difference, theta (given by (7.29) on p. 80 of E&T). Since
# none of the values of x in the data set equal 80, the estimates
# of E(Y|x=80) will have to be obtained a different way.
anotherx <- c(80)
anothery <- c(0)   # Note: It doesn't matter what is put for anothery.
detach(cholo)   # Note: I don't understand the error message this produces.
moredata <- data.frame( cbind(anotherx, anothery) )
names(moredata) <- c("x", "y")
attach(moredata)
yhat80ols2 <- predict(ols2, newdata=moredata)
yhat80ols2
# Unfortunately, the predict function doesn't work with the object
# lo1c (the output of lowess is just a list, not a fitted model
# object). So to obtain the desired predicted value I'll just
# interpolate linearly between the fitted values at the 94th and
# 95th x values, which bracket 80.
yhat80lo1c <- ( lo1c$y[94] + 2*lo1c$y[95] )/3
yhat80lo1c
# This matches one of the two r(80) values given in (7.28) on p. 79
# of E&T.
#
# Now for the estimates of theta (based on the original data).
thetahat.ols2 <- ( ols2$fit[164] - 2*yhat80ols2 + ols2$fit[76] )/20
thetahat.lo1c <- ( lo1c$y[164] - 2*yhat80lo1c + lo1c$y[76] )/20
thetahat.ols2
thetahat.lo1c
# These match the values of 0.17 given on p. 79 of E&T and 1.59
# given on p. 80 of E&T.
#
# Finally (!!!) we're ready to obtain some estimated standard
# errors via bootstrapping. We'll get them to correspond to the 2nd
# row of Table 7.5 on p. 80 of E&T, and also for the estimator of
# theta based on local regression.
#
# Let's bring back the cholo data frame first.
detach(moredata)
attach(cholo)
#
# Now to add an id variable.
id <- c(1:164)
idcholo <- data.frame( cbind( id, x, y ) )
names(idcholo) <- c("id","x","y")
idcholo
detach(cholo)
attach(idcholo)
#
# Now I'll define a function I've used in previous examples. Each
# row of the returned matrix is one bootstrap sample, drawn with
# replacement, from x.
cdsbootstrap <- function(x, B)
{
  bssamples <- matrix( sample( x, size=B*length(x), replace=T ), nrow=B )
  return(bssamples)
}
#
# Now I'll use this function to create 50 bootstrap samples of id
# numbers.
set.seed(321)
idbss <- cdsbootstrap( id, 50 )
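#
# A quick sanity check of my own (not in E&T): count how many of
# the 50 bootstrap samples contain at least one x value equal to 60
# and at least one equal to 100, by applying a small anonymous
# function to each row of idbss. (Any count below 50 means some
# bootstrap samples lack that x value --- exactly the complication
# discussed next.)
sum( apply( idbss, 1, function(r) any( x[r] == 60 ) ) )
sum( apply( idbss, 1, function(r) any( x[r] == 100 ) ) )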
#
# Now I'll set up vectors to store the various bootstrap replicates.
yhat60ols <- numeric()
yhat60local <- numeric()
yhat100ols <- numeric()
yhat100local <- numeric()
thetahatlocal <- numeric()
#
# A problem with getting the predicted values for the local
# regression fits based on the bootstrap samples is that the 76th x
# value won't always be 60 and the 164th x value won't always be
# 100. In fact, a bootstrap sample may not even contain any x
# values equal to 60. There is a good chance that each bootstrap
# sample will contain at least one x value equal to 100, and so
# getting an estimate of E(Y|x=100) should not be so difficult. But
# getting estimates of E(Y|x=60) and E(Y|x=80) will take some work.
#
# I'll define some functions. The first one scans for an x value
# equal to 100 and returns the corresponding fitted value, with
# 9999 serving as a sentinel meaning "no such x value was found".
yh100 <- function(x, yhat)
{
  yh <- 9999
  for (i in 1:164) if (x[i] == 100) yh <- yhat[i]
  return(yh)
}
# At the end, one should look at all of the bootstrap replicates of
# the estimates of E(Y|x=100) to make sure that none of them equal
# 9999.
#
# The next function uses weighted least squares regression to
# produce a local regression fit at some specified value. (It's
# based on the bottom half of p. 77 of E&T.) The weights are
# tricube weights based on the 49 nearest neighbors of x.star
# (49 is roughly 0.3*164, mirroring the f=0.3 used with lowess).
yhat.x.star <- function(x, y, x.star)
{
  absdiff <- abs( x - x.star )
  oabsdiff <- sort( absdiff )
  threshold <- oabsdiff[49]
  u <- numeric()
  w <- numeric()
  for (i in 1:164) {
    if (abs(x[i] - x.star) >= threshold) u[i] <- 1
    else u[i] <- abs(x[i] - x.star)/threshold
  }
  for (i in 1:164) w[i] <- ( 1 - u[i]^3 )^3
  wls <- lm(y ~ x, weights=w)
  yh <- wls$coef[1] + wls$coef[2]*x.star
  return(yh)
}
# For some reason (unknown to me), this function doesn't give
# fitted values that match those produced by lowess. (Perhaps it's
# because lowess, by default, does iter=3 robustifying iterations,
# whereas this function does a single weighted fit.)
myyhat60 <- yhat.x.star(x, y, 60)
myyhat60
# The lowess function gave a value of about 34.03.
#
# Now I'll use a loop, and do the many operations needed to obtain
# replicates of 5 different estimators each pass through the loop.
for (b in 1:50) {
  xx <- x[idbss[b,]]
  yy <- y[idbss[b,]]
  ols <- lm( yy ~ xx + I(xx^2) )
  locreg <- lowess(xx, yy, f=0.3)
  yhat60ols[b] <- ols$coef[1] + ols$coef[2]*60 + ols$coef[3]*3600
  yhat100ols[b] <- ols$coef[1] + ols$coef[2]*100 + ols$coef[3]*10000
  yhat60 <- yhat.x.star(xx, yy, 60)
  yhat80 <- yhat.x.star(xx, yy, 80)
  # (lowess sorts by x, so its fitted values locreg$y line up with
  # the sorted values locreg$x, not with xx; hence locreg$x is
  # passed to yh100 here)
  yhat100 <- yh100(locreg$x, locreg$y)
  yhat60local[b] <- yhat60
  yhat100local[b] <- yhat100
  thetahatlocal[b] <- (yhat100 - 2*yhat80 + yhat60)/20
}
#
# Now to compute the estimated standard errors.
Table7.5[2,1] <- sd(yhat60ols)
Table7.5[2,2] <- sd(yhat60local)
Table7.5[2,3] <- sd(yhat100ols)
Table7.5[2,4] <- sd(yhat100local)
est.se.thetahat <- sd(thetahatlocal)
#
# Here are results corresponding to Table 7.5 on p. 80 of E&T.
Table7.5
#
# Here's the estimated standard error of the estimator of theta
# based on local regression.
est.se.thetahat
# E&T got 0.61.
#
# Finally --- check to see that none of the bootstrap replicates of
# the local regression estimate of E(Y|x=100) equal 9999 (which
# would indicate a problem).
yhat100local
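#
# A final aside (not in E&T): the 9999 check can also be done
# programmatically, and the bootstrap SE can be turned into a rough
# interval estimate for theta. A minimal sketch, assuming
# approximate normality of the local regression estimator of theta
# (and keeping in mind that B = 50 is small, so the SE estimate
# itself is somewhat noisy):
any( yhat100local == 9999 )   # should be FALSE
thetahat.lo1c + c(-1, 1) * qnorm(0.975) * est.se.thetahat
#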