# First I'll load the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(MASS)
library(Hmisc)
library(Design)
library(kknn)
library(stats)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it lets others
# reproduce the results.
set.seed(632)
#
#
# I'll generate the training data.
# I'll obtain correlated x1 and x2 values from a pair
# of independent uniform (0,1) random variables.
u1 <- runif(100, 0, 1)
u2 <- runif(100, 0, 1)
x1 <- 1 + 2*u1
x2 <- 1 + u1 + u2
plot(x1, x2)
cor(x1, x2)
#
# I'll make the response nonlinear with regard to
# both x1 and x2.
y <- sqrt(x2/x1) + rnorm(100, 0, 0.1)
#
# I'll make a plot which shows the mean response.
x1plot <- seq(1, 3, by=0.05)
x2plot <- seq(1, 3, by=0.05)
mresp <- function(x1, x2) { sqrt(x2/x1) }
mrplot <- outer(x1plot, x2plot, mresp)
persp(x1plot, x2plot, mrplot, xlab="x1", ylab="x2",
      zlab="mean response", theta=105, col="gold")
trdata <- data.frame(cbind(x1, x2, y))
attach(trdata)
# Since the expected response surface is not a plane,
# more than a 1st-order additive model may be needed.
#
# I'll find a decent polynomial model using OLS.
#
# First, a 1st-order model.
lm1 <- lm(y ~ x1 + x2)
summary(lm1)
plot(lm1)
# The residuals indicate a bit of nonlinearity.
#
# Now, a full 2nd-order model.
lm2 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2))
summary(lm2)
plot(lm2)
anova(lm1, lm2)
# Although the p-values don't indicate that the
# 2nd-order model is appreciably better, the
# residual plots suggest that the full 2nd-order
# model reduces the nonlinearity.
#
# Next, I'll fit a full 3rd-order model.
# (It may be overkill with regard to reducing the
# bias, and the variance may suffer more than the
# bias is helped, but I just want to see what happens.)
lm3 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) +
          I(x1^3) + I(x2^3) + I((x1^2)*x2) + I(x1*(x2^2)))
summary(lm3)
plot(lm3)
anova(lm2, lm3)
#
### *** New part (added Sun 7/10) ***
# I'll see which OLS model C_p indicates will give
# the best predictions.
# To compute C_p values, a good estimate of the error
# term variance is needed. While I could assume that
# the 3rd-order model is nearly unbiased and use it to
# estimate the error term variance, sigma^2, I'll go
# ahead and fit a 4th-order model and perhaps use its
# residuals to estimate sigma^2.
lm4 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) +
          I(x1^3) + I(x2^3) + I((x1^2)*x2) + I(x1*(x2^2)) +
          I(x1^4) + I(x2^4) + I((x1^3)*x2) + I((x1^2)*(x2^2)) +
          I(x1*(x2^3)))
summary(lm4)
plot(lm4)
#
# I'll look at estimates of sigma based on all four
# OLS fits, to see if stabilization has occurred when
# the 4th-order model has been reached.
sig1 <- summary(lm1)$sigma
sig2 <- summary(lm2)$sigma
sig3 <- summary(lm3)$sigma
sig4 <- summary(lm4)$sigma
sig1; sig2; sig3; sig4
# These results are rather puzzling --- typically
# the low-order fits would have some bias, and the
# naive estimates of the error term variance would
# be inflated ... but here we see that the estimated
# error term variance *increases* as we go to
# higher-order (less biased) models.
#
# Despite the screwiness, I'll go ahead and compute
# the C_p values. Because it doesn't make sense for
# the sigma estimates to increase as the order increases,
# for the "good" estimate of sigma to use with C_p, I'll
# use the average of the four estimates obtained above.
sigmasq <- ( (sig1 + sig2 + sig3 + sig4)/4 )^2
#
# I'll use RSS to denote the residual sum of squares.
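# (Before computing the C_p values term by term, here is a small
# helper --- just a sketch of the formula I'm using,
# C_p = RSS/sigmahat^2 + 2p - n, with p counting all estimated
# coefficients including the intercept. The name cp.from.fit is
# mine, not from any package; it should reproduce the
# hand-computed values below.)
cp.from.fit <- function(fit, sigmahat.sq, n = 100) {
  rss <- sum(residuals(fit)^2)   # residual sum of squares of the fit
  p <- length(coef(fit))         # number of estimated coefficients
  rss/sigmahat.sq + 2*p - n      # Mallows' C_p
}
# e.g., cp.from.fit(lm1, sigmasq) should agree with Cp1 below.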
RSS1 <- 97*sig1^2
RSS2 <- 94*sig2^2
RSS3 <- 90*sig3^2
RSS4 <- 85*sig4^2
#
Cp1 <- RSS1/sigmasq + 6 - 100
Cp2 <- RSS2/sigmasq + 12 - 100
Cp3 <- RSS3/sigmasq + 20 - 100
Cp4 <- RSS4/sigmasq + 30 - 100
Cp1; Cp2; Cp3; Cp4
# These results are screwy too (as should be expected
# after the sigma estimates were found to be odd).
# Viewing C_p as a way to identify bias, we can
# look at C_p - p; values close to 0 indicate a
# low-bias or unbiased model. The C_p - p values
# are (about):
#   -1.3, -0.9, 0.5, 1.6.
# It doesn't make sense to have the highest-order
# model be the most biased one, and it doesn't make
# sense to have C_p be less than p and get negative
# values for C_p - p. Overall, w/o having anything
# else to work with, I'd take these results to
# indicate that the 1st-order model is best, since
# it *appears* to have little or no bias, and its
# variance should be lower than the others due to
# using fewer terms (and thus there are fewer coeff's
# to estimate). Also note that it produced the
# smallest C_p value.
# BUT, it will be seen below that C_p fails us for
# this data.
### *** End of new material. ***
#
# Now, even though there isn't strong evidence of
# nonlinearity and/or interaction, I'll try a
# natural spline fit, using a 3-knot spline for x1,
# a 3-knot spline for x2, and putting the product
# of x1 and x2 in linearly.
ddist <- datadist(x1, x2)
options(datadist='ddist')
spl <- ols(y ~ rcs(x1, 3) + rcs(x2, 3) + x1 %ia% x2)
spl
par(mfrow=c(2,1))
plot(spl)
par(mfrow=c(1,1))
# Although there isn't strong evidence that such a
# spline model is needed, there does seem to be a bit
# of nonlinearity indicated.
#
# Now I'll investigate the use of local regression,
# using the loess function.
#
# First, I'll use 1st-order local fits.
lo1 <- loess(y ~ x1 + x2, degree=1)
summary(lo1)
plot(lo1$fitted, lo1$residuals)
# The equivalent number of parameters suggests that a
# 1st-order local fit may be somewhat similar to a
# 2nd-order global fit.
# Now, 2nd-order local fits.
lo2 <- loess(y ~ x1 + x2, degree=2)
summary(lo2)
plot(lo2$fitted, lo2$residuals)
# The equivalent number of parameters suggests that a
# 2nd-order local fit may be somewhat similar to a
# 3rd-order global fit.
#
# Now I'll investigate the use of nearest-neighbors
# regression.
# I'll use the train.kknn function, which will use
# cross-validation to determine a good number of
# neighbors to use.
cvknn <- train.kknn(y ~ ., trdata, kmax=50, distance=2,
                    kernel="rectangular")
plot(1:50, cvknn$MEAN.SQU, type="l", col=4, xlab="k",
     ylab="MSE", main="knn reg (cv)")
summary(cvknn)
bestk.knn <- cvknn$best.parameters$k
bestk.knn
# Although c-v indicates that 8 is the best value
# to use for k, since the mean squared error values
# are just estimates, it may be better to use a value
# of 15, which is closer to the middle of the low part
# of the plot.
#
# Now I'll investigate the use of kernel regression
# (aka weighted k nearest-neighbors), which reduces
# a neighbor's contribution to the prediction as its
# distance increases. I'll use the Epanechnikov
# kernel.
cvkernepan <- train.kknn(y ~ ., trdata, kmax=50, distance=2,
                         kernel="epanechnikov")
plot(1:50, cvkernepan$MEAN.SQU, type="l", col=4, xlab="k",
     ylab="MSE", main="Epanechnikov kernel reg (cv)")
summary(cvkernepan)
bestk.kernepan <- cvkernepan$best.parameters$k
bestk.kernepan
#
# Not only can c-v be used to determine a good value
# of k, but when using train.kknn it can also be used
# to determine a good kernel.
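# (The call just below relies on train.kknn's default kernel
# setting. If I read the kknn documentation correctly, train.kknn
# also accepts a vector of kernel names and its summary then reports
# the best kernel/k combination found by c-v; a sketch of such an
# explicit comparison, with an illustrative subset of kernels and
# the object name cvkerncompare being mine, would be:)
cvkerncompare <- train.kknn(y ~ ., trdata, kmax=50, distance=2,
                            kernel=c("rectangular", "triangular",
                                     "epanechnikov", "gaussian", "optimal"))
summary(cvkerncompare)
cvkerncompare$best.parameters   # best kernel and k found by c-v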
cvkernbest <- train.kknn(y ~ ., trdata, kmax=50, distance=2)
summary(cvkernbest)
bestk.kernbest <- cvkernbest$best.parameters$k
bestk.kernbest
bestkern.kernbest <- cvkernbest$best.parameters$kern
bestkern.kernbest
#
# I will generate a large data set that can be
# used to estimate the generalization errors.
u1 <- runif(4000, 0, 1)
u2 <- runif(4000, 0, 1)
x1 <- 1 + 2*u1
x2 <- 1 + u1 + u2
y <- sqrt(x2/x1) + rnorm(4000, 0, 0.1)
gendata <- data.frame(cbind(x1, x2, y))
detach(trdata)
attach(gendata)
#
# Since the loess fits found above cannot be
# used to extrapolate (i.e., predictions can't
# be obtained if predictor values fall outside
# of the range of values used to create the fit),
# I'm first going to try to get predicted values
# using the first loess fit, and then I'll delete
# all of the cases (of the 4000 generated for the
# generalization set) for which predictions cannot
# be obtained. This will create a generalization
# set which can be used with all of the fits
# obtained above.
pred.lo1 <- predict(lo1, newdata=gendata)
x1 <- x1[!is.na(pred.lo1)]
x2 <- x2[!is.na(pred.lo1)]
y <- y[!is.na(pred.lo1)]
usegendata <- data.frame(cbind(x1, x2, y))
detach(gendata)
attach(usegendata)
#
#
# Now I will estimate the generalization errors.
lm1.mspe <- mean( (y - predict(lm1, newdata=usegendata))^2 )
lm2.mspe <- mean( (y - predict(lm2, newdata=usegendata))^2 )
lm3.mspe <- mean( (y - predict(lm3, newdata=usegendata))^2 )
spl.mspe <- mean( (y - predict(spl, newdata=usegendata))^2 )
lo1.mspe <- mean( (y - predict(lo1, newdata=usegendata))^2 )
lo2.mspe <- mean( (y - predict(lo2, newdata=usegendata))^2 )
knnbest <- kknn(y ~ ., trdata, usegendata, k=bestk.knn,
                distance=2, kernel="rectangular")
knnbest.mspe <- mean( (y - knnbest$fitted.values)^2 )
knn15 <- kknn(y ~ ., trdata, usegendata, k=15,
              distance=2, kernel="rectangular")
knn15.mspe <- mean( (y - knn15$fitted.values)^2 )
kernepan <- kknn(y ~ ., trdata, usegendata, k=bestk.kernepan,
                 distance=2, kernel="epanechnikov")
kernepan.mspe <- mean( (y - kernepan$fitted.values)^2 )
kernbest <- kknn(y ~ ., trdata, usegendata, k=bestk.kernbest,
                 distance=2, kernel=bestkern.kernbest)
kernbest.mspe <- mean( (y - kernbest$fitted.values)^2 )
#
# Here are the estimated (generalization) MSPE values:
lm1.mspe
lm2.mspe
lm3.mspe
spl.mspe
lo1.mspe
lo2.mspe
knnbest.mspe
knn15.mspe
kernepan.mspe
kernbest.mspe
# Since the standard deviation of the error term
# dist'n is 0.1, the irreducible error (for the
# MSPE) is 0.01.
# Below I will subtract the irreducible error of
# 0.01 from the estimated MSPE values, and divide
# each value so obtained by the value obtained from
# the OLS 1st-order fit. This will make the values
# displayed at the end relative to the OLS 1st-order
# fit.
(lm1.mspe - 0.01)/(lm1.mspe - 0.01)       # OLS 1st-order
(lm2.mspe - 0.01)/(lm1.mspe - 0.01)       # OLS 2nd-order
(lm3.mspe - 0.01)/(lm1.mspe - 0.01)       # OLS 3rd-order
(spl.mspe - 0.01)/(lm1.mspe - 0.01)       # spline
(lo1.mspe - 0.01)/(lm1.mspe - 0.01)       # loess 1st-order
(lo2.mspe - 0.01)/(lm1.mspe - 0.01)       # loess 2nd-order
(knnbest.mspe - 0.01)/(lm1.mspe - 0.01)   # knn --- k determined by c-v (minimum)
(knn15.mspe - 0.01)/(lm1.mspe - 0.01)     # knn --- k based on c-v (value near middle of good values)
(kernepan.mspe - 0.01)/(lm1.mspe - 0.01)  # kernel reg --- Epanechnikov, k determined by c-v
(kernbest.mspe - 0.01)/(lm1.mspe - 0.01)  # kernel reg --- kernel & k determined by c-v
#
# The nearest-neighbors and kernel regression did
# not work too well.
# The success of the 2nd-order
# OLS fit and the natural spline fit shows that one
# can include terms which are not statistically
# significant w/o necessarily harming predictions.
#
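# (As a wrap-up, here is a compact restatement of the relative
# comparison just above, using only objects already created;
# the vector name mspe.all is mine. Values below 1 beat the
# OLS 1st-order fit in excess MSPE above the irreducible 0.01.)
mspe.all <- c(ols1=lm1.mspe, ols2=lm2.mspe, ols3=lm3.mspe,
              spline=spl.mspe, loess1=lo1.mspe, loess2=lo2.mspe,
              knn.cv=knnbest.mspe, knn15=knn15.mspe,
              kern.epan=kernepan.mspe, kern.cv=kernbest.mspe)
# excess MSPE relative to the OLS 1st-order fit, smallest first
sort( (mspe.all - 0.01)/(lm1.mspe - 0.01) )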