# This is similar to a demo that I worked up previously.
# But here I will use a larger sample size, and I will
# change the values used for x1 and x2 so that there
# will be more curvature.
#
# First I'll load the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(MASS)
library(Hmisc)
library(Design)
library(kknn)
library(stats)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it also makes it
# possible for others to reproduce the results.
set.seed(632)
#
#
# I'll generate the training data.
# I'll obtain correlated x1 and x2 values from a pair
# of independent uniform (0, 1.5) random variables.
u1 <- runif(200, 0, 1.5)
u2 <- runif(200, 0, 1.5)
x1 <- 0.5 + 2*u1
x2 <- 0.5 + u1 + u2
plot(x1, x2)
cor(x1, x2)
#
# I'll make the response nonlinear with regard to
# both x1 and x2.
y <- sqrt(x2/x1) + rnorm(200, 0, 0.1)
#
# I'll make a plot which shows the mean response.
x1plot <- seq(1, 3, by=0.05)
x2plot <- seq(1, 3, by=0.05)
mresp <- function(x1, x2) { sqrt(x2/x1) }
mrplot <- outer(x1plot, x2plot, mresp)
persp(x1plot, x2plot, mrplot,
      xlab="x1", ylab="x2", zlab="mean response",
      theta=60, col="gold")
trdata <- data.frame(cbind(x1, x2, y))
attach(trdata)
# Since the expected response surface is not a plane,
# more than a 1st-order additive model may be needed.
#
# I'll find a decent polynomial model using OLS.
#
# First, a 1st-order model.
lm1 <- lm(y ~ x1 + x2)
summary(lm1)
plot(lm1)
# The residuals indicate appreciable nonlinearity.
#
# Now, a full 2nd-order model.
lm2 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2))
summary(lm2)
plot(lm2)
anova(lm1, lm2)
# The F test p-value indicates that the 2nd-order model
# is appreciably better, and the residual plots suggest
# the same thing.
#
# Since the residual plot from the 2nd-order model
# suggests some nonlinearity, I'll fit a full 3rd-order
# model.
lm3 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2)
            + I(x1^3) + I(x2^3) + I((x1^2)*x2) + I(x1*(x2^2)))
summary(lm3)
plot(lm3)
anova(lm2, lm3)
#
# I'll see which OLS model C_p indicates will give
# the best predictions.
# To compute C_p values, a good estimate of the error
# term variance is needed. While I could assume that
# the 3rd-order model is nearly unbiased and use it to
# estimate the error term variance, sigma^2, I'll go
# ahead and fit a 4th-order model and use its residuals
# to estimate sigma^2.
lm4 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2)
            + I(x1^3) + I(x2^3) + I((x1^2)*x2) + I(x1*(x2^2))
            + I(x1^4) + I(x2^4) + I((x1^3)*x2)
            + I((x1^2)*(x2^2)) + I(x1*(x2^3)))
summary(lm4)
plot(lm4)
#
# I'll look at estimates of sigma based on all four
# OLS fits, to see if stabilization has occurred by
# the time the 4th-order model is reached.
sig1 <- summary(lm1)$sigma
sig2 <- summary(lm2)$sigma
sig3 <- summary(lm3)$sigma
sig4 <- summary(lm4)$sigma
sig1; sig2; sig3; sig4
#
# Since it doesn't make sense for the sigma estimate
# to increase as we go from a 3rd-order model to a
# 4th-order model, and since the residuals from the
# 3rd-order fit looked pretty good, I'll use the
# residuals from the 3rd-order fit (instead of the
# 4th-order fit) to get the "good" estimate of the
# error term variance.
sigmasq <- sig3^2
#
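# (Aside: as a check on the long-hand C_p calculations
# below, the same values can be obtained from a small
# helper function based on the usual Mallows formula,
# C_p = RSS/sigmahat^2 + 2p - n, where p counts the
# intercept and sigmahat^2 is the "good" variance
# estimate just obtained. This is only a rough sketch.)
Cp.check <- function(fit, sigmasq, n) {
  sum(resid(fit)^2)/sigmasq + 2*length(coef(fit)) - n
}
# E.g., Cp.check(lm2, sigmasq, 200) should agree with
# the Cp2 value computed below.
#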
# I'll use RSS to denote the residual sum of squares.
RSS1 <- 197*sig1^2
RSS2 <- 194*sig2^2
RSS3 <- 190*sig3^2
RSS4 <- 185*sig4^2
#
Cp1 <- RSS1/sigmasq + 6 - 200
Cp2 <- RSS2/sigmasq + 12 - 200
Cp3 <- RSS3/sigmasq + 20 - 200
Cp4 <- RSS4/sigmasq + 30 - 200
Cp1; Cp2; Cp3; Cp4
#
# Viewing C_p as a way to assess bias, we can look at
# C_p - p: values close to 0 indicate a model with
# little or no bias. The C_p - p values are (about):
#   100.3, 4.9, 0, 3.7.
# It doesn't make sense for the 4th-order model to be
# more biased than the 3rd-order model, and so I think
# this indicates that we have some noise in the C_p
# analysis (which is typical --- one should not take
# the values too seriously ... instead just use them
# as rough guidelines).
# Viewing C_p as a measure of overall accuracy, we
# should just look for the smallest value. The values
# are (about):
#   103.3, 10.9, 10, 18.7.
# So we expect the 2nd- and 3rd-order models to
# perform best. The 1st-order model isn't good due to
# too much bias, and the 4th-order model is inferior
# due to a variance that's larger than necessary.
# (Note: It'll be seen below that although C_p does
# okay here, it'll be the 4th-order model which is
# 2nd best.)
#
# Now, I'll try a natural spline fit, using a 3 knot
# spline for x1, a 3 knot spline for x2, and putting
# the product of x1 and x2 in linearly.
ddist <- datadist(x1, x2)
options(datadist='ddist')
spl <- ols(y ~ rcs(x1, 3) + rcs(x2, 3) + x1 %ia% x2)
spl
par(mfrow=c(2,1))
plot(spl)
par(mfrow=c(1,1))
# The top plot shows strong nonlinearity for x1, and
# the bottom plot shows milder nonlinearity for x2.
# (Note: If there had been stronger evidence that
# the product of x1 and x2 is important, I could
# make a product variable (prodx1x2 <- x1*x2) and
# put this product in the model with an rcs term, or
# I could just add (x1^2)*x2 and x1*(x2^2) terms, or
# I could put in a full interaction of the initial
# two rcs terms.)
#
# Now I'll investigate the use of local regression,
# using the loess function.
#
# First, I'll use 1st-order local fits.
lo1 <- loess(y ~ x1 + x2, degree=1)
summary(lo1)
plot(lo1$fitted, lo1$residuals)
# The effective number of parameters suggests that a
# local 1st-order fit may be somewhat similar to a
# global 2nd-order fit.
#
# Now, 2nd-order local fits.
lo2 <- loess(y ~ x1 + x2, degree=2)
summary(lo2)
plot(lo2$fitted, lo2$residuals)
# The effective number of parameters suggests that a
# local 2nd-order fit may be somewhat similar to a
# global 3rd-order fit.
#
# Now I'll investigate the use of nearest-neighbors
# regression.
# I'll use the train.kknn function, which will use
# cross-validation to determine a good number of
# neighbors to use.
cvknn <- train.kknn(y ~ ., trdata, kmax=50, distance=2, kernel="rectangular")
plot(1:50, cvknn$MEAN.SQU, type="l", col=4,
     xlab="k", ylab="MSE", main="knn reg (cv)")
summary(cvknn)
bestk.knn <- cvknn$best.parameters$k
bestk.knn
# C-v indicates that 12 is the best value to use for k.
#
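# (Aside: to make the idea behind the knn predictions
# concrete, here is a rough sketch of an unweighted
# k nearest-neighbors prediction for a single new
# point, using plain Euclidean distance on the raw
# predictors. It ignores any rescaling of the
# predictors that kknn may do internally, so it is
# only meant to illustrate the idea, not to reproduce
# kknn's output exactly.)
knn.sketch <- function(x1new, x2new, k) {
  d <- sqrt( (trdata$x1 - x1new)^2 + (trdata$x2 - x2new)^2 )
  mean( trdata$y[order(d)[1:k]] )
}
knn.sketch(2, 2, bestk.knn)   # true mean response at (2, 2) is sqrt(2/2) = 1
#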
# Now I'll investigate the use of kernel regression
# (aka weighted k nearest-neighbors), which reduces
# the contribution to the prediction as the distance
# to the neighbor increases. I'll use the Epanechnikov
# kernel.
cvkernepan <- train.kknn(y ~ ., trdata, kmax=50, distance=2, kernel="epanechnikov")
plot(1:50, cvkernepan$MEAN.SQU, type="l", col=4,
     xlab="k", ylab="MSE", main="Epanechnikov kernel reg (cv)")
summary(cvkernepan)
bestk.kernepan <- cvkernepan$best.parameters$k
bestk.kernepan
#
# Not only can c-v be used to determine a good value
# of k, but when using train.kknn it can also be used
# to determine a good kernel.
cvkernbest <- train.kknn(y ~ ., trdata, kmax=50, distance=2)
summary(cvkernbest)
bestk.kernbest <- cvkernbest$best.parameters$k
bestk.kernbest
bestkern.kernbest <- cvkernbest$best.parameters$kernel
bestkern.kernbest
#
# I'll now try projection pursuit regression (PPR).
ppr1 <- ppr(y ~ ., data=trdata, nterms=1, sm.method="spline", df=6, optlevel=1)
summary(ppr1)
# Above I just used one ridge term. Now I will use
# two ridge terms. There is enough data to let each
# ridge function use a spline with 5 equiv. df.
ppr2 <- ppr(y ~ ., data=trdata, nterms=2, sm.method="spline", df=5, optlevel=1)
summary(ppr2)
# Finally, I'll use two terms again, but I will
# increase the "optlevel" (which controls how
# much "effort" goes into getting a good fit)
# from 1 to 2. (At level 1, when the 2nd ridge
# function is added, the smoothing for the 1st
# ridge function is done again, but the direction
# vector for the first term is not changed. At
# level 2, the direction for the 1st term can be
# altered to get a better fit.) Additionally, I'll
# let a 3rd ridge function be added, but in the end
# only 2 will be kept. (After everything is refitted
# with a 3 term model, the weakest term will be
# dropped.)
ppr3 <- ppr(y ~ ., data=trdata, nterms=2, max.terms=3, sm.method="spline", df=5, optlevel=2)
summary(ppr3)
# The final model selected does not differ from the
# previous two term model. (Based on reading the
# R Help on ppr, and looking at the output, I
# believe that in this case a meaningful/helpful
# 3rd term could not be produced. Otherwise I
# think the summary would have contained some info
# about the 3rd term, and also info about a comparison
# of the model with 2 terms and the model with 3
# terms.)
#
# I will generate a large data set that can be
# used to estimate the generalization errors.
u1 <- runif(4000, 0, 1)
u2 <- runif(4000, 0, 1)
x1 <- 1 + 2*u1
x2 <- 1 + u1 + u2
y <- sqrt(x2/x1) + rnorm(4000, 0, 0.1)
gendata <- data.frame(cbind(x1, x2, y))
detach(trdata)
attach(gendata)
#
# Since the loess fits found above cannot be
# used to extrapolate (i.e., predictions can't
# be obtained if predictor values fall outside
# of the range of values used to create the fit),
# I'm first going to try to get predicted values
# using the first loess fit, and then I'll delete,
# from the 4000 generated cases, those for which
# predictions cannot be obtained. This will create
# a generalization set which can be used with all
# of the fits obtained above.
pred.lo1 <- predict(lo1, newdata=gendata)
x1 <- x1[!is.na(pred.lo1)]
x2 <- x2[!is.na(pred.lo1)]
y <- y[!is.na(pred.lo1)]
usegendata <- data.frame(cbind(x1, x2, y))
detach(gendata)
attach(usegendata)
#
#
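# (Aside: the MSPE estimates computed long-hand below
# all have the same form, so a small helper could be
# used for the fits whose predict methods accept a
# newdata argument (the lm, ols, loess, and ppr fits
# above). The sketch below is just an alternative way
# of getting the same numbers.)
mspe.check <- function(fit, newdata, truth) {
  mean( (truth - predict(fit, newdata=newdata))^2 )
}
# E.g., mspe.check(lm3, usegendata, usegendata$y) should
# agree with the lm3.mspe value computed below.
#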
# Now I will estimate the generalization errors.
lm1.mspe <- mean( (y - predict(lm1, newdata=usegendata))^2 )
lm2.mspe <- mean( (y - predict(lm2, newdata=usegendata))^2 )
lm3.mspe <- mean( (y - predict(lm3, newdata=usegendata))^2 )
lm4.mspe <- mean( (y - predict(lm4, newdata=usegendata))^2 )
spl.mspe <- mean( (y - predict(spl, newdata=usegendata))^2 )
lo1.mspe <- mean( (y - predict(lo1, newdata=usegendata))^2 )
lo2.mspe <- mean( (y - predict(lo2, newdata=usegendata))^2 )
knnbest <- kknn(y ~ ., trdata, usegendata, k=bestk.knn, distance=2, kernel="rectangular")
knnbest.mspe <- mean( (y - knnbest$fitted.values)^2 )
kernepan <- kknn(y ~ ., trdata, usegendata, k=bestk.kernepan, distance=2, kernel="epanechnikov")
kernepan.mspe <- mean( (y - kernepan$fitted.values)^2 )
kernbest <- kknn(y ~ ., trdata, usegendata, k=bestk.kernbest, distance=2, kernel=bestkern.kernbest)
kernbest.mspe <- mean( (y - kernbest$fitted.values)^2 )
ppr1.mspe <- mean( (y - predict(ppr1, newdata=usegendata))^2 )
ppr2.mspe <- mean( (y - predict(ppr2, newdata=usegendata))^2 )
#
# Here are the estimated (generalization) MSPE values:
lm1.mspe
lm2.mspe
lm3.mspe
lm4.mspe
spl.mspe
lo1.mspe
lo2.mspe
knnbest.mspe
kernepan.mspe
kernbest.mspe
ppr1.mspe
ppr2.mspe
# Since the standard deviation of the error term
# dist'n is 0.1, the irreducible error (for the
# MSPE) is 0.01.
# Below I will subtract the irreducible error of
# 0.01 from the estimated MSPE values, and divide
# each value so obtained by the value obtained from
# the OLS 3rd-order fit. This will make the values
# displayed at the end relative to the OLS 3rd-order
# fit.
(lm1.mspe - 0.01)/(lm3.mspe - 0.01)       # OLS 1st-order
(lm2.mspe - 0.01)/(lm3.mspe - 0.01)       # OLS 2nd-order
(lm3.mspe - 0.01)/(lm3.mspe - 0.01)       # OLS 3rd-order
(lm4.mspe - 0.01)/(lm3.mspe - 0.01)       # OLS 4th-order
(spl.mspe - 0.01)/(lm3.mspe - 0.01)       # spline
(lo1.mspe - 0.01)/(lm3.mspe - 0.01)       # loess 1st-order
(lo2.mspe - 0.01)/(lm3.mspe - 0.01)       # loess 2nd-order
(knnbest.mspe - 0.01)/(lm3.mspe - 0.01)   # knn --- k determined by c-v (minimum)
(kernepan.mspe - 0.01)/(lm3.mspe - 0.01)  # kernel reg --- Epanechnikov, k determined by c-v
(kernbest.mspe - 0.01)/(lm3.mspe - 0.01)  # kernel reg --- kernel & k determined by c-v
(ppr1.mspe - 0.01)/(lm3.mspe - 0.01)      # PPR 1 term
(ppr2.mspe - 0.01)/(lm3.mspe - 0.01)      # PPR 2 terms
#
# The good news is that C_p identified the best OLS
# model. The bad news (in a way) is that none of
# the alternatives did better (but see the note
# below about PPR), although I suspect that MARS
# will do quite well. This could be due to the fact
# that although the data was not generated to make
# a polynomial model perfect, it was generated from
# a single (not too messy) formula. With some real
# data, it may be that the response surface is less
# homogeneous, so that the same formula doesn't work
# for all parts.
### YIP-YAH!!! The two term PPR model that I added after
### having done the other parts previously turned out to
### be the winner. (It might be possible to do better
### using PPR by changing the df or the smoothing method.)
#
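# (Follow-up sketch of the closing remark above: one way
# to look for a better PPR fit would be to refit with a
# different df for the ridge-function splines, or to let
# GCV choose the amount of smoothing, and then recompute
# the estimated MSPE. The call below is only one example
# of such a refit --- the settings are not tuned choices.)
ppr2.gcv <- ppr(y ~ ., data=trdata, nterms=2, sm.method="gcvspline", optlevel=2)
summary(ppr2.gcv)
ppr2.gcv.mspe <- mean( (y - predict(ppr2.gcv, newdata=usegendata))^2 )
(ppr2.gcv.mspe - 0.01)/(lm3.mspe - 0.01)   # PPR 2 terms, GCV-chosen smoothing
#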