# First I'll load the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(MASS)
library(Hmisc)
library(Design)
library(kknn)
library(stats)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it lets others
# reproduce the results.
set.seed(632)
#
#
# I'll generate the training data.
# I'll obtain correlated x1 and x2 values from a pair
# of independent uniform (0,1) random variables.
u1 <- runif(100, 0, 1)
u2 <- runif(100, 0, 1)
x1 <- 1 + 2*u1
x2 <- 1 + u1 + u2
plot(x1, x2)
cor(x1, x2)
#
# I'll make the response nonlinear with regard to
# both x1 and x2.
y <- sqrt(x2/x1) + rnorm(100, 0, 0.1)
#
# I'll make a plot which shows the mean response.
x1plot <- seq(1, 3, by=0.05)
x2plot <- seq(1, 3, by=0.05)
mresp <- function(x1, x2) { sqrt(x2/x1) }
mrplot <- outer(x1plot, x2plot, mresp)
persp(x1plot, x2plot, mrplot, xlab="x1", ylab="x2",
      zlab="mean response", theta=105, col="gold")
trdata <- data.frame(cbind(x1, x2, y))
attach(trdata)
# Since the expected response surface is not a plane,
# more than a 1st-order additive model may be needed.
#
# I'll find a decent polynomial model using OLS.
#
# First, a 1st-order model.
lm1 <- lm(y ~ x1 + x2)
summary(lm1)
plot(lm1)
# The residuals indicate a bit of nonlinearity.
#
# Now, a full 2nd-order model.
lm2 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2))
summary(lm2)
plot(lm2)
anova(lm1, lm2)
# Although the p-values don't indicate that the
# 2nd-order model is appreciably better, the
# residual plots suggest that the full 2nd-order
# model reduces the nonlinearity.
#
# Next, I'll fit a full 3rd-order model.
# (It may be overkill with regard to reducing the
# bias, and the variance may suffer more than the
# bias is helped, but I just want to see what happens.)
lm3 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) +
          I(x1^3) + I(x2^3) + I((x1^2)*x2) + I(x1*(x2^2)))
summary(lm3)
plot(lm3)
anova(lm2, lm3)
#
### *** New part (added Sun 7/10) ***
# I'll see which OLS model C_p indicates will give
# the best predictions.
# To compute C_p values, a good estimate of the error
# term variance is needed. While I could assume that
# the 3rd-order model is nearly unbiased and use it to
# estimate the error term variance, sigma^2, I'll go
# ahead and fit a 4th-order model and perhaps use its
# residuals to estimate sigma^2.
lm4 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) +
          I(x1^3) + I(x2^3) + I((x1^2)*x2) + I(x1*(x2^2)) +
          I(x1^4) + I(x2^4) + I((x1^3)*x2) + I((x1^2)*(x2^2)) +
          I(x1*(x2^3)))
summary(lm4)
plot(lm4)
#
# I'll look at estimates of sigma based on all four
# OLS fits, to see if stabilization has occurred when
# the 4th-order model has been reached.
sig1 <- summary(lm1)$sigma
sig2 <- summary(lm2)$sigma
sig3 <- summary(lm3)$sigma
sig4 <- summary(lm4)$sigma
sig1; sig2; sig3; sig4
# These results are rather puzzling --- typically
# the low-order fits would have some bias, and the
# naive estimates of the error term variance would
# be inflated ... but here we see that the estimated
# error term variance *increases* as we go to
# higher-order (less biased) models.
#
# Despite the screwiness, I'll go ahead and compute
# the C_p values. Because it doesn't make sense for
# the sigma estimates to increase as the order increases,
# for the "good" estimate of sigma to use with C_p, I'll
# use the average of the four estimates obtained above.
sigmasq <- ( (sig1 + sig2 + sig3 + sig4)/4 )^2
#
# I'll use RSS to denote the residual sum of squares.
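# (Before computing the C_p values term by term, here is a small
# helper --- just a sketch of the formula I'm using,
# C_p = RSS/sigmahat^2 + 2p - n, with p counting all estimated
# coefficients including the intercept. The name cp.from.fit is
# mine, not from any package; it should reproduce the
# hand-computed values below.)
cp.from.fit <- function(fit, sigmahat.sq, n = 100) {
  rss <- sum(residuals(fit)^2)   # residual sum of squares of the fit
  p <- length(coef(fit))         # number of estimated coefficients
  rss/sigmahat.sq + 2*p - n      # Mallows' C_p
}
# e.g., cp.from.fit(lm1, sigmasq) should agree with Cp1 below.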
RSS1 <- 97*sig1^2
RSS2 <- 94*sig2^2
RSS3 <- 90*sig3^2
RSS4 <- 85*sig4^2
#
Cp1 <- RSS1/sigmasq + 6 - 100
Cp2 <- RSS2/sigmasq + 12 - 100
Cp3 <- RSS3/sigmasq + 20 - 100
Cp4 <- RSS4/sigmasq + 30 - 100
Cp1; Cp2; Cp3; Cp4
# These results are screwy too (as should be expected
# after the sigma estimates were found to be odd).
# Viewing C_p as a way to identify bias, we can
# look at C_p - p; values close to 0 indicate a
# low-bias or unbiased model. The C_p - p values
# are (about):
#   -1.3, -0.9, 0.5, 1.6.
# It doesn't make sense to have the highest-order
# model be the most biased one, and it doesn't make
# sense to have C_p be less than p and get negative
# values for C_p - p. Overall, w/o having anything
# else to work with, I'd take these results to
# indicate that the 1st-order model is best, since
# it *appears* to have little or no bias, and its
# variance should be lower than the others due to
# using fewer terms (and thus there are fewer coeff's
# to estimate). Also note that it produced the
# smallest C_p value.
# BUT, it will be seen below that C_p fails us for
# this data.
### *** End of new material. ***
#
# Now, even though there isn't strong evidence of
# nonlinearity and/or interaction, I'll try a
# natural spline fit, using a 3-knot spline for x1,
# a 3-knot spline for x2, and putting the product
# of x1 and x2 in linearly.
ddist <- datadist(x1, x2)
options(datadist='ddist')
spl <- ols(y ~ rcs(x1, 3) + rcs(x2, 3) + x1 %ia% x2)
spl
par(mfrow=c(2,1))
plot(spl)
par(mfrow=c(1,1))
# Although there isn't strong evidence that such a
# spline model is needed, there does seem to be a bit
# of nonlinearity indicated.
#
# Now I'll investigate the use of local regression,
# using the loess function.
#
# First, I'll use 1st-order local fits.
lo1 <- loess(y ~ x1 + x2, degree=1)
summary(lo1)
plot(lo1$fitted, lo1$residuals)
# The equivalent number of parameters suggests that a
# 1st-order local fit may be somewhat similar to a
# 2nd-order global fit.
# Now, 2nd-order local fits.
lo2 <- loess(y ~ x1 + x2, degree=2)
summary(lo2)
plot(lo2$fitted, lo2$residuals)
# The equivalent number of parameters suggests that a
# 2nd-order local fit may be somewhat similar to a
# 3rd-order global fit.
#
# Now I'll investigate the use of nearest-neighbors
# regression.
# I'll use the train.kknn function, which will use
# cross-validation to determine a good number of
# neighbors to use.
cvknn <- train.kknn(y ~ ., trdata, kmax=50, distance=2,
                    kernel="rectangular")
plot(1:50, cvknn$MEAN.SQU, type="l", col=4, xlab="k",
     ylab="MSE", main="knn reg (cv)")
summary(cvknn)
bestk.knn <- cvknn$best.parameters$k
bestk.knn
# Although c-v indicates that 8 is the best value
# to use for k, since the mean squared error values
# are just estimates, it may be better to use a value
# of 15, which is closer to the middle of the low part
# of the plot.
#
# Now I'll investigate the use of kernel regression
# (aka weighted k nearest-neighbors), which reduces
# a neighbor's contribution to the prediction as its
# distance increases. I'll use the Epanechnikov
# kernel.
cvkernepan <- train.kknn(y ~ ., trdata, kmax=50, distance=2,
                         kernel="epanechnikov")
plot(1:50, cvkernepan$MEAN.SQU, type="l", col=4, xlab="k",
     ylab="MSE", main="Epanechnikov kernel reg (cv)")
summary(cvkernepan)
bestk.kernepan <- cvkernepan$best.parameters$k
bestk.kernepan
#
# Not only can c-v be used to determine a good value
# of k, but when using train.kknn it can also be used
# to determine a good kernel.
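# (The call just below relies on train.kknn's default kernel
# setting. If I read the kknn documentation correctly, train.kknn
# also accepts a vector of kernel names and its summary then reports
# the best kernel/k combination found by c-v; a sketch of such an
# explicit comparison, with an illustrative subset of kernels and
# the object name cvkerncompare being mine, would be:)
cvkerncompare <- train.kknn(y ~ ., trdata, kmax=50, distance=2,
                            kernel=c("rectangular", "triangular",
                                     "epanechnikov", "gaussian", "optimal"))
summary(cvkerncompare)
cvkerncompare$best.parameters   # best kernel and k found by c-v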
cvkernbest <- train.kknn(y ~ ., trdata, kmax=50, distance=2)
summary(cvkernbest)
bestk.kernbest <- cvkernbest$best.parameters$k
bestk.kernbest
bestkern.kernbest <- cvkernbest$best.parameters$kern
bestkern.kernbest
#
# I will generate a large data set that can be
# used to estimate the generalization errors.
u1 <- runif(4000, 0, 1)
u2 <- runif(4000, 0, 1)
x1 <- 1 + 2*u1
x2 <- 1 + u1 + u2
y <- sqrt(x2/x1) + rnorm(4000, 0, 0.1)
gendata <- data.frame(cbind(x1, x2, y))
detach(trdata)
attach(gendata)
#
# Since the loess fits found above cannot be
# used to extrapolate (i.e., predictions can't
# be obtained if predictor values fall outside
# of the range of values used to create the fit),
# I'm first going to try to get predicted values
# using the first loess fit, and then I'll delete
# all of the cases (of the 4000 generated for the
# generalization set) for which predictions cannot
# be obtained. This will create a generalization
# set which can be used with all of the fits
# obtained above.
pred.lo1 <- predict(lo1, newdata=gendata)
x1 <- x1[!is.na(pred.lo1)]
x2 <- x2[!is.na(pred.lo1)]
y <- y[!is.na(pred.lo1)]
usegendata <- data.frame(cbind(x1, x2, y))
detach(gendata)
attach(usegendata)
#
#
# Now I will estimate the generalization errors.
lm1.mspe <- mean( (y - predict(lm1, newdata=usegendata))^2 )
lm2.mspe <- mean( (y - predict(lm2, newdata=usegendata))^2 )
lm3.mspe <- mean( (y - predict(lm3, newdata=usegendata))^2 )
spl.mspe <- mean( (y - predict(spl, newdata=usegendata))^2 )
lo1.mspe <- mean( (y - predict(lo1, newdata=usegendata))^2 )
lo2.mspe <- mean( (y - predict(lo2, newdata=usegendata))^2 )
knnbest <- kknn(y ~ ., trdata, usegendata, k=bestk.knn,
                distance=2, kernel="rectangular")
knnbest.mspe <- mean( (y - knnbest$fitted.values)^2 )
knn15 <- kknn(y ~ ., trdata, usegendata, k=15,
              distance=2, kernel="rectangular")
knn15.mspe <- mean( (y - knn15$fitted.values)^2 )
kernepan <- kknn(y ~ ., trdata, usegendata, k=bestk.kernepan,
                 distance=2, kernel="epanechnikov")
kernepan.mspe <- mean( (y - kernepan$fitted.values)^2 )
kernbest <- kknn(y ~ ., trdata, usegendata, k=bestk.kernbest,
                 distance=2, kernel=bestkern.kernbest)
kernbest.mspe <- mean( (y - kernbest$fitted.values)^2 )
#
# Here are the estimated (generalization) MSPE values:
lm1.mspe
lm2.mspe
lm3.mspe
spl.mspe
lo1.mspe
lo2.mspe
knnbest.mspe
knn15.mspe
kernepan.mspe
kernbest.mspe
# Since the standard deviation of the error term
# dist'n is 0.1, the irreducible error (for the
# MSPE) is 0.01.
# Below I will subtract the irreducible error of
# 0.01 from the estimated MSPE values, and divide
# each value so obtained by the value obtained from
# the OLS 1st-order fit. This will make the values
# displayed at the end relative to the OLS 1st-order
# fit.
(lm1.mspe - 0.01)/(lm1.mspe - 0.01)       # OLS 1st-order
(lm2.mspe - 0.01)/(lm1.mspe - 0.01)       # OLS 2nd-order
(lm3.mspe - 0.01)/(lm1.mspe - 0.01)       # OLS 3rd-order
(spl.mspe - 0.01)/(lm1.mspe - 0.01)       # spline
(lo1.mspe - 0.01)/(lm1.mspe - 0.01)       # loess 1st-order
(lo2.mspe - 0.01)/(lm1.mspe - 0.01)       # loess 2nd-order
(knnbest.mspe - 0.01)/(lm1.mspe - 0.01)   # knn --- k determined by c-v (minimum)
(knn15.mspe - 0.01)/(lm1.mspe - 0.01)     # knn --- k based on c-v (value near middle of good values)
(kernepan.mspe - 0.01)/(lm1.mspe - 0.01)  # kernel reg --- Epanechnikov, k determined by c-v
(kernbest.mspe - 0.01)/(lm1.mspe - 0.01)  # kernel reg --- kernel & k determined by c-v
#
# The nearest-neighbors and kernel regression did
# not work too well.
# The success of the 2nd-order
# OLS fit and the natural spline fit shows that one
# can include terms which are not statistically
# significant w/o necessarily harming predictions.
#
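# (As a wrap-up, here is a compact restatement of the relative
# comparison just above, using only objects already created;
# the vector name mspe.all is mine. Values below 1 beat the
# OLS 1st-order fit in excess MSPE above the irreducible 0.01.)
mspe.all <- c(ols1=lm1.mspe, ols2=lm2.mspe, ols3=lm3.mspe,
              spline=spl.mspe, loess1=lo1.mspe, loess2=lo2.mspe,
              knn.cv=knnbest.mspe, knn15=knn15.mspe,
              kern.epan=kernepan.mspe, kern.cv=kernbest.mspe)
# excess MSPE relative to the OLS 1st-order fit, smallest first
sort( (mspe.all - 0.01)/(lm1.mspe - 0.01) )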