# This is similar to a demo that I worked up previously.
# But here I will use a larger sample size, and I will
# change the values used for x1 and x2 so that there
# will be more curvature.
#
# First I'll load the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(MASS)
library(Hmisc)
library(Design)
library(kknn)
library(stats)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it also makes it
# possible for others to reproduce the results.
set.seed(632)
#
#
# I'll generate the training data.
# I'll obtain correlated x1 and x2 values from a pair
# of independent uniform (0, 1.5) random variables.
u1 <- runif(200, 0, 1.5)
u2 <- runif(200, 0, 1.5)
x1 <- 0.5 + 2*u1
x2 <- 0.5 + u1 + u2
plot(x1, x2)
cor(x1, x2)
#
# I'll make the response nonlinear with regard to
# both x1 and x2.
y <- sqrt(x2/x1) + rnorm(200, 0, 0.1)
#
# I'll make a plot which shows the mean response.
x1plot <- seq(1, 3, by=0.05)
x2plot <- seq(1, 3, by=0.05)
mresp <- function(x1, x2) { sqrt(x2/x1) }
mrplot <- outer(x1plot, x2plot, mresp)
persp(x1plot, x2plot, mrplot,
      xlab="x1", ylab="x2", zlab="mean response",
      theta=60, col="gold")
trdata <- data.frame(cbind(x1, x2, y))
attach(trdata)
# Since the expected response surface is not a plane,
# more than a 1st-order additive model may be needed.
#
# I'll find a decent polynomial model using OLS.
#
# First, a 1st-order model.
lm1 <- lm(y ~ x1 + x2)
summary(lm1)
plot(lm1)
# The residuals indicate appreciable nonlinearity.
#
# Now, a full 2nd-order model.
lm2 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2))
summary(lm2)
plot(lm2)
anova(lm1, lm2)
# The F test p-value indicates that the 2nd-order model
# is appreciably better, and the residual plots suggest
# the same thing.
#
# Since the residual plot from the 2nd-order model
# suggests some nonlinearity, I'll fit a full 3rd-order
# model.
lm3 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2)
            + I(x1^3) + I(x2^3) + I((x1^2)*x2) + I(x1*(x2^2)))
summary(lm3)
plot(lm3)
anova(lm2, lm3)
#
# I'll see which OLS model C_p indicates will give
# the best predictions.
# To compute C_p values, a good estimate of the error
# term variance is needed. While I could assume that
# the 3rd-order model is nearly unbiased and use it to
# estimate the error term variance, sigma^2, I'll go
# ahead and fit a 4th-order model and use its residuals
# to estimate sigma^2.
lm4 <- lm(y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2)
            + I(x1^3) + I(x2^3) + I((x1^2)*x2) + I(x1*(x2^2))
            + I(x1^4) + I(x2^4) + I((x1^3)*x2)
            + I((x1^2)*(x2^2)) + I(x1*(x2^3)))
summary(lm4)
plot(lm4)
#
# I'll look at estimates of sigma based on all four
# OLS fits, to see if stabilization has occurred by
# the time the 4th-order model is reached.
sig1 <- summary(lm1)$sigma
sig2 <- summary(lm2)$sigma
sig3 <- summary(lm3)$sigma
sig4 <- summary(lm4)$sigma
sig1; sig2; sig3; sig4
#
# Since it doesn't make sense for the sigma estimate
# to increase as we go from a 3rd-order model to a
# 4th-order model, and since the residuals from the
# 3rd-order fit looked pretty good, I'll use the
# residuals from the 3rd-order fit (instead of the
# 4th-order fit) to get the "good" estimate of the
# error term variance.
sigmasq <- sig3^2
#
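# (Aside: as a check on the long-hand C_p calculations
# below, the same values can be obtained from a small
# helper function based on the usual Mallows formula,
# C_p = RSS/sigmahat^2 + 2p - n, where p counts the
# intercept and sigmahat^2 is the "good" variance
# estimate just obtained. This is only a rough sketch.)
Cp.check <- function(fit, sigmasq, n) {
  sum(resid(fit)^2)/sigmasq + 2*length(coef(fit)) - n
}
# E.g., Cp.check(lm2, sigmasq, 200) should agree with
# the Cp2 value computed below.
#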
# I'll use RSS to denote the residual sum of squares.
RSS1 <- 197*sig1^2
RSS2 <- 194*sig2^2
RSS3 <- 190*sig3^2
RSS4 <- 185*sig4^2
#
Cp1 <- RSS1/sigmasq + 6 - 200
Cp2 <- RSS2/sigmasq + 12 - 200
Cp3 <- RSS3/sigmasq + 20 - 200
Cp4 <- RSS4/sigmasq + 30 - 200
Cp1; Cp2; Cp3; Cp4
#
# Viewing C_p as a way to assess bias, we can look at
# C_p - p: values close to 0 indicate a model with
# little or no bias. The C_p - p values are (about):
#   100.3, 4.9, 0, 3.7.
# It doesn't make sense for the 4th-order model to be
# more biased than the 3rd-order model, and so I think
# this indicates that we have some noise in the C_p
# analysis (which is typical --- one should not take
# the values too seriously ... instead just use them
# as rough guidelines).
# Viewing C_p as a measure of overall accuracy, we
# should just look for the smallest value. The values
# are (about):
#   103.3, 10.9, 10, 18.7.
# So we expect the 2nd- and 3rd-order models to
# perform best. The 1st-order model isn't good due to
# too much bias, and the 4th-order model is inferior
# due to a variance that's larger than necessary.
# (Note: It'll be seen below that although C_p does
# okay here, it'll be the 4th-order model which is
# 2nd best.)
#
# Now, I'll try a natural spline fit, using a 3 knot
# spline for x1, a 3 knot spline for x2, and putting
# the product of x1 and x2 in linearly.
ddist <- datadist(x1, x2)
options(datadist='ddist')
spl <- ols(y ~ rcs(x1, 3) + rcs(x2, 3) + x1 %ia% x2)
spl
par(mfrow=c(2,1))
plot(spl)
par(mfrow=c(1,1))
# The top plot shows strong nonlinearity for x1, and
# the bottom plot shows milder nonlinearity for x2.
# (Note: If there had been stronger evidence that
# the product of x1 and x2 is important, I could
# make a product variable (prodx1x2 <- x1*x2) and
# put this product in the model with an rcs term, or
# I could just add (x1^2)*x2 and x1*(x2^2) terms, or
# I could put in a full interaction of the initial
# two rcs terms.)
#
# Now I'll investigate the use of local regression,
# using the loess function.
#
# First, I'll use 1st-order local fits.
lo1 <- loess(y ~ x1 + x2, degree=1)
summary(lo1)
plot(lo1$fitted, lo1$residuals)
# The effective number of parameters suggests that a
# local 1st-order fit may be somewhat similar to a
# global 2nd-order fit.
#
# Now, 2nd-order local fits.
lo2 <- loess(y ~ x1 + x2, degree=2)
summary(lo2)
plot(lo2$fitted, lo2$residuals)
# The effective number of parameters suggests that a
# local 2nd-order fit may be somewhat similar to a
# global 3rd-order fit.
#
# Now I'll investigate the use of nearest-neighbors
# regression.
# I'll use the train.kknn function, which will use
# cross-validation to determine a good number of
# neighbors to use.
cvknn <- train.kknn(y ~ ., trdata, kmax=50, distance=2, kernel="rectangular")
plot(1:50, cvknn$MEAN.SQU, type="l", col=4,
     xlab="k", ylab="MSE", main="knn reg (cv)")
summary(cvknn)
bestk.knn <- cvknn$best.parameters$k
bestk.knn
# C-v indicates that 12 is the best value to use for k.
#
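# (Aside: to make the idea behind the knn predictions
# concrete, here is a rough sketch of an unweighted
# k nearest-neighbors prediction for a single new
# point, using plain Euclidean distance on the raw
# predictors. It ignores any rescaling of the
# predictors that kknn may do internally, so it is
# only meant to illustrate the idea, not to reproduce
# kknn's output exactly.)
knn.sketch <- function(x1new, x2new, k) {
  d <- sqrt( (trdata$x1 - x1new)^2 + (trdata$x2 - x2new)^2 )
  mean( trdata$y[order(d)[1:k]] )
}
knn.sketch(2, 2, bestk.knn)   # true mean response at (2, 2) is sqrt(2/2) = 1
#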
# Now I'll investigate the use of kernel regression
# (aka weighted k nearest-neighbors), which reduces
# the contribution to the prediction as the distance
# to the neighbor increases. I'll use the Epanechnikov
# kernel.
cvkernepan <- train.kknn(y ~ ., trdata, kmax=50, distance=2, kernel="epanechnikov")
plot(1:50, cvkernepan$MEAN.SQU, type="l", col=4,
     xlab="k", ylab="MSE", main="Epanechnikov kernel reg (cv)")
summary(cvkernepan)
bestk.kernepan <- cvkernepan$best.parameters$k
bestk.kernepan
#
# Not only can c-v be used to determine a good value
# of k, but when using train.kknn it can also be used
# to determine a good kernel.
cvkernbest <- train.kknn(y ~ ., trdata, kmax=50, distance=2)
summary(cvkernbest)
bestk.kernbest <- cvkernbest$best.parameters$k
bestk.kernbest
bestkern.kernbest <- cvkernbest$best.parameters$kernel
bestkern.kernbest
#
# I'll now try projection pursuit regression (PPR).
ppr1 <- ppr(y ~ ., data=trdata, nterms=1, sm.method="spline", df=6, optlevel=1)
summary(ppr1)
# Above I just used one ridge term. Now I will use
# two ridge terms. There is enough data to let each
# ridge function use a spline with 5 equiv. df.
ppr2 <- ppr(y ~ ., data=trdata, nterms=2, sm.method="spline", df=5, optlevel=1)
summary(ppr2)
# Finally, I'll use two terms again, but I will
# increase the "optlevel" (which controls how
# much "effort" goes into getting a good fit)
# from 1 to 2. (At level 1, when the 2nd ridge
# function is added, the smoothing for the 1st
# ridge function is done again, but the direction
# vector for the first term is not changed. At
# level 2, the direction for the 1st term can be
# altered to get a better fit.) Additionally, I'll
# let a 3rd ridge function be added, but in the end
# only 2 will be kept. (After everything is refitted
# with a 3 term model, the weakest term will be
# dropped.)
ppr3 <- ppr(y ~ ., data=trdata, nterms=2, max.terms=3, sm.method="spline", df=5, optlevel=2)
summary(ppr3)
# The final model selected does not differ from the
# previous two term model. (Based on reading the
# R Help on ppr, and looking at the output, I
# believe that in this case a meaningful/helpful
# 3rd term could not be produced. Otherwise I
# think the summary would have contained some info
# about the 3rd term, and also info about a comparison
# of the model with 2 terms and the model with 3
# terms.)
#
# I will generate a large data set that can be
# used to estimate the generalization errors.
u1 <- runif(4000, 0, 1)
u2 <- runif(4000, 0, 1)
x1 <- 1 + 2*u1
x2 <- 1 + u1 + u2
y <- sqrt(x2/x1) + rnorm(4000, 0, 0.1)
gendata <- data.frame(cbind(x1, x2, y))
detach(trdata)
attach(gendata)
#
# Since the loess fits found above cannot be
# used to extrapolate (i.e., predictions can't
# be obtained if predictor values fall outside
# of the range of values used to create the fit),
# I'm first going to try to get predicted values
# using the first loess fit, and then I'll delete,
# from the 4000 generated cases, those for which
# predictions cannot be obtained. This will create
# a generalization set which can be used with all
# of the fits obtained above.
pred.lo1 <- predict(lo1, newdata=gendata)
x1 <- x1[!is.na(pred.lo1)]
x2 <- x2[!is.na(pred.lo1)]
y <- y[!is.na(pred.lo1)]
usegendata <- data.frame(cbind(x1, x2, y))
detach(gendata)
attach(usegendata)
#
#
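# (Aside: the MSPE estimates computed long-hand below
# all have the same form, so a small helper could be
# used for the fits whose predict methods accept a
# newdata argument (the lm, ols, loess, and ppr fits
# above). The sketch below is just an alternative way
# of getting the same numbers.)
mspe.check <- function(fit, newdata, truth) {
  mean( (truth - predict(fit, newdata=newdata))^2 )
}
# E.g., mspe.check(lm3, usegendata, usegendata$y) should
# agree with the lm3.mspe value computed below.
#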
# Now I will estimate the generalization errors.
lm1.mspe <- mean( (y - predict(lm1, newdata=usegendata))^2 )
lm2.mspe <- mean( (y - predict(lm2, newdata=usegendata))^2 )
lm3.mspe <- mean( (y - predict(lm3, newdata=usegendata))^2 )
lm4.mspe <- mean( (y - predict(lm4, newdata=usegendata))^2 )
spl.mspe <- mean( (y - predict(spl, newdata=usegendata))^2 )
lo1.mspe <- mean( (y - predict(lo1, newdata=usegendata))^2 )
lo2.mspe <- mean( (y - predict(lo2, newdata=usegendata))^2 )
knnbest <- kknn(y ~ ., trdata, usegendata, k=bestk.knn, distance=2, kernel="rectangular")
knnbest.mspe <- mean( (y - knnbest$fitted.values)^2 )
kernepan <- kknn(y ~ ., trdata, usegendata, k=bestk.kernepan, distance=2, kernel="epanechnikov")
kernepan.mspe <- mean( (y - kernepan$fitted.values)^2 )
kernbest <- kknn(y ~ ., trdata, usegendata, k=bestk.kernbest, distance=2, kernel=bestkern.kernbest)
kernbest.mspe <- mean( (y - kernbest$fitted.values)^2 )
ppr1.mspe <- mean( (y - predict(ppr1, newdata=usegendata))^2 )
ppr2.mspe <- mean( (y - predict(ppr2, newdata=usegendata))^2 )
#
# Here are the estimated (generalization) MSPE values:
lm1.mspe
lm2.mspe
lm3.mspe
lm4.mspe
spl.mspe
lo1.mspe
lo2.mspe
knnbest.mspe
kernepan.mspe
kernbest.mspe
ppr1.mspe
ppr2.mspe
# Since the standard deviation of the error term
# dist'n is 0.1, the irreducible error (for the
# MSPE) is 0.01.
# Below I will subtract the irreducible error of
# 0.01 from the estimated MSPE values, and divide
# each value so obtained by the value obtained from
# the OLS 3rd-order fit. This will make the values
# displayed at the end relative to the OLS 3rd-order
# fit.
(lm1.mspe - 0.01)/(lm3.mspe - 0.01)       # OLS 1st-order
(lm2.mspe - 0.01)/(lm3.mspe - 0.01)       # OLS 2nd-order
(lm3.mspe - 0.01)/(lm3.mspe - 0.01)       # OLS 3rd-order
(lm4.mspe - 0.01)/(lm3.mspe - 0.01)       # OLS 4th-order
(spl.mspe - 0.01)/(lm3.mspe - 0.01)       # spline
(lo1.mspe - 0.01)/(lm3.mspe - 0.01)       # loess 1st-order
(lo2.mspe - 0.01)/(lm3.mspe - 0.01)       # loess 2nd-order
(knnbest.mspe - 0.01)/(lm3.mspe - 0.01)   # knn --- k determined by c-v (minimum)
(kernepan.mspe - 0.01)/(lm3.mspe - 0.01)  # kernel reg --- Epanechnikov, k determined by c-v
(kernbest.mspe - 0.01)/(lm3.mspe - 0.01)  # kernel reg --- kernel & k determined by c-v
(ppr1.mspe - 0.01)/(lm3.mspe - 0.01)      # PPR 1 term
(ppr2.mspe - 0.01)/(lm3.mspe - 0.01)      # PPR 2 terms
#
# The good news is that C_p identified the best OLS
# model. The bad news (in a way) is that none of
# the alternatives did better (but see the note
# below about PPR), although I suspect that MARS
# will do quite well. This could be due to the fact
# that although the data was not generated to make
# a polynomial model perfect, it was generated from
# a single (not too messy) formula. With some real
# data, it may be that the response surface is less
# homogeneous, so that the same formula doesn't work
# for all parts.
### YIP-YAH!!! The two term PPR model that I added after
### having done the other parts previously turned out to
### be the winner. (It might be possible to do better
### using PPR by changing the df or the smoothing method.)
#
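# (Follow-up sketch of the closing remark above: one way
# to look for a better PPR fit would be to refit with a
# different df for the ridge-function splines, or to let
# GCV choose the amount of smoothing, and then recompute
# the estimated MSPE. The call below is only one example
# of such a refit --- the settings are not tuned choices.)
ppr2.gcv <- ppr(y ~ ., data=trdata, nterms=2, sm.method="gcvspline", optlevel=2)
summary(ppr2.gcv)
ppr2.gcv.mspe <- mean( (y - predict(ppr2.gcv, newdata=usegendata))^2 )
(ppr2.gcv.mspe - 0.01)/(lm3.mspe - 0.01)   # PPR 2 terms, GCV-chosen smoothing
#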