# First I'll bring in the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(class)
library(MASS)
library(Hmisc)
library(classPP)
library(klaR)
library(e1071)
library(kknn)
library(rpart)
library(boost)
library(mvtnorm)
library(multinomRob)
library(lars)
library(stats)
library(leaps)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it's good to be
# able to have it so that others can reproduce the
# results.
set.seed(632)
#
# I'll generate the training data.
x1 <- runif(50, 0, 1)
x2 <- x1 + rnorm(50, 0, 0.25)
x3 <- (x1 + x2)/2 + runif(50, 0, 0.1)
x4 <- runif(50, 0, 1)
x5 <- (2*x4 + rnorm(50, 0, 0.25))/2 + runif(50, 0, 0.1)
x6 <- runif(50, 0, 1)
y <- (3 + x1 + x2 + 0.5*x3 + 0.75*x4 + 0.5*x5 + 0.5*x6 + rnorm(50, 0, 1))
#
# Since I'm going to use some methods for which
# standardized predictors are generally preferred,
# to keep things simpler, I'll use standardized
# predictors with all of the methods.
x <- scale( cbind(x1,x2,x3,x4,x5,x6) )
trdata <- data.frame( cbind(x,y) )
names(trdata) <- c("sx1", "sx2", "sx3", "sx4", "sx5", "sx6", "y")
attach(trdata)
cor(trdata)
#
# I'll use OLS to fit a linear model
# using all of the (standardized) predictors.
ols1 <- lm(y ~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6)
summary(ols1)
#
# I'll use a backwards elimination strategy
# and fit a model with sx6 omitted.
ols2 <- lm(y ~ sx1 + sx2 + sx3 + sx4 + sx5)
summary(ols2)
#
# I'll now drop sx4.
ols3 <- lm(y ~ sx1 + sx2 + sx3 + sx5)
summary(ols3)
#
# Next, I'll drop sx2.
ols4 <- lm(y ~ sx1 + sx3 + sx5)
summary(ols4)
#
# Now, I'll drop sx3.
ols5 <- lm(y ~ sx1 + sx5)
summary(ols5)
#
# At this point I'll stop with the backwards
# elimination, and check to see what a stepwise
# procedure gives me.
ols6 <- step(ols1, direction="both")
summary(ols6)
# So, the stepwise regression based on AIC led
# to the same model as backwards elimination.
#
# If one wanted to pursue OLS models more,
# one could use
#   bestss <- regsubsets(y ~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6, data=trdata)
#   summary.regsubsets(bestss)
# to see that backwards elimination did not
# even arrive at the best two-variable model.
# But to deal sensibly with the output of a
# best subsets regression routine, one needs a
# way to compare the goodness of models having
# different numbers of variables. (I'll say
# more about this later in class.)
#
# Perform the lasso using the lars function,
# noting that the syntax is different
# --- one needs to give it the design
# matrix.
las <- lars(x, y, type="lasso")
las
plot(las, plottype="coefficients")
#
plot(las, plottype="Cp")
# The C_p appears to be minimized at a
# fraction of about 0.56.
#
# By using cross-validation with the lasso,
# a good (hopefully near-optimal) value for
# the "fraction" can be determined.
cvlas <- cv.lars(x, y, type="lasso")
cvlas
frac <- cvlas$fraction[which.min(cvlas$cv)]
frac
las.coef <- predict.lars(las, type="coefficients", mode="fraction", s=frac)
las.coef
# I got the same results when using the
# unstandardized predictors.
#
# As a check, let's see if setting the value of
# s (the fraction, in the mode being used) to 1
# yields the coefficient values from the OLS fit.
las.coef <- predict.lars(las, type="coefficients", mode="fraction", s=1)
las.coef
ols1
# They match!
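#
# (Aside: rather than reading the C_p-minimizing fraction off of the
# plot, it can be located programmatically. This is just a sketch,
# and it assumes that the lars object stores its C_p values in the
# Cp component and the coefficient path in the beta component, as in
# the version of the lars package I have in mind.)
l1.norms <- rowSums(abs(las$beta))   # L1 norm of the coef's at each step
fracs <- l1.norms/max(l1.norms)      # the corresponding "fraction" values
fracs[which.min(las$Cp)]             # fraction at which C_p is smallest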
#
# Now, I'll try ridge regression ---
# I'll give it a sequence of values
# for lambda, since it's not clear at
# this point what value to use.
lmridge <- lm.ridge(y ~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6, lambda = seq(0, 10, 1))
lmridge$kHKB
lmridge$kLW
lmridge$GCV
# Two estimates of (a good) lambda are
# about 2.7 and 4.0, and generalized
# cross-validation suggests using a
# value between 6 and 8.
lmridge <- lm.ridge(y ~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6, lambda = seq(6.72, 6.84, 0.01))
lmridge$GCV
#
# After some searching, I determined that
# the value of lambda which minimizes the
# generalized cross-validation estimate of
# the error is about 6.8. Two estimation
# methods produced estimated lambdas of
# about 2.7 and 4.0. So I will fit three
# ridge regression models, using lambda
# values of 2.7, 4.0, and 6.8. As a check,
# I will fit a fourth model using 0 for lambda,
# which should be the same as OLS.
ridge1 <- lm.ridge(y ~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6, lambda = 2.7)
ridge2 <- lm.ridge(y ~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6, lambda = 4.0)
ridge3 <- lm.ridge(y ~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6, lambda = 6.8)
ridge4 <- lm.ridge(y ~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6, lambda = 0)
# Let's look to see if the fitted coefficients
# match those from the OLS fit.
ols1
ridge4$coef
# They are rather close, but not exactly
# the same.
#
# Now I will do principal components regression.
#
# The first step is to use the standardized training
# data to determine the principal components.
tr.pca <- princomp(~ sx1 + sx2 + sx3 + sx4 + sx5 + sx6)
# (Note: Normally, one might put cor=T, but here
# the variables have already been standardized.)
summary(tr.pca)
plot(tr.pca)
loadings(tr.pca)
#
# The next step is to compute the p.c. variable
# values from the training data.
pcomp.tr <- predict(tr.pca)
pcomp.tr
#
# Now I will "pick off" the p.c. variable vectors.
# (Note: One wouldn't have to do it this way.)
pc1 <- pcomp.tr[,1]
pc2 <- pcomp.tr[,2]
pc3 <- pcomp.tr[,3]
pc4 <- pcomp.tr[,4]
pc5 <- pcomp.tr[,5]
pc6 <- pcomp.tr[,6]
#
# And now I will do the regressions.
# I'll start by using all of the p.c's,
# which should give me a fit equivalent
# to the OLS fit based on all of the var's.
pcr1 <- lm(y ~ pc1 + pc2 + pc3 + pc4 + pc5 + pc6)
summary(pcr1)
#
# I'll use backwards elimination, and first omit pc3.
pcr2 <- lm(y ~ pc1 + pc2 + pc4 + pc5 + pc6)
summary(pcr2)
#
# I'll now omit pc5.
pcr3 <- lm(y ~ pc1 + pc2 + pc4 + pc6)
summary(pcr3)
#
# And finally I'll omit pc6.
pcr4 <- lm(y ~ pc1 + pc2 + pc4)
summary(pcr4)
#
# I'll next see what a stepwise procedure gives me.
pcr5 <- step(pcr1)
summary(pcr5)
# So, the stepwise regression based on AIC led
# to the same model as backwards elimination.
# So, I'll go with the model based on the 1st,
# 2nd, and 4th p.c's, and use the one based on
# all of the p.c's as a check (below).
#
# I will generate a large data set that can be
# used to estimate the generalization errors.
gx1 <- runif(5000, 0, 1)
gx2 <- gx1 + rnorm(5000, 0, 0.25)
gx3 <- (gx1 + gx2)/2 + runif(5000, 0, 0.1)
gx4 <- runif(5000, 0, 1)
gx5 <- (2*gx4 + rnorm(5000, 0, 0.25))/2 + runif(5000, 0, 0.1)
gx6 <- runif(5000, 0, 1)
gy <- (3 + gx1 + gx2 + 0.5*gx3 + 0.75*gx4 + 0.5*gx5 + 0.5*gx6 + rnorm(5000, 0, 1))
#
# To use this data with the previously
# fit models to get sensible predictions,
# it has to be standardized in the same
# way --- the sample means and sample
# variances from the training data need
# to be used!
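#
# (Aside: since x was created with scale(), the training-sample
# means and standard deviations are stored as the "scaled:center"
# and "scaled:scale" attributes of x, and scale() can reuse them.
# This sketch should give the same standardized values as the
# sgx1, ..., sgx6 computed below; gx.alt is just a name I'm using
# for the alternative version.)
gx.alt <- scale( cbind(gx1,gx2,gx3,gx4,gx5,gx6),
                 center=attr(x, "scaled:center"),
                 scale=attr(x, "scaled:scale") )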
sgx1 <- (gx1 - mean(x1))/sqrt(var(x1))
sgx2 <- (gx2 - mean(x2))/sqrt(var(x2))
sgx3 <- (gx3 - mean(x3))/sqrt(var(x3))
sgx4 <- (gx4 - mean(x4))/sqrt(var(x4))
sgx5 <- (gx5 - mean(x5))/sqrt(var(x5))
sgx6 <- (gx6 - mean(x6))/sqrt(var(x6))
gx <- cbind(sgx1,sgx2,sgx3,sgx4,sgx5,sgx6)
gendata <- data.frame( cbind(gx, gy) )
names(gendata) <- c("sx1", "sx2", "sx3", "sx4", "sx5", "sx6", "y")
detach(trdata)
attach(gendata)
#
# I will plug the new data values into the
# p.c. formulas determined above.
pcomp.gen <- predict(tr.pca, newdata=gendata)
cor(pcomp.gen)
#
# Now I will estimate the generalization errors.
ols1.mspe <- mean( (gy - predict(ols1, newdata=gendata))^2 )
ols5.mspe <- mean( (gy - predict(ols5, newdata=gendata))^2 )
predlas <- predict.lars(las, newx=gx, type="fit", mode="fraction", s=frac)
las1.mspe <- mean( (gy - predlas$fit)^2 )
predlas <- predict.lars(las, newx=gx, type="fit", mode="fraction", s=0.56)
las2.mspe <- mean( (gy - predlas$fit)^2 )
predlas <- predict.lars(las, newx=gx, type="fit", mode="fraction", s=1)
las3.mspe <- mean( (gy - predlas$fit)^2 )
# I can't find a function which obtains predicted values
# using an lm.ridge object, so I will compute them as
# shown below. (Note: The ym component's value is the
# sample mean of the training sample response values,
# which (see the top of p. 60 of HTF) is the proper
# estimate of the intercept if the predictors were centered.)
ridge1.mspe <- mean( (gy - (gx %*% ridge1$coef + ridge1$ym))^2 )
ridge2.mspe <- mean( (gy - (gx %*% ridge2$coef + ridge2$ym))^2 )
ridge3.mspe <- mean( (gy - (gx %*% ridge3$coef + ridge3$ym))^2 )
ridge4.mspe <- mean( (gy - (gx %*% ridge4$coef + ridge4$ym))^2 )
#
genpcdata <- data.frame(pcomp.gen)
names(genpcdata) <- c("pc1", "pc2", "pc3", "pc4", "pc5", "pc6")
detach(gendata)
attach(genpcdata)
pcr1.mspe <- mean( (gy - predict(pcr1, newdata=genpcdata))^2 )
pcr4.mspe <- mean( (gy - predict(pcr4, newdata=genpcdata))^2 )
#
# Here are the estimated mean squared prediction errors.
# (I'll start by giving the ones which are equiv. to the
# OLS fit based on all of the predictors.)
ridge4.mspe   ### ridge (lambda for OLS equiv.)
las3.mspe     ### lasso (fraction for OLS equiv.)
pcr1.mspe     ### prin. comp. reg. (using all p.c's --- equiv. to ols1)
ols1.mspe     ### OLS (all variables)
ols5.mspe     ### OLS (2 var's from backwards elimination (also stepwise))
pcr4.mspe     ### prin. comp. reg. (using 3 p.c's)
las1.mspe     ### lasso (fraction by c-v)
las2.mspe     ### lasso (fraction by C_p)
ridge1.mspe   ### ridge (lambda by HKB)
ridge2.mspe   ### ridge (lambda by L-W)
ridge3.mspe   ### ridge (lambda by gen. c-v)
# The last value is the smallest of them all.
# While the prediction error was only decreased
# by about 8%, it should be kept in mind that
# there is a relatively large irreducible error
# (due to the variance of the error term).
# If the mean irreducible error, which is equal
# to 1, is subtracted from all of the estimated
# errors, then this last result indicates that
# the reducible error was decreased by about 50%.
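#
# (Aside: a small sketch that collects the estimates above into a
# named vector so they can be sorted and compared, and that puts
# them on the reducible-error scale by subtracting the irreducible
# error of 1. The names are just labels I've made up; the objects
# are the ones computed above.)
mspe <- c(ridge.ols=ridge4.mspe, lasso.ols=las3.mspe, pcr.all=pcr1.mspe,
          ols.all=ols1.mspe, ols.backelim=ols5.mspe, pcr.3pc=pcr4.mspe,
          lasso.cv=las1.mspe, lasso.Cp=las2.mspe, ridge.HKB=ridge1.mspe,
          ridge.LW=ridge2.mspe, ridge.gcv=ridge3.mspe)
sort(mspe)
sort(mspe - 1)   # estimated reducible errors
# The proportional reduction in reducible error, comparing the
# gen. c-v ridge fit to the full OLS fit, should come out near
# the roughly 50% figure noted above.
(ols1.mspe - ridge3.mspe)/(ols1.mspe - 1)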