# First I'll bring in the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(class)
library(MASS)
library(Hmisc)
library(klaR)
library(e1071)
library(mvtnorm)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it's good to have
# it so that others can reproduce the results.
set.seed(632)
#
# I'll generate samples from 5-dimensional multivariate
# normal dist'ns for two classes. The classes will differ
# in the first two dimensions, and not at all in the last
# three dimensions (x3, x4, and x5 are drawn from the same
# dist'ns for both classes, so they are pure noise).
samp1a <- rmvnorm(50, mean=c(1,0), sigma=diag(1,2))
samp2a <- rmvnorm(50, mean=c(0,1), sigma=diag(1,2))
samp1b <- rnorm(50, 1, 1)
samp2b <- rnorm(50, 1, 1)
samp1c <- rmvnorm(50, mean=c(0,0), sigma=diag(1,2))
samp2c <- rmvnorm(50, mean=c(0,0), sigma=diag(1,2))
cl1 <- rep(1, 50)
cl2 <- rep(2, 50)
samp1 <- cbind(samp1a, samp1b, samp1c, cl1)
samp2 <- cbind(samp2a, samp2b, samp2c, cl2)
tr50data <- data.frame(rbind(samp1, samp2))
names(tr50data) <- c("x1", "x2", "x3", "x4", "x5", "pop")
attach(tr50data)
#
# I will use LDA with all 5 possible explanatory
# variables.
lda50all <- lda(pop ~ x1 + x2 + x3 + x4 + x5)
lda50all
#
# Now I will use stepwise LDA to try to find a
# subset of the explanatory variables that will
# do better than the full set of variables.
lda50sw <- stepclass(pop ~ x1 + x2 + x3 + x4 + x5,
                     data=tr50data, method="lda")
lda50sw
# Very interesting! The stepwise procedure indicates
# that not even x2 is useful if x1 is used. (Note that
# for the two classes, the sample means differ more
# for x1 than they do for x2.)
#
# Because the predict function doesn't work with stepclass
# objects, I'm going to use the lda function to create an
# object, corresponding to the stepclass result, that can
# be used with the predict function.
lda501 <- lda(pop ~ x1)
#
# In order to see if using just x1 is really best,
# I'm going to try using x1 and x2, and then also
# x1, x2, and x3.
lda502 <- lda(pop ~ x1 + x2)
lda503 <- lda(pop ~ x1 + x2 + x3)
#
# Before checking to see which set of variables
# leads to the smallest generalization error for the
# samples of size 50, I'm going to try samples of
# size 500.
detach(tr50data)
samp1a <- rmvnorm(500, mean=c(1,0), sigma=diag(1,2))
samp2a <- rmvnorm(500, mean=c(0,1), sigma=diag(1,2))
samp1b <- rnorm(500, 1, 1)
samp2b <- rnorm(500, 1, 1)
samp1c <- rmvnorm(500, mean=c(0,0), sigma=diag(1,2))
samp2c <- rmvnorm(500, mean=c(0,0), sigma=diag(1,2))
cl1 <- rep(1, 500)
cl2 <- rep(2, 500)
samp1 <- cbind(samp1a, samp1b, samp1c, cl1)
samp2 <- cbind(samp2a, samp2b, samp2c, cl2)
tr500data <- data.frame(rbind(samp1, samp2))
names(tr500data) <- c("x1", "x2", "x3", "x4", "x5", "pop")
attach(tr500data)
#
# I will use LDA with all 5 possible explanatory
# variables.
lda500all <- lda(pop ~ x1 + x2 + x3 + x4 + x5)
lda500all
#
# Now I will use stepwise LDA to try to find a
# subset of the explanatory variables that will
# do better than the full set of variables.
lda500sw <- stepclass(pop ~ x1 + x2 + x3 + x4 + x5,
                      data=tr500data, method="lda")
lda500sw
# The indication is that using just x1 and x2 will be best.
#
# Because the predict function doesn't work with stepclass
# objects, I'm going to use the lda function to create an
# object, corresponding to the stepclass result, that can
# be used with the predict function.
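# (An aside: it may be possible to avoid retyping the formula by
# pulling it out of the stepclass object itself --- if the klaR
# version behaves as I'm assuming, the selected formula is stored
# in a component of the fitted object, so something like
#   lda5002 <- lda(lda500sw$formula, data=tr500data)
# would give the same fit as the line below. Check str(lda500sw)
# to verify the component name before relying on this. Here I'll
# just retype the formula.)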
lda5002 <- lda(pop ~ x1 + x2)
#
# I'll try using x1, x2, and x3 to see how this compares
# when the generalization error rates are estimated.
lda5003 <- lda(pop ~ x1 + x2 + x3)
#
# Now I'll generate a large new set of data and estimate
# the generalization error rates (to find out if the
# stepwise procedure worked well).
detach(tr500data)
samp1a <- rmvnorm(5000, mean=c(1,0), sigma=diag(1,2))
samp2a <- rmvnorm(5000, mean=c(0,1), sigma=diag(1,2))
samp1b <- rnorm(5000, 1, 1)
samp2b <- rnorm(5000, 1, 1)
samp1c <- rmvnorm(5000, mean=c(0,0), sigma=diag(1,2))
samp2c <- rmvnorm(5000, mean=c(0,0), sigma=diag(1,2))
cl1 <- rep(1, 5000)
cl2 <- rep(2, 5000)
samp1 <- cbind(samp1a, samp1b, samp1c, cl1)
samp2 <- cbind(samp2a, samp2b, samp2c, cl2)
gendata <- data.frame(rbind(samp1, samp2))
names(gendata) <- c("x1", "x2", "x3", "x4", "x5", "pop")
attach(gendata)
#
# First I'll estimate the generalization error rates
# for the LDA classifiers constructed from the samples
# of size 50.
pred.lda50all <- predict(lda50all, newdata=gendata)
generr.lda50all <- mean( pred.lda50all$class != pop )
pred.lda501 <- predict(lda501, newdata=gendata)
generr.lda501 <- mean( pred.lda501$class != pop )
pred.lda502 <- predict(lda502, newdata=gendata)
generr.lda502 <- mean( pred.lda502$class != pop )
pred.lda503 <- predict(lda503, newdata=gendata)
generr.lda503 <- mean( pred.lda503$class != pop )
#
# Here are the estimated gen. error rates:
generr.lda501   # classifier picked by the stepwise procedure --- just uses x1
generr.lda502   # classifier using x1 and x2
generr.lda503   # classifier using x1, x2, and x3
generr.lda50all # classifier using all of the variables (incl. some that are noise)
#
# Conclusion: Stepwise LDA did BADLY! Perhaps it
# shouldn't be trusted with such small samples.
# (It may be that the bad result could have been
# avoided if the default value of improvement had
# not been used --- try using improvement=0.02 as
# an additional argument with stepclass. I would
# definitely override the default with a large
# training set, since in that case the c-v estimates
# can be taken as being more accurate, and an estimated
# improvement as small as 0.01 might result because one
# method is just a tad better than another one.)
# It can also be noted that using predictors that are
# weaker than the strongest ones --- here, ones which
# are pure noise --- didn't really hurt things a lot.
# While the classifier using just the two strongest
# predictors did best, the ones using more predictors
# were almost as good.
#
# Now I'll estimate the generalization error rates
# for the LDA classifiers constructed from the samples
# of size 500.
pred.lda500all <- predict(lda500all, newdata=gendata)
generr.lda500all <- mean( pred.lda500all$class != pop )
pred.lda5002 <- predict(lda5002, newdata=gendata)
generr.lda5002 <- mean( pred.lda5002$class != pop )
pred.lda5003 <- predict(lda5003, newdata=gendata)
generr.lda5003 <- mean( pred.lda5003$class != pop )
#
# Here are the estimated gen. error rates:
generr.lda5002   # classifier picked by the stepwise procedure --- just uses x1 & x2
generr.lda5003   # classifier using x1, x2, and x3
generr.lda500all # classifier using all of the variables (incl. some that are noise)
#
# Conclusion: Stepwise LDA did _okay_ --- BUT it
# can also be noted that using predictors that are
# weaker than the strongest ones --- here, ones which
# are pure noise --- didn't really hurt things a lot.
# While the classifier using just the two strongest
# predictors did best, the ones using more predictors
# were almost as good.
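#
# (For reference: since the two populations are equally-likely
# multivariate normals with a common identity covariance matrix,
# the optimal (Bayes) error rate can be computed exactly. It is
# pnorm(-delta/2), where delta is the distance between the two
# mean vectors --- here the means differ by 1 in each of the
# first two dimensions and nowhere else.)
delta <- sqrt(1^2 + 1^2)
pnorm(-delta/2)   # about 0.24 --- no classifier can do better on average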
#
#
# My guess is that eliminating weak and pure-noise
# variables as predictors is much more important with
# local methods than with LDA and QDA, since with
# LDA and QDA the weak predictors can be "recognized"
# as being weak, and when the decision boundary is
# determined, they can be largely ignored.
#
# Note: The stepclass function can be used with
# method="qda" instead of method="lda" --- I don't
# know all of the methods that can be used with
# stepclass (see ?stepclass).
#
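#
# (A quick, rough check of the guess above about local methods,
# using k-nearest-neighbors from the class library loaded at the
# top: compare the estimated generalization error using all 5
# predictors against using just x1 and x2. The choice k = 15 is
# arbitrary --- this is only a sketch, and a proper comparison
# would select k by cross-validation.)
knn.all <- knn(tr500data[, 1:5], gendata[, 1:5],
               factor(tr500data$pop), k=15)
mean( knn.all != gendata$pop )    # est. gen. error, all 5 predictors
knn.x1x2 <- knn(tr500data[, 1:2], gendata[, 1:2],
                factor(tr500data$pop), k=15)
mean( knn.x1x2 != gendata$pop )   # est. gen. error, x1 and x2 only
#
# (And here is what the qda variant mentioned in the note would
# look like, applied to the samples of size 50:)
qda50sw <- stepclass(pop ~ x1 + x2 + x3 + x4 + x5,
                     data=tr50data, method="qda")
qda50sw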