# First I'll bring in the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(class)
library(MASS)
library(Hmisc)
library(klaR)
library(e1071)
library(mvtnorm)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it's good to have
# it so that others can reproduce the results.
set.seed(632)
#
# I'll generate samples from 5-dimensional multivariate
# normal dist'ns for two classes. The classes will differ
# in the first two dimensions, and not at all in the last
# three dimensions (x3, x4, and x5 are drawn from the same
# dist'ns for both classes, so they are pure noise).
samp1a <- rmvnorm(50, mean=c(1,0), sigma=diag(1,2))
samp2a <- rmvnorm(50, mean=c(0,1), sigma=diag(1,2))
samp1b <- rnorm(50, 1, 1)
samp2b <- rnorm(50, 1, 1)
samp1c <- rmvnorm(50, mean=c(0,0), sigma=diag(1,2))
samp2c <- rmvnorm(50, mean=c(0,0), sigma=diag(1,2))
cl1 <- rep(1, 50)
cl2 <- rep(2, 50)
samp1 <- cbind(samp1a, samp1b, samp1c, cl1)
samp2 <- cbind(samp2a, samp2b, samp2c, cl2)
tr50data <- data.frame(rbind(samp1, samp2))
names(tr50data) <- c("x1", "x2", "x3", "x4", "x5", "pop")
attach(tr50data)
#
# I will use LDA with all 5 possible explanatory
# variables.
lda50all <- lda(pop ~ x1 + x2 + x3 + x4 + x5)
lda50all
#
# Now I will use stepwise LDA to try to find a
# subset of the explanatory variables that will
# do better than the full set of variables.
lda50sw <- stepclass(pop ~ x1 + x2 + x3 + x4 + x5,
                     data=tr50data, method="lda")
lda50sw
# Very interesting! The stepwise procedure indicates
# that not even x2 is useful if x1 is used. (Note that
# for the two classes, the sample means differ more
# for x1 than they do for x2.)
#
# Because the predict function doesn't work with stepclass
# objects, I'm going to use the lda function to create an
# object, corresponding to the stepclass result, that can
# be used with the predict function.
lda501 <- lda(pop ~ x1)
#
# In order to see if using just x1 is really best,
# I'm going to try using x1 and x2, and then also
# x1, x2, and x3.
lda502 <- lda(pop ~ x1 + x2)
lda503 <- lda(pop ~ x1 + x2 + x3)
#
# Before checking to see which set of variables
# leads to the smallest generalization error for the
# samples of size 50, I'm going to try samples of
# size 500.
detach(tr50data)
samp1a <- rmvnorm(500, mean=c(1,0), sigma=diag(1,2))
samp2a <- rmvnorm(500, mean=c(0,1), sigma=diag(1,2))
samp1b <- rnorm(500, 1, 1)
samp2b <- rnorm(500, 1, 1)
samp1c <- rmvnorm(500, mean=c(0,0), sigma=diag(1,2))
samp2c <- rmvnorm(500, mean=c(0,0), sigma=diag(1,2))
cl1 <- rep(1, 500)
cl2 <- rep(2, 500)
samp1 <- cbind(samp1a, samp1b, samp1c, cl1)
samp2 <- cbind(samp2a, samp2b, samp2c, cl2)
tr500data <- data.frame(rbind(samp1, samp2))
names(tr500data) <- c("x1", "x2", "x3", "x4", "x5", "pop")
attach(tr500data)
#
# I will use LDA with all 5 possible explanatory
# variables.
lda500all <- lda(pop ~ x1 + x2 + x3 + x4 + x5)
lda500all
#
# Now I will use stepwise LDA to try to find a
# subset of the explanatory variables that will
# do better than the full set of variables.
lda500sw <- stepclass(pop ~ x1 + x2 + x3 + x4 + x5,
                      data=tr500data, method="lda")
lda500sw
# The indication is that using just x1 and x2 will be best.
#
# Because the predict function doesn't work with stepclass
# objects, I'm going to use the lda function to create an
# object, corresponding to the stepclass result, that can
# be used with the predict function.
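# (An aside: it may be possible to avoid retyping the formula by
# pulling it out of the stepclass object itself --- if the klaR
# version behaves as I'm assuming, the selected formula is stored
# in a component of the fitted object, so something like
#   lda5002 <- lda(lda500sw$formula, data=tr500data)
# would give the same fit as the line below. Check str(lda500sw)
# to verify the component name before relying on this. Here I'll
# just retype the formula.)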
lda5002 <- lda(pop ~ x1 + x2)
#
# I'll try using x1, x2, and x3 to see how this compares
# when the generalization error rates are estimated.
lda5003 <- lda(pop ~ x1 + x2 + x3)
#
# Now I'll generate a large new set of data and estimate
# the generalization error rates (to find out if the
# stepwise procedure worked well).
detach(tr500data)
samp1a <- rmvnorm(5000, mean=c(1,0), sigma=diag(1,2))
samp2a <- rmvnorm(5000, mean=c(0,1), sigma=diag(1,2))
samp1b <- rnorm(5000, 1, 1)
samp2b <- rnorm(5000, 1, 1)
samp1c <- rmvnorm(5000, mean=c(0,0), sigma=diag(1,2))
samp2c <- rmvnorm(5000, mean=c(0,0), sigma=diag(1,2))
cl1 <- rep(1, 5000)
cl2 <- rep(2, 5000)
samp1 <- cbind(samp1a, samp1b, samp1c, cl1)
samp2 <- cbind(samp2a, samp2b, samp2c, cl2)
gendata <- data.frame(rbind(samp1, samp2))
names(gendata) <- c("x1", "x2", "x3", "x4", "x5", "pop")
attach(gendata)
#
# First I'll estimate the generalization error rates
# for the LDA classifiers constructed from the samples
# of size 50.
pred.lda50all <- predict(lda50all, newdata=gendata)
generr.lda50all <- mean( pred.lda50all$class != pop )
pred.lda501 <- predict(lda501, newdata=gendata)
generr.lda501 <- mean( pred.lda501$class != pop )
pred.lda502 <- predict(lda502, newdata=gendata)
generr.lda502 <- mean( pred.lda502$class != pop )
pred.lda503 <- predict(lda503, newdata=gendata)
generr.lda503 <- mean( pred.lda503$class != pop )
#
# Here are the estimated gen. error rates:
generr.lda501   # classifier picked by the stepwise procedure --- just uses x1
generr.lda502   # classifier using x1 and x2
generr.lda503   # classifier using x1, x2, and x3
generr.lda50all # classifier using all of the variables (incl. some that are noise)
#
# Conclusion: Stepwise LDA did BADLY! Perhaps it
# shouldn't be trusted with such small samples.
# (It may be that the bad result could have been
# avoided if the default value of improvement had
# not been used --- try using improvement=0.02 as
# an additional argument with stepclass. I would
# definitely override the default with a large
# training set, since in that case the c-v estimates
# can be taken as being more accurate, and an estimated
# improvement as small as 0.01 might result because one
# method is just a tad better than another one.)
# It can also be noted that using predictors that are
# weaker than the strongest ones --- here, ones which
# are pure noise --- didn't really hurt things a lot.
# While the classifier using just the two strongest
# predictors did best, the ones using more predictors
# were almost as good.
#
# Now I'll estimate the generalization error rates
# for the LDA classifiers constructed from the samples
# of size 500.
pred.lda500all <- predict(lda500all, newdata=gendata)
generr.lda500all <- mean( pred.lda500all$class != pop )
pred.lda5002 <- predict(lda5002, newdata=gendata)
generr.lda5002 <- mean( pred.lda5002$class != pop )
pred.lda5003 <- predict(lda5003, newdata=gendata)
generr.lda5003 <- mean( pred.lda5003$class != pop )
#
# Here are the estimated gen. error rates:
generr.lda5002   # classifier picked by the stepwise procedure --- just uses x1 & x2
generr.lda5003   # classifier using x1, x2, and x3
generr.lda500all # classifier using all of the variables (incl. some that are noise)
#
# Conclusion: Stepwise LDA did _okay_ --- BUT it
# can also be noted that using predictors that are
# weaker than the strongest ones --- here, ones which
# are pure noise --- didn't really hurt things a lot.
# While the classifier using just the two strongest
# predictors did best, the ones using more predictors
# were almost as good.
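#
# (For reference: since the two populations are equally-likely
# multivariate normals with a common identity covariance matrix,
# the optimal (Bayes) error rate can be computed exactly. It is
# pnorm(-delta/2), where delta is the distance between the two
# mean vectors --- here the means differ by 1 in each of the
# first two dimensions and nowhere else.)
delta <- sqrt(1^2 + 1^2)
pnorm(-delta/2)   # about 0.24 --- no classifier can do better on average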
#
#
# My guess is that eliminating weak and pure-noise
# variables as predictors is much more important with
# local methods than with LDA and QDA, since with
# LDA and QDA the weak predictors can be "recognized"
# as being weak, and when the decision boundary is
# determined, they can be largely ignored.
#
# Note: The stepclass function can be used with
# method="qda" instead of method="lda" --- I don't
# know all of the methods that can be used with
# stepclass (see ?stepclass).
#
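#
# (A quick, rough check of the guess above about local methods,
# using k-nearest-neighbors from the class library loaded at the
# top: compare the estimated generalization error using all 5
# predictors against using just x1 and x2. The choice k = 15 is
# arbitrary --- this is only a sketch, and a proper comparison
# would select k by cross-validation.)
knn.all <- knn(tr500data[, 1:5], gendata[, 1:5],
               factor(tr500data$pop), k=15)
mean( knn.all != gendata$pop )    # est. gen. error, all 5 predictors
knn.x1x2 <- knn(tr500data[, 1:2], gendata[, 1:2],
                factor(tr500data$pop), k=15)
mean( knn.x1x2 != gendata$pop )   # est. gen. error, x1 and x2 only
#
# (And here is what the qda variant mentioned in the note would
# look like, applied to the samples of size 50:)
qda50sw <- stepclass(pop ~ x1 + x2 + x3 + x4 + x5,
                     data=tr50data, method="qda")
qda50sw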