# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2.

      # Note: Developed with vesion 2.9.0.
      # Note: I designed this so that it'll work well to paste the commands into R in chunks.
          #   Each time, go down to the next line with just ##### on it and paste all of the 
          #   commands in at one time.  Then after the commands are executed, look at the 
          #   comments and output and look at the graph before pasting in the next chunk of
          #   commands.  (If you paste everything in all at once you won't have much time to 
          #   look at the graphics.)


# I'll load some libraries and set the random number seed.

library(class)
library(MASS)
library(e1071) 
library(kknn)
library(klaR)
library(nnet)

set.seed(7)


#######################################################
# Reading in Data and Creation of Canonical Variables #                            
#######################################################

# I'll read in the training data and delete the unnecessary case id column.

fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T)
trndat <- fulltrndat[,-1]

# Next I'll scale the data. 

sctrn.vars <- scale(trndat[,-1], center=TRUE, scale=TRUE)
sctrndat <- data.frame( cbind( trndat[,1], sctrn.vars ) )
names(sctrndat) <- names(trndat)

# Now I will compute the canonical variables.
# (Note: Some classification will be done using original variables, 
# and some will be done using canonical variables.)

scldafit <- lda(y~., sctrndat)
canvar.trn <- as.matrix(sctrndat[,-1]) %*% scldafit$scaling
canvartrn <- data.frame( cbind( trndat[,1], canvar.trn ) )
names(canvartrn) <- c("y","cv1","cv2","cv3","cv4","cv5","cv6","cv7","cv8","cv9","cv10")

# Now read in test data and create canonical variables.

fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T)
tstdat <- fulltstdat[,-1]

sctst.vars <- scale(tstdat[,-1],center=attr(sctrn.vars,"scaled:center"),
                                scale=attr(sctrn.vars,"scaled:scale"))
sctstdat <- data.frame( cbind(tstdat[,1], sctst.vars ) )
names(sctstdat) <- names(sctrndat)

canvar.tst <- as.matrix(sctstdat[,-1]) %*% scldafit$scaling
canvartst <- data.frame( cbind( tstdat[,1], canvar.tst ) )
names(canvartst) <- names(canvartrn)


############################################
# Classification Using Logistic Regression #
############################################

# P. 173 of the 2002 Sage book
#   An R and S-Plus Companion to Applied Regression
# by John Fox states 
#   "The multinomial logit model cannot be fit by glm, 
#    but the multinom function in the nnet library ...
#    will do the trick".

# The help page for multinom indicates that the predictor 
# variables should be roughly scaled to [0,1] or the fit
# will be slow or may not converge at all.  When I used 
# multinom with 10 predictors, whether they were from the 
# original data, the scaled data, or the canonical variables, 
# it ran for 100 iterations and did not converge (although
# the resulting predictions were not horrible).  The scaled
# predictors have standard deviation 1, but of course the 
# values do not all belong to [0,1] (since the sample mean 
# is 0 for each of the scaled predictors).  I'll see what 
# happens if I do a linear transformation on each of the 
# original predictors to make them have range [0,1].  (Note:
# Any linear transformation of the original predictors,
# whether this one, or the one corresponding to scaling the
# data, ought to result in the same fitted model ... if 
# convergence occurs.)

transform.trn <- matrix(0, nrow=nrow(trndat), ncol=(ncol(trndat)-1))
transform.tst <- matrix(0, nrow=nrow(tstdat), ncol=(ncol(tstdat)-1))
for (i in 2:11) 
{
   transform.trn[,i-1] <- (trndat[,i]-min(trndat[,i]))/(max(trndat[,i])-min(trndat[,i])) 
   transform.tst[,i-1] <- (tstdat[,i]-min(trndat[,i]))/(max(trndat[,i])-min(trndat[,i]))
}
# (Note: Above the test data was transformed the same way as the training data ...
# so used min and max values from training data to transform test data.)
transtrn <- data.frame( cbind( trndat[,1], transform.trn ) )
names(transtrn) <- names(trndat)
transtst <- data.frame( cbind( tstdat[,1], transform.tst ) )
names(transtst) <- names(tstdat)

logreg.fit1 <- multinom(as.factor(y) ~ ., data=transtrn)

# We still don't achieve convergence.  (Often when a log reg fit
# fails to converge, some of the coef values and/or estimated 
# standard error values are rather extreme, but this doesn't happen 
# in a big way here (see output below).)  (Note: Wald = T below results 
# in Wald test statistics being given (so one can see if there are any 
# unimportant variables).)

summary(logreg.fit1, Wald=T)

# One way to choose a decent logistic regression model from amongst
# a collection of fitted models based on different predictors is to 
# choose the one yielding the smallest value of the AIC.  We can 
# note that here we got a value of about 903.

# Let's see how well the fitted model predicts the classes for the 
# test set.  (Note: In practice one may not have a test set to use
# to get estimates of the generalizatiion error rates, and so it's
# good to have some other way to select a model.)

pred.logreg <- predict(logreg.fit1, newdata=transtst, type="class")
logreg.rate <-  mean( pred.logreg != transtst[,1] )
logreg.rate

#####


# Maybe there just isn't enough data to fit 110 parameters well.
# So I'll now try using the first three canonical variables.

logreg.fit2 <- multinom(as.factor(y) ~ ., data=canvartrn[1:4])
summary(logreg.fit2, Wald=T)   
pred.logreg <- predict(logreg.fit2, newdata=canvartst[1:4], type="class")
logreg.rate <-  mean( pred.logreg != canvartst[,1] )
logreg.rate

# This time we got convergence, although the AIC (1085) is larger
# (as is the error rate of the test sample predictions).

#####

# Now I'll just use the first two canonical variables.

logreg.fit3 <- multinom(as.factor(y) ~ ., data=canvartrn[1:3])
summary(logreg.fit3, Wald=T)   
pred.logreg <- predict(logreg.fit3, newdata=canvartst[1:3], type="class")
logreg.rate <-  mean( pred.logreg != canvartst[,1] )
logreg.rate

# Even though the 2nd model (using one fewer canonical variable) did a
# better job predicting the classes of the test set, in practice we may
# want to be able to choose a model w/o having test set performance
# results.  If we base model selection on minimizing the AIC, we'd
# prefer the model based on the larger number of variables.  (Note: 
# (7.27) on p. 230 of HTF2 indicates that when the sample size is 
# somewhat large, a small difference in the number of variables may
# have little effect on the AIC.)

# We can use the anove f'n to do a test to see if there is significant
# evidence that the coefficient of cv3 is nonzero.  

compare <- anova(logreg.fit3, logreg.fit2)
compare

# There is strong evidence that at least one of the coefficients 
# of cv3 is nonzero.  (Recall there are 10 coefficients of cv3 in
# an 11-class model.)  So two things (AIC and this test result) 
# suggest it's better to include cv3 than to not include it (even 
# though test set performance indicates otherwise).

#####

# Since logistic regression results in piecewise linear decision
# boundaries, and previous exploration of and experimentation with
# the data suggests that perhaps having curved boundaries might be 
# better, let's see what happens if 2nd-order terms are included.

logreg.fit4 <- multinom(as.factor(y) ~ cv1 + cv2 + I(cv1^2) + I(cv2^2)
                                      + I(cv1*cv2), data=canvartrn[1:3])

# It can be noted that although convergence didn't occur, based on the
# values given above, one might think it was close to convergence when it 
# stopped.  So maybe we need not worry so much about the lack of convergence.
# (Notes: (1) From the R documentation, I don't see how one can increase
# the number of iterations.  (2) The value given with the iterations is
# the absolute value of the log-likelihood (evaluated at the mles).)

summary(logreg.fit4, Wald=T)   
compare <- anova(logreg.fit3, logreg.fit4)
compare

# The 2nd-order model based on cv1 and cv2 has a smaller AIC than the
# 1-st order model based on these two variables.  Also, a chi-square 
# test indicated that the 2nd-order terms are collectively significant.

pred.logreg <- predict(logreg.fit4, newdata=canvartst[1:3], type="class")
logreg.rate <-  mean( pred.logreg != canvartst[,1] )
logreg.rate

# Even though convergence didn't occur, this produced the lowest AIC so 
# far (and also the lowest error rate from the test sample).

# Let's try a 2nd-order model using first three canonical variables.

logreg.fit5 <- multinom(as.factor(y) ~ cv1 + cv2 + cv3 + I(cv1^2) + 
                                      + I(cv2^2) + I(cv3^2) +
                                      + I(cv1*cv2) + I(cv1*cv3) +
                                      + I(cv2*cv3), data=canvartrn[1:4])
summary(logreg.fit5, Wald=T)   
compare <- anova(logreg.fit2, logreg.fit5)
compare

# The 2nd-order model based on cv1, cv2, and cv3 has a smaller AIC than 
# the 1-st order model based on these three variables.  (It's the smallest
# AIC we've obtained so far from models involving the canonical variables.)  
# Also, a chi-square test indicated that the 2nd-order terms are collectively 
# significant.

# So now it seems sensible to compare the two 2nd-order models.

compare <- anova(logreg.fit4, logreg.fit5)
compare

# The larger 2nd-order model (involving cv3) has a smaller AIC 
# (832 vs. 922), and also the chi-square test indicates the terms
# involving cv3 are collectively significant.

# Let's see how well the 2nd-order model based on the first three
# canonical variables predicts the test set.

pred.logreg <- predict(logreg.fit5, newdata=canvartst[1:4], type="class")
logreg.rate <-  mean( pred.logreg != canvartst[,1] )
logreg.rate

# Lower AIC, but worse performance on test set (50.2% errors vs. 47.4%
# errors from 2nd-order model based on just two canonical variables).

#####


# Since previous experience with this data indicated that 
# x.1 and x.2 were the best predictors, and that using x.8
# might also be useful (but not necessarily), let's investigate
# some models using a subset of the best original predictors
# (transformed to make all of the value lie in [0,1]).  (Note:
# 2nd-order terms based on the transformed variables will also
# have all values in [0,1].)

logreg.fit6 <- multinom(as.factor(y) ~ x.1 + x.2 + x.8, data=transtrn)
summary(logreg.fit6, Wald=T)

# A horribly large AIC!  Let's check out the pertinent chi-square test.

compare <- anova(logreg.fit6, logreg.fit1)
compare

# There's strong evidence that at least one of the coefficients of
# the variables not included in the smaller model is nonzero.  
 
# Let's see how well the fitted model predicts the classes for the 
# test set.  
 
pred.logreg <- predict(logreg.fit6, newdata=transtst, type="class")
logreg.rate <-  mean( pred.logreg != transtst[,1] )
logreg.rate

# Even though the smaller model has a much larger value for the AIC,
# and the chi-square test pertaining to the model differences is highly
# significant (indicating too many variables were deleted from the
# smaller model), the smaller actually predicts the test set classes 
# a tad better (51.3% errors vs. 51.9% errors from the larger model).

# One might wonder why I didn't use a stepwise approach to select
# variables for a reduced model.  Well, neither the stepclass f'n
# or the stepAIC f'n seems to work with the multinom f'n. 


# Here's a plot summarizing the results so far.

aic <- c(903,1085,1148,922,832,1440)
test.error <- c(51.9,54.8,49.8,47.4,50.2,51.2)
plot(aic,test.error,col="cornflowerblue",pch=19,
     main="Prediction Error vs. AIC",xlab="AIC",ylab="Prediction Error")

#####


# Now I want to focus on classifiers for the two-class problem of
# classifying just the Class 2 and Class 3 cases.

# Just in case I do something involving random numbers, I'll reset
# the random number seed (so that if things are added above, they
# won't affect what's below).

set.seed(12)

# First I'll create some pertinent data sets from the original
# variables (linearly transformed to belong to [0,1]) and from 
# the canonical varibles.

ind2 <- 2 + 11*c(0:47)
ind3 <- 3 + 11*c(0:47)
index <- c(ind2,ind3)
transtrn.small <- transtrn[index,]
canvartrn.small <- canvartrn[index,]

ind2 <- 2 + 11*c(0:41)
ind3 <- 3 + 11*c(0:41)
index <- c(ind2,ind3)
transtst.small <- transtst[index,]
canvartst.small <- canvartst[index,]


# I'm going to use the glm function to do logistic regression.

fit1 <- glm(as.factor(y) ~ ., data=transtrn.small, family=binomial)
summary(fit1)

# I'll use backwards elimination to search for a good model.
# I find it interesting that x.2, which was found to be an 
# important predictor previously (in stepwise LDA and QDA
# when all 11 classes were considered) yields the largest
# p-value, and so is the first variable eliminated.

fit2 <- glm(as.factor(y) ~ ., data=transtrn.small[,-3], family=binomial)
summary(fit2)

# The AIC went down.  Now x.7 should be eliminated.  But 
# before doing that, let's check to see if a chi-square test
# (an asymptotic likelihood ratio test) yields a p-value
# consistent with the one from the Wald test about the
# coefficient of x.2 from the previous logistic regression
# output.  Unfortunately, the anova f'n doesn't supply us
# with the pertinent p-value (like it does in a lot of cases),
# and so we have to produce one using a value from the outputted
# object, as shown below.

compare <- anova(fit2, fit1)
compare
p.value <- 1 - pchisq( compare$Dev[2], df=1 )
p.value

# We got 0.8796 from the Wald test, and here we get 0.8784.
# So pretty close.  (It's generally thought to be better to 
# use chi-square tests, but it's a lot of trouble.  For step
# 1 in the backwards elimination strategy, I'd have to fit a 
# total of 11 logistic regression models to get the 10 p-values 
# I want, whereas I get all 10 Wald test p-values from a single
# logistic regression fit.  So I'll just rely on Wald test p-values
# for now.)

fit3 <- glm(as.factor(y) ~ ., data=transtrn.small[,-c(3,8)], family=binomial)
summary(fit3)

# Now I'll remove x.5.

fit4 <- glm(as.factor(y) ~ ., data=transtrn.small[,-c(3,6,8)], family=binomial)
summary(fit4)

# Now I'll remove x.1. (I'm surprised that x.1 seems expendable since it has 
# showed up as a strong predictor previously.)

fit5 <- glm(as.factor(y) ~ ., data=transtrn.small[,-c(2,3,6,8)], family=binomial)
summary(fit5)

# Now I'll remove x.9.

fit6 <- glm(as.factor(y) ~ ., data=transtrn.small[,-c(2,3,6,8,10)], family=binomial)
summary(fit6)

# Let's see if the sequence of models found by the backwards elimination ploy
# results in a sequence of decreasing misclassification rates on the test data.
# (Note: type="class" didn't work when I gave the predict f'n the glm object.
# So I use type="response" which returns probabilities (for class 3).  I can
# round the probabilities to get a vector of 0s and 1s which can be compared
# to a vector of the true classes with 2 subtracted (to convert the 2s and 3s 
# to 0s and 1s).  Since AIC decreased as the models were made smaller, I'll
# plot the test error rates against AIC.  If smaller AIC corresponds to better
# prediction accuracy (and so larger AIC would correspond to worse accuracy)
# we should see an increasing trend on the plot (the larger the AIC, the larger
# the error rate).

error <- rep(0,6)
pred <- predict(fit1, newdata=transtst.small, type="response")
error[1] <-  mean( round(pred) != (transtst.small[,1]-2) )
pred <- predict(fit2, newdata=transtst.small, type="response")
error[2] <-  mean( round(pred) != (transtst.small[,1]-2) )
pred <- predict(fit3, newdata=transtst.small, type="response")
error[3] <-  mean( round(pred) != (transtst.small[,1]-2) )
pred <- predict(fit4, newdata=transtst.small, type="response")
error[4] <-  mean( round(pred) != (transtst.small[,1]-2) )
pred <- predict(fit5, newdata=transtst.small, type="response")
error[5] <-  mean( round(pred) != (transtst.small[,1]-2) )
pred <- predict(fit6, newdata=transtst.small, type="response")
error[6] <-  mean( round(pred) != (transtst.small[,1]-2) )

AIC <- c(fit1$aic, fit2$aic, fit3$aic, fit4$aic, fit5$aic, fit6$aic)
plot(AIC, error, type="l", col="limegreen", main="Generalization Error vs. AIC",
  sub="Classification of 2nd and 3rd Classes Using Original Variables", xlab="AIC", ylab="Estimated Error Rate")
results <- cbind(AIC,error)
results

# In this case, once again, AIC doesn't do a good 
# job of identifying the best performing models.

#####

# We can use the stepAIC f'n with a glm object 
# (from the fit based on all of the variables)
# to do stepwise logistic regression, deleting
# and possibly adding back in variables in a 
# step-by-step manner in an attempt to minimize
# the AIC.

fit.sw <- stepAIC(fit1,
    scope = list(upper = ~., lower = ~1),
    trace = FALSE)
fit.sw$anova

# This stepwise tactic results in the same final model
# I obtained above using backwards elimination based
# on Wald test p-values, but the sequence of models is
# slightly different.  Here the variables were removed
# in the order
#  x.2, x.7, x.1, x.5, x.9
# and before they were removed in the order
#  x.2, x.7, x.5, x.1, x.9.


#####

# Now let's try using the canonical variables.

cvfit1 <- glm(as.factor(y) ~ ., data=canvartrn.small, family=binomial)
summary(cvfit1)

# I'll remove cv5.

cvfit2 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-6], family=binomial)
summary(cvfit2)

# I'll remove cv3.

cvfit3 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(4,6)], family=binomial)
summary(cvfit3)

# I'll remove cv9.

cvfit4 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(4,6,10)], family=binomial)
summary(cvfit4)

# Even though there are no more large p-values, instead of
# stopping I'll keep going in order to more thoroughly explore
# the relationship between AIC and prediction accurcay.  I'll 
# remove cv7.

cvfit5 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(4,6,8,10)], family=binomial)
summary(cvfit5)

# I'll remove cv1.

cvfit6 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,6,8,10)], family=binomial)
summary(cvfit6)

# I'll remove cv10.

cvfit7 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,6,8,10,11)], family=binomial)
summary(cvfit7)

# I'll remove cv4.

cvfit8 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,5,6,8,10,11)], family=binomial)
summary(cvfit8)

# I'll remove cv6.

cvfit9 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,5,6,7,8,10,11)], family=binomial)
summary(cvfit9)

# I'll remove cv8.

cvfit10 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,5,6,7,8,9,10,11)], family=binomial)
summary(cvfit10)

# Now let's look at a plot of test set error against AIC.

error <- rep(0,10)
pred1 <- predict(cvfit1, newdata=canvartst.small, type="response")
error[1] <-  mean( round(pred1) != (canvartst.small[,1]-2) )
pred2 <- predict(cvfit2, newdata=canvartst.small, type="response")
error[2] <-  mean( round(pred2) != (canvartst.small[,1]-2) )
pred3 <- predict(cvfit3, newdata=canvartst.small, type="response")
error[3] <-  mean( round(pred3) != (canvartst.small[,1]-2) )
pred4 <- predict(cvfit4, newdata=canvartst.small, type="response")
error[4] <-  mean( round(pred4) != (canvartst.small[,1]-2) )
pred5 <- predict(cvfit5, newdata=canvartst.small, type="response")
error[5] <-  mean( round(pred5) != (canvartst.small[,1]-2) )
pred6 <- predict(cvfit6, newdata=canvartst.small, type="response")
error[6] <-  mean( round(pred6) != (canvartst.small[,1]-2) )
pred7 <- predict(cvfit7, newdata=canvartst.small, type="response")
error[7] <-  mean( round(pred7) != (canvartst.small[,1]-2) )
pred8 <- predict(cvfit8, newdata=canvartst.small, type="response")
error[8] <-  mean( round(pred8) != (canvartst.small[,1]-2) )
pred9 <- predict(cvfit9, newdata=canvartst.small, type="response")
error[9] <-  mean( round(pred9) != (canvartst.small[,1]-2) )
pred10 <- predict(cvfit10, newdata=canvartst.small, type="response")
error[10] <-  mean( round(pred10) != (canvartst.small[,1]-2) )

AIC <- c(cvfit1$aic,cvfit2$aic,cvfit3$aic,cvfit4$aic,cvfit5$aic,cvfit6$aic,
         cvfit7$aic,cvfit8$aic,cvfit9$aic,cvfit10$aic)
plot(AIC, error, type="l", col="darkorange", main="Generalization Error vs. AIC",
  sub="Classification of 2nd and 3rd Classes Using Canonical Variables (from all 11 classes)", 
  xlab="AIC", ylab="Estimated Error Rate",xlim=c(45,124),ylim=c(0.075,0.345))
results <- cbind(AIC,error)
results

# As the AIC initially decreased the test error did not.  
# The p-values indicated I should have stopped after the
# fourth model (with three variables removed), but when
# I kept going, even though the AIC got larger, I obtained
# models which resulted in appreciably lower test error rates.

#####

# Now I want to look at what happens if we start with 10
# canonical variables and produce a sequence of simpler
# models by removing one variable at a time, starting with
# cv10, then cv9, then cv8, etc.

cvfit11 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-11], family=binomial)
cvfit12 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(10,11)], family=binomial)
cvfit13 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(9,10,11)], family=binomial)
cvfit14 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(8,9,10,11)], family=binomial)
cvfit15 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(7,8,9,10,11)], family=binomial)
cvfit16 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(6,7,8,9,10,11)], family=binomial)
cvfit17 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(5,6,7,8,9,10,11)], family=binomial)
cvfit18 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(4,5,6,7,8,9,10,11)], family=binomial)
cvfit19 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(3,4,5,6,7,8,9,10,11)], family=binomial)

newerror <- rep(0,10)
newerror[1] <- error[1]
pred11 <- predict(cvfit11, newdata=canvartst.small, type="response")
newerror[2] <-  mean( round(pred11) != (canvartst.small[,1]-2) )
pred12 <- predict(cvfit12, newdata=canvartst.small, type="response")
newerror[3] <-  mean( round(pred12) != (canvartst.small[,1]-2) )
pred13 <- predict(cvfit13, newdata=canvartst.small, type="response")
newerror[4] <-  mean( round(pred13) != (canvartst.small[,1]-2) )
pred14 <- predict(cvfit14, newdata=canvartst.small, type="response")
newerror[5] <-  mean( round(pred14) != (canvartst.small[,1]-2) )
pred15 <- predict(cvfit15, newdata=canvartst.small, type="response")
newerror[6] <-  mean( round(pred15) != (canvartst.small[,1]-2) )
pred16 <- predict(cvfit16, newdata=canvartst.small, type="response")
newerror[7] <-  mean( round(pred16) != (canvartst.small[,1]-2) )
pred17 <- predict(cvfit17, newdata=canvartst.small, type="response")
newerror[8] <-  mean( round(pred17) != (canvartst.small[,1]-2) )
pred18 <- predict(cvfit18, newdata=canvartst.small, type="response")
newerror[9] <-  mean( round(pred18) != (canvartst.small[,1]-2) )
pred19 <- predict(cvfit19, newdata=canvartst.small, type="response")
newerror[10] <-  mean( round(pred19) != (canvartst.small[,1]-2) )

newAIC <- c(cvfit1$aic,cvfit11$aic,cvfit12$aic,cvfit13$aic,cvfit14$aic,cvfit15$aic,cvfit16$aic,
         cvfit17$aic,cvfit18$aic,cvfit19$aic)
points(newAIC, newerror, type="l", col="limegreen")

newresults <- cbind(newAIC,newerror)
newresults

# The best performing model is the one using cv1 through cv4,
# but it's not clear how one would be led to that model from
# the training data.  (Note: The best performing model has an
# AIC about 50% larger than the smallest AIC, but makes less 
# than 1/3 the errors that the model with the smallest AIC made.)

#####

# Finally I'll use the stepAIC function to perform stepwise 
# logistic regression using the canonical variables.  (It 
# will delete and possibly add back in deleted variables 
# as it decreases the AIC.) 

cvfit.sw <- stepAIC(cvfit1,
    scope = list(upper = ~., lower = ~1),
    trace = FALSE)
cvfit.sw$anova

# The sequence of four models indicated above are the same as 
# the first four in the sequence obtained using backwards 
# elimination based on p-values.  I'll add this "path" to 
# the plot.  (To better see the small stepwise path, I suggest
# looking at a full screen version of the plot.)

points(AIC[1:4], error[1:4], type="l", lty="dotted", col="darkslateblue")
legend(45,0.35,legend=c("back. elim. (p-values)","canonical variable order","stepwise (AIC)"),
       fill=c("darkorange","limegreen","darkslateblue"))

# It should be noted that the canonical variables used above were
# based on the data from all 11 classes (and by design attempt to
# best separate all 11 classes).  They aren't ideal for classifying
# just the two classes at hand.  (Note: For just the two classes,
# just a single discriminant function from LDA takes the place of the
# canonical variables (which are for when you want a dimension less
# than the number of classes minus 1 (see top of p. 114 of HTF2)).)
# The main purpose of the exercise above was to obtain more sequences 
# of models in order to explore whether or not minimizing the AIC is a 
# good way to identify a good predicting model.  With this data, using 
# AIC doesn't seem to work well, but we should try other data sets before
# jumping to a firm conclusion.


#####  End  #####