# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2.

      # Note: Developed with vesion 2.9.0.
      # Note: I designed this so that it'll work well to paste the commands into R in chunks.
          #   Each time, go down to the next line with just ##### on it and paste all of the 
          #   commands in at one time.  Then after the commands are executed, look at the 
          #   comments and output and look at the graph before pasting in the next chunk of
          #   commands.  (If you paste everything in all at once you won't have much time to 
          #   look at the graphics.)


# First I'll load some libraries that may be needed.

library(class)
library(MASS)
library(e1071) 
library(kknn)
library(klaR)


# I'll also set the random number seed since the c-v routines use random numbers
# and having a fixed seed tends to make trouble-shooting easier.

set.seed(7)


# Next I'll read in the training data.

fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T)

# Since I don't see a real need for the ID variable in the first column, I'll delete it.

trndat <- fulltrndat[,-1]

# Let's look at the data, just using first two predictor variables.

ind1 <- 1 + 11*c(0:47)
ind2 <- 2 + 11*c(0:47)
ind3 <- 3 + 11*c(0:47)
ind4 <- 4 + 11*c(0:47)
ind5 <- 5 + 11*c(0:47)
ind6 <- 6 + 11*c(0:47)
ind7 <- 7 + 11*c(0:47)
ind8 <- 8 + 11*c(0:47)
ind9 <- 9 + 11*c(0:47)
ind10 <- 10 + 11*c(0:47)
ind11 <- 11*c(1:48)

plot(trndat[ind1,2],trndat[ind1,3],col="violet",main='2-D View',xlab='x.1',ylab='x.2',
  xlim=c(-6,-0.5),ylim=c(-2,5.75))
points(trndat[ind2,2], trndat[ind2,3], col="cyan")
points(trndat[ind3,2], trndat[ind3,3], col="darkorange")
points(trndat[ind4,2], trndat[ind4,3], col="olivedrab")
points(trndat[ind5,2], trndat[ind5,3], col="dodgerblue")
points(trndat[ind6,2], trndat[ind6,3], col="deeppink")
points(trndat[ind7,2], trndat[ind7,3], col="sienna")
points(trndat[ind8,2], trndat[ind8,3], col="purple3")
points(trndat[ind9,2], trndat[ind9,3], col="yellow")
points(trndat[ind10,2], trndat[ind10,3], col="chartreuse")
points(trndat[ind11,2], trndat[ind11,3], col="darkslategray")
legend(-1.2,6,legend=c("1","2","3","4","5","6","7","8","9","10","11"),
  fill=c("violet","cyan","darkorange","olivedrab","dodgerblue","deeppink",
  "sienna","purple3","yellow","chartreuse","darkslategray"))

# Based on this plot, one might guess that curved decision boundaries may
# be better than linear ones.

#####

# To get another 2-d view, let's plot the 1st and 2nd principal components.
# (I was trying to get something like Fig. 4.4 on p. 107 of HTF2, but they  
# used some other variables to get their 2-d plot (see p. 114 of HTF2).)

pc.trn <- princomp(~ ., data=trndat[,-1], cor=T)
summary(pc.trn)
plot(pc.trn)
loadings(pc.trn)

pcomp.trn <- data.frame( cbind(trndat[,1],pc.trn$score) )
names(pcomp.trn) <- c("y","pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9","pc10")
 
#####

plot(pcomp.trn[ind1,2],pcomp.trn[ind1,3],col="violet",main='2-D View',xlab='pc1',ylab='pc2',
  xlim=c(-4.5,5.5),ylim=c(-3.9,4.4))
points(pcomp.trn[ind2,2], pcomp.trn[ind2,3], col="cyan")
points(pcomp.trn[ind3,2], pcomp.trn[ind3,3], col="darkorange")
points(pcomp.trn[ind4,2], pcomp.trn[ind4,3], col="olivedrab")
points(pcomp.trn[ind5,2], pcomp.trn[ind5,3], col="dodgerblue")
points(pcomp.trn[ind6,2], pcomp.trn[ind6,3], col="deeppink")
points(pcomp.trn[ind7,2], pcomp.trn[ind7,3], col="sienna")
points(pcomp.trn[ind8,2], pcomp.trn[ind8,3], col="purple3")
points(pcomp.trn[ind9,2], pcomp.trn[ind9,3], col="yellow")
points(pcomp.trn[ind10,2], pcomp.trn[ind10,3], col="chartreuse")
points(pcomp.trn[ind11,2], pcomp.trn[ind11,3], col="darkslategray")
legend(4,4.65,legend=c("1","2","3","4","5","6","7","8","9","10","11"),
  fill=c("violet","cyan","darkorange","olivedrab","dodgerblue","deeppink",
  "sienna","purple3","yellow","chartreuse","darkslategray"))


# There appears to be a lot of overlap.  A similar plot of the 1st and 3rd pcs didn't look any better.

#####

# First let's try LDA (after getting the test set ready).  

fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T)
tstdat <- fulltstdat[,-1]

trnlda <- lda(y ~ . , trndat)
pred.trnlda <- predict(trnlda, newdata=tstdat)
lda.rate <- mean( pred.trnlda$class != tstdat[,1] )
lda.rate

# From the description of the datafound on the authors' web site, it seems
# like the best neural network classifiers get about 45% wrong and that 
# using one nearest neighbor gets about 44% wrong.  So LDA didn't do that badly.

# Since a plot of just the first two variables looked somewhat promising, I'm
# curious as to what will happen if just they are used.

trnlda <- lda(y ~ . , trndat[,1:3])
pred.trnlda <- predict(trnlda, newdata=tstdat)
lda.rate <- mean( pred.trnlda$class != tstdat[,1] )
lda.rate

# Better!  But perhaps other variables would do better still.
# Let's see what stepwise LDA does.

trnswlda <- stepclass(y ~ ., data=trndat, method="lda")
trnswlda

# It picks the first two variables (the same ones I had tried).
# The output indicates a c-v estimate of the error rate is about 
# 54%.  But we know from before the test data resulted in only
# about 48% errors.

#####

# Now I'll try stepwise LDA with the principal components.

trnswldapc <- stepclass(y ~ ., data=pcomp.trn, method="lda")
trnswldapc

# The indication is that using just the 2nd, 4th, and 7th pcs
# is best.  Let's see how that does.  I need to first set up
# the test data for principal components.

pcomp.tst <- predict(pc.trn, newdata=tstdat[,-1])
pcomp.tst <- data.frame( cbind(tstdat[,1],pcomp.tst) )
names(pcomp.tst) <- c("y","pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9","pc10")

trnswldapc <- lda(y ~ pc2 + pc4 + pc7, pcomp.trn)
pred.trnlda <- predict(trnswldapc, newdata=pcomp.tst)
lda.rate <- mean( pred.trnlda$class != tstdat[,1] )
lda.rate

# A big disappointment.

#####

# Let's try QDA.

trnqda <- qda(y ~ . , trndat)
pred.trnqda <- predict(trnqda, newdata=tstdat)
qda.rate <- mean( pred.trnqda$class != tstdat[,1] )
qda.rate

# (Note: Both my LDA and QDA rates match those given for the test data in
# Table 4.1 on p. 107 of HTF2.)

# Since a plot of just the first two variables looked somewhat promising, 
# and these variables were choen using stepwise LDA, I'm curious as to what 
# will happen if just they are used with QDA.

trnqda <- qda(y ~ . , trndat[,1:3])
pred.trnqda <- predict(trnqda, newdata=tstdat)
qda.rate <- mean( pred.trnqda$class != tstdat[,1] )
qda.rate

# A tad better.  Perhaps other variables would do better still.
# Let's see what stepwise QDA does.

trnswqda <- stepclass(y ~ ., data=trndat, method="qda")
trnswqda

# The indication is that we may get great results using 
# QDA with just the 1st, 2nd, 3rd, 8th, and 9th predictors.
# Let's see is this works well for predicting the classes
# for the test set.

trnqda <- qda(y ~ x.1 + x.2 + x.3  + x.8 + x.9, trndat)
pred.trnqda <- predict(trnqda, newdata=tstdat)
qda.rate <- mean( pred.trnqda$class != tstdat[,1] )
qda.rate

# The c-v indication of only about 19% errors was way too high.

# I'm curious as to what will happen if I just use three
# of the predictors.

trnqda <- qda(y ~ x.1 + x.2 + x.8, trndat)
pred.trnqda <- predict(trnqda, newdata=tstdat)
qda.rate <- mean( pred.trnqda$class != tstdat[,1] )
qda.rate

# Not as good.

# By accident (really) I stumbled upon a way to get a 44%
# error rate (which is great for this data set).

trnqda <- qda(y ~ x.1 + x.2 + x.3 + x.4 + x.8, trndat)
pred.trnqda <- predict(trnqda, newdata=tstdat)
qda.rate <- mean( pred.trnqda$class != tstdat[,1] )
qda.rate

# This is the best result so far (but it's not really legit ...
# since if I didn't have the test set to confirm that this
# works best, I wouldn't know or suspect that it does).

#####

# Let's try RDA.  I'll try it four times in a row to see
# how stable the fitting of the parameters seems to be.
# (The c-v divisions will be different each time.) 

trnrda <- rda(y~., trndat, crossval=TRUE, fold=10)
trnrda

pred.trnrda <- predict(trnrda, newdata=tstdat)
rda.rate <- mean( pred.trnrda$class != tstdat[,1] )
rda.rate

# The first parameters (both lambda and gamma are near 0)
# result in RDA being similar to QDA.  I tried it three
# more times and got very similar results each time.

# Note: With the parametrization on p. 112 of HTF2, both parameters
# being 0 makes RDA equivalent to DDA (diagonal discriminant analysis),
# but the R function is different, and the two parameters being 0 makes 
# it equivalent to QDA.  But with this data, having the parameters not 
# exactly 0 makes RDA perform better than both QDA and LDA on the test   
# set.  Using all of the variables,
#      0.556 is the test error for LDA,
#      0.528 is the test error for QDA,
#   &  0.487 is the test error for RDA.

# So now I'll try it using just the variables that stepwise LDA
# found to be the most useful (since stepwise LDA did better than
# stepwise QDA).

trnrda <- rda(y~., trndat[,1:3], crossval=TRUE, fold=10)
trnrda

pred.trnrda <- predict(trnrda, newdata=tstdat)
rda.rate <- mean( pred.trnrda$class != tstdat[,1] )
rda.rate

# I'll now try it with the top two variables from both LDA and QDA
# (x.1 and x.2), along with the 3rd variable chosen for QDA (x.8).

trnrda <- rda(y~x.1+x.2+x.8, trndat, crossval=TRUE, fold=10)
trnrda

pred.trnrda <- predict(trnrda, newdata=tstdat)
rda.rate <- mean( pred.trnrda$class != tstdat[,1] )
rda.rate

# Neither of these subsets worked better than using all of the variables.

# I decided to see if the rda function would work with the stepclass function.
# It did, but it took a *very* long time (due to nested cross-validation).  So
# I won't do it again here.  Instead I'll report that the variables selected, 
# in order, were
#     x.2, x.1, x.8, x.3, and x.4.
# Below, I'll use the rda function with these variables.

trnrda <- rda(y~x.1+x.2+x.3+x.4+x.8, trndat, crossval=TRUE, fold=10)
trnrda

pred.trnrda <- predict(trnrda, newdata=tstdat)
rda.rate <- mean( pred.trnrda$class != tstdat[,1] )
rda.rate

# A great result!  Below is a summary of the results using the variables
# indicated by stepclass, for LDA, QDA, and RDA:
#      0.481 is the test error for LDA,
#      0.498 is the test error for QDA,
#      0.448 is the test error for RDA.

# Note: With the variables selected by stepclass for RDA, on another run of the 
# rda function I got a test error of only 0.392.  (Since rda uses c-v, the results
# can differ due to a different starting point in the random number sequence.)
# I think the parameter values that led to the 0.392 error rate were about 0.09 and 
# 0.01, so I'll fix these values and try the rda function again.

trnrda <- rda(y~x.1+x.2+x.3+x.4+x.8, trndat, gamma=0.09, lambda=0.01, fold=10)
trnrda

pred.trnrda <- predict(trnrda, newdata=tstdat)
rda.rate <- mean( pred.trnrda$class != tstdat[,1] )
rda.rate

# This is the best result so far.  Once again we see that it's not wise to just run
# rda one time, since c-v may lead to parameter values which can be appreciably
# improved upon.

#####

# Now let's try using nearest neighbors classification.  I'll use
# c-v to select a value of k for KNN.

cvknn <- train.kknn(as.factor(y) ~ ., trndat, kmax=25, kernel="rectangular", distance=2) 

cvknn$MISCLASS
summary(cvknn)

plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass") 

#####

# So let's see how using just one nearest neighbor does on the test data.

pred.1nn <- knn( trndat[,-1], tstdat[,-1], cl=trndat[,1], k=1 )
onenn.rate <-  mean( pred.1nn != tstdat[,1] )
onenn.rate

# Except for that one rda result (that took a lot of time to obtain, since  
# using the stepclass function with the rda function took a long time, and 
# even after stepclass indicated which variables to use, it may take several
# runs of the rda function to find good parameter values), this is the best 
# result so far.

# Let's check to see if the c-v routine actually found the best value for k.

knn.rate <- numeric(25)
for (i in 1:25)
 {
  pred.knn <- knn(trndat[,-1], tstdat[,-1], cl=trndat[,1], k=i)
  knn.rate[i] <- mean(pred.knn != tstdat[,1])
 }
points(c(1:25),knn.rate,type="l",col="sienna")
legend(1,0.4,legend=c("C-V","Observed"),fill=c("blue","sienna"))

# Although some of the numerical values of the c-v estimates or error are way off 
# (too low for the smaller values of k), this time c-v did indicate a decent value 
# to use for k.

# One might wonder why one nearest neighbor appears to work so much better based on
# the training data.  Well, data was collected from 8 subjects for the training set,
# and 7 different subjects for the test set.  With the training data, the nearest 
# neighbor to a case might be another case from the same person.  (Looking at plots 
# of the data does suggest clumping ... possibly due to data from the same subject.)
# When the test points are classified using neighbors from the training set, none of 
# the neighbors can be cases from the same subject, and I suspect the nearest neighbors
# are farther away than when classifying training cases using the training data in a c-v 
# scheme.  All in all, it's harder to correctly classify a test case using the training
# data than it is to classify a training case using the training data.

##### 

# Now I'll try nearest neighbors just using x.1 and x.2.

cvknn <- train.kknn(as.factor(y) ~ x.1 + x.2, trndat, kmax=75, kernel="rectangular", distance=2) 

cvknn$MISCLASS
summary(cvknn)

plot(c(1:75), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass",
  ylim=c(0.3,0.6)) 

# Looks worse than before ... but could be better if the c-v estimates of error are accurate.

#####

# So let's see how using seven nearest neighbors does on the test data.

pred.1nn <- knn( trndat[,2:3], tstdat[,2:3], cl=trndat[,1], k=7 )
sevennn.rate <-  mean( pred.1nn != tstdat[,1] )
sevennn.rate

# No good.

# I'm really interested in how some sort of variable selection can be 
# using nearest neighbors.  Ch. 13 of HTF2 covers some methods related
# to this idea.

# Let's see if c-v pointed to the best value of k.

knn.rate <- numeric(75)
for (i in 1:75)
 {
  pred.knn <- knn(trndat[,2:3], tstdat[,2:3], cl=trndat[,1], k=i)
  knn.rate[i] <- mean(pred.knn != tstdat[,1])
 }
points(c(1:75),knn.rate,type="l",col="sienna")
legend(1,0.525,legend=c("Observed","C-V"),fill=c("sienna","blue"))

# We can see that larger values for k do better than 7, but none of the 
# possibilities considered result in an error rate less than 50%.


# Once again, by accident, I stumbled upon a better error rate.
# If 7 nearest neighbors are used with all of the data, the error
# rate drops below 40%.

pred.1nn <- knn( trndat[,-1], tstdat[,-1], cl=trndat[,1], k=7 )
sevennn.rate <-  mean( pred.1nn != tstdat[,1] )
sevennn.rate


# Summary of best test sample error rates:
#
#   0.481  (LDA using just x.1 and x.2 (var's sel. by c-v)),
#   0.498  (QDA using just x.1, x.2, x.3, x.8, and x.9 (var's sel. by c-v)),
#   0.442  (QDA using just x.1, x.2, x.3, x.4, and x.8 (found by accident)),
#   0.487  (RDA using all variables),
#   0.498  (RDA using just x.1 and x.2 (Var's sel. by c-v)),
#   0.448  (RDA using just x.1, x.2, x.3, x.4, and x.8 (var's sel. by lengthy c-v)),  ***
#   0.392  (RDA using just x.1, x.2, x.3, x.4, and x.8 (var's sel. by lengthy c-v)),  ***
#   0.437  (1NN using all variables (k found using c-v)),
#   0.394  (7NN using all variables (k found by accident)).
#
# (Note: The two RDA results marked with *** differ due to different c-v folds, due 
# to different random number streams.  Although the values differ, both are good.)


##### THE END (for now) #####