# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2.

      # Note: Developed with vesion 2.9.0.
      # Note: I designed this so that it'll work well to paste the commands into R in chunks.
          #   Each time, go down to the next line with just ##### on it and paste all of the 
          #   commands in at one time.  Then after the commands are executed, look at the 
          #   comments and output and look at the graph before pasting in the next chunk of
          #   commands.  (If you paste everything in all at once you won't have much time to 
          #   look at the graphics.)


# First I'll load some libraries that may be needed.

library(class)
library(MASS)
library(e1071) 
library(kknn)
library(klaR)


# I'll also set the random number seed since the c-v routines use random numbers
# and having a fixed seed tends to make trouble-shooting easier.

set.seed(7)


# Next I'll read in the training data.

fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T)

# Since I don't see a real need for the ID variable in the first column, I'll delete it
# and then print out the first 25 rows of the data set.

trndat <- fulltrndat[,-1]
trndat[1:25,]

# Instead of starting with all 11 classes, I'll first work with a reduced data set just 
# using the cases for the first three classes.  Since every 11th case is a class 1 case,
# we can pick them out using the following index.

ind1 <- 1 + 11*c(0:47)

# The next two indices will pick out the class 2 and class 3 cases, and the last index will  
# pick out all of the class 1, class 2, and class 3 cases.

ind2 <- 2 + 11*c(0:47)
ind3 <- 3 + 11*c(0:47)
index <- c(ind1,ind2,ind3)
index

# Now I'll use the index to create the desired reduced data set, having 48 class 1 
# cases followed by 48 class 2 cases, followed by 48 class 3 cases.

trndat <- trndat[index,]
trndat

# Let's look at the data, just using first two predictor variables.

plot(trndat[1:48,2], trndat[1:48,3], col="darkslategray",main='2-D View of Classes 1-3',
  xlab='x.1',ylab='x.2')
points(trndat[49:96,2], trndat[49:96,3], col="darkorange")
points(trndat[97:144,2], trndat[97:144,3], col="darkcyan")
legend(-2.5,2.25,legend=c("Class 1","Class 2","Class 3"),fill=c("darkslategray","darkorange","darkcyan"))

# Class 1 seems easily separable from classes 2 and 3 if we can get a curved boundary from a 
# classifier, but using just these two variables it's not clear that Classes 2 and 3 will be 
# as easy to separate.

# To make things more interesting while playing with just two classes for a while, let's focus 
# on classes 2 and 3.  (I'll further reduce the data set to now just have the class 2 and 3 cases.)
# I'll also create the corresponding test set for these classes.

trndat <- trndat[49:144,]

fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T)
tstdat <- fulltstdat[,-1]
ind2 <- 2 + 11*c(0:41)
ind3 <- 3 + 11*c(0:41)
index <- c(ind2,ind3)
tstdat <- tstdat[index,]

#####

# First let's try LDA.  (Note: With just two classes, and equal sample sizes, lines 2-6 on
# p. 110 of HTF2 indicate that the regression scheme considered in Ch. 2 (and again in Sec.
# 4.2) is equivalent to LDA.)

trnlda <- lda(y ~ . , trndat)
trnlda

pred.trnlda <- predict(trnlda, newdata=tstdat)
lda.rate <- mean( pred.trnlda$class != tstdat[,1] )
lda.rate

# Since we have less than 5 cases per variable for each class,
# it could be that we're trying to estimate too much with too 
# little and performance is suffering.  Let's see what happens
# if we just use two principal component variables as predictors.

pc.trn <- princomp(~ ., data=trndat[,-1], cor=T)
summary(pc.trn)
plot(pc.trn)
loadings(pc.trn)
 
# The p.c. variable values from the training data are in
#      pc.trn$scores  
# (we could also get them using predict).  Based on  
# the plot, I'll just keep the first three of the 
# new variables.  I'll bind these together with
# the class variable.

pcomp.trn <- data.frame( cbind(trndat[,1],pc.trn$score[,1:3]) )
names(pcomp.trn) <- c("y","pc1","pc2","pc3")

#####

# Now I'll try LDA using these variables as predictors.

trnldapc <- lda(y ~ . , pcomp.trn)
trnldapc

# To see what performance we get with the test set, the 
# test set variables have to be altered the same way
# (i.e., using the loadings determined from the training
# data).

pcomp.tst <- predict(pc.trn, newdata=tstdat[,-1])
pcomp.tst <- data.frame( cbind(tstdat[,1],pcomp.tst[,1:3]) )
names(pcomp.tst) <- c("y","pc1","pc2","pc3")

pred.trnldapc <- predict(trnldapc, newdata=pcomp.tst)
ldapc.rate <- mean( pred.trnldapc$class != tstdat[,1] )
ldapc.rate

# This  made things worse!  Nevertheless, I want to look at a scatterplot.

plot(pcomp.tst[1:48,2], pcomp.tst[1:48,4], col="darkorange",main='2-D View of Classes 2 & 3',
  xlab='pc1',ylab='pc3')
points(pcomp.tst[49:96,2], pcomp.tst[49:96,4], col="darkcyan")
legend(-4,-1.5,legend=c("Class 2","Class 3"),fill=c("darkorange","darkcyan"))

# We have decent separation of the two classes.  
# The trouble is that a linear boundary isn't good!

#####

# I'll use 2nd-order variables as additional predictors.  
# I'll just use the 1st and 3rd principal component
# variables as a starting point.

pc11 <- pcomp.trn[,2]^2
pc33 <- pcomp.trn[,4]^2
pc13 <- pcomp.trn[,2]*pcomp.trn[,4]
pc2.trn <- data.frame(cbind(pcomp.trn,pc11,pc33,pc13))

# Note: I've got a 1st-order term for the 2nd pc, but
# no 2nd-order terms involving the 2nd pc. 

# Now I'll fix the test data similarly.

pc11 <- pcomp.tst[,2]^2
pc33 <- pcomp.tst[,4]^2
pc13 <- pcomp.tst[,2]*pcomp.tst[,4]
pc2.tst <- data.frame(cbind(pcomp.tst,pc11,pc33,pc13))

# Now I'll try LDA on this data set.

trnldapc2 <- lda(y ~ . , pc2.trn)
pred.trnldapc2 <- predict(trnldapc2, newdata=pc2.tst)
ldapc2.rate <- mean( pred.trnldapc2$class != tstdat[,1] )
ldapc2.rate

# It helped quite a bit, but the original 10 variables 
# worked better.

#####

# Now I'll try QDA, first with the first three principal components.

trnqdapc <- qda(y ~ . , pcomp.trn)
pred.trnqdapc <- predict(trnqdapc, newdata=pcomp.tst)
qdapc.rate <- mean( pred.trnqdapc$class != tstdat[,1] )
qdapc.rate

# Yuck!  Now I'll just use the 1st and 3rd pcs.

trnqdapc <- qda(y ~ pc1 + pc3 , pcomp.trn)
pred.trnqdapc <- predict(trnqdapc, newdata=pcomp.tst)
qdapc.rate <- mean( pred.trnqdapc$class != tstdat[,1] )
qdapc.rate

# What about the original data?

trnqda <- qda(y ~ . , trndat)
pred.trnqda <- predict(trnqda, newdata=tstdat)
qda.rate <- mean( pred.trnqda$class != tstdat[,1] )
qda.rate

# So far LDA wih the original data has worked best.


# Now I'll try RDA.  The R f'n has a gamma parameter and a lambda 
# parameter which are different from the alpha and gamma parameters
# given on p. 112 of HTF2.  C-v can be used to (hopefully) determine 
# decent values for these parameters.  (Enter ?rda to get the help
# page that describes the R version of RDA.  It seems to me that if
# the predictors aren't standardized, the gamma parameter in the R
# version could do odd things!)

trnrda <- rda(y~., trndat, crossval=TRUE, fold=10)
trnrda

pred.trnrda <- predict(trnrda, newdata=tstdat)
rda.rate <- mean( pred.trnrda$class != tstdat[,1] )
rda.rate

# Wow!  Some of this fancy s**t actually works (we got a lower rate)
# ... or does it?  The first time I tried it I got an error rate of 
# about 9.5% ... about half that of the 19% that was the previous
# champion's rate.  But I didn't have the random number seed fixed
# in advance.  When I tried it again, using a variety of seeds, I got
# rates larger than 30% a bunch of times.  I noticed that the parameter
# values were bouncing around A LOT!  Fortunately I remembered what
# ballpark the values were in that produced the 9.5%.

# Next I'll do the c-v thing again to show how unstable the results are.
# Then I'll set the parameter values close to what I noticed worked well
# on one run.

trnrda <- rda(y~., trndat, crossval=TRUE, fold=10)
trnrda

pred.trnrda <- predict(trnrda, newdata=tstdat)
rda.rate <- mean( pred.trnrda$class != tstdat[,1] )
rda.rate

# C-v choose parameter values that made RDA behave similarly to QDA.

trnrda <- rda(y~., trndat, lambda=0.7, gamma=0.2)
trnrda

pred.trnrda <- predict(trnrda, newdata=tstdat)
rda.rate <- mean( pred.trnrda$class != tstdat[,1] )
rda.rate

# The moral of the story is that you shouldn't always trust c-v to
# find the best rate.  Try it a few times with different seeds.
# If the results vary a lot and the parameter values bounce around
# a lot, it may be worthwhile to experiment with various parameter
# values on your own and search for some good ones.

#####


# Now let's try using nearest neighbors classification.  I'll use
# c-v to select a value of k for KNN.

cvknn <- train.kknn(as.factor(y) ~ ., trndat, kmax=25, kernel="rectangular", distance=2) 

cvknn$MISCLASS
summary(cvknn)

plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", 
  xlab="k", ylab="Misclass") 

# So let's see how using just one nearest neighbor does on the test data.

pred.1nn <- knn( trndat[,-1], tstdat[,-1], cl=trndat[,1], k=1 )
onenn.rate <-  mean( pred.1nn != tstdat[,1] )
onenn.rate

# Not bad (but not quite as good as what I got with RDA)

# Let's check to see if the c-v routine actually found the best value for k.

knn.rate <- numeric(25)
for (i in 1:25)
 {
  pred.knn <- knn(trndat[,-1], tstdat[,-1], cl=trndat[,1], k=i)
  knn.rate[i] <- mean(pred.knn != tstdat[,1])
 }
points(c(1:25),knn.rate,type="l",col="sienna")
legend(1,0.4,legend=c("C-V","Observed"),fill=c("blue","sienna"))

# Although some of the numerical values of the c-v estimates are off 
# (too optimisitc), this time c-v did indicate a decent value to use for k.
# (Note: The c-v estimate are too low.  I think this is largely due to the
# fact that the training sample contains blocks of cases from a small
# number of subjects.  So when c-v is used, a withheld training case can
# be matched to a case in the remaning training data from the same subject.
# When we classify a test set case using the training set cases, none of the
# cases are from the same subject as the test set case, and so the neighbors 
# will generally be farther away (not as similar).)

##### That's all folks! #####