# Classification with the vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2.
#
# Note: Developed with R version 2.9.0.
# Note: I designed this so that it'll work well to paste the commands into R in chunks.
#       Each time, go down to the next line with just ##### on it and paste all of the
#       commands in at one time. Then, after the commands are executed, look at the
#       comments and output, and look at the graph, before pasting in the next chunk of
#       commands. (If you paste everything in all at once, you won't have much time to
#       look at the graphics.)

# I'll load some libraries and set the random number seed.
library(class)
library(MASS)
library(e1071)
library(kknn)
library(klaR)
library(rgl)
set.seed(7)

###########################################################
# Creation of Canonical Variables and Exploration of Data #
# (with emphasis on clusters of observations in training  #
# and test samples)                                       #
###########################################################

# I'll read in the training data and delete the unnecessary case id column.
fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt", sep=',', header=T)
trndat <- fulltrndat[,-1]

# Next I'll scale the data. (Note: With the "toy" example from Ch. 2 that I considered the
# first week, there was no real need to scale it, due to the way it was constructed. But with
# most "real" data sets, scaling makes some methods (e.g., nearest neighbors) work better.
# With other classification methods, scaling doesn't matter ... you get the same result
# either way. With the vowel data, scaling it first will make one of the plots below
# better match what's in HTF2.)
sctrn.vars <- scale(trndat[,-1], center=TRUE, scale=TRUE)
sctrndat <- data.frame( cbind( trndat[,1], sctrn.vars ) )
names(sctrndat) <- names(trndat)

# Now I will compute the canonical variables as described on p. 114 of HTF2. (Note:
# Rather than doing it step by step as indicated there, I can instead make use of some
# objects I create when I do LDA.)
scldafit <- lda(y ~ ., sctrndat)
centroids <- scldafit$means        # the matrix M from p. 114 (based on scaled data)
canvarcent <- centroids %*% scldafit$scaling
# canvarcent is an 11 by 10 matrix of the 11 centroids represented using the canonical variables.

# I'll plot these centroids. (Compare with the large disks shown in Fig. 4.4 on p. 107 of HTF2.)
# (Note: Just the first two canonical variables are being used for the plot.)
colorcycle <- c("black","purple","brown","plum","darkorange","dodgerblue","darkgray","goldenrod",
                "slateblue","tomato","limegreen")
plot(canvarcent[,1], canvarcent[,2], pch=15, col=colorcycle,
     main='2-D View (Canonical Variables)', xlab='1st can var', ylab='2nd can var',
     xlim=c(-4.75,4.75), ylim=c(-6.5,4))

#####

# Now I'll convert the scaled data to canonical variables and plot the individual
# cases on the plot with the centroids.
canvar.trn <- as.matrix(sctrndat[,-1]) %*% scldafit$scaling
pointcolors <- rep(colorcycle, 48)
points(canvar.trn[,1], canvar.trn[,2], col=pointcolors)
# I think I've reproduced the plot on p. 107 of HTF2 (except I chose colors that create
# a bit more contrast, while being similar to the ones in the book).
# Note: Since the plot on p. 107 of HTF2 seems to have two of the classes colored black,
#       one cannot notice that the class I have colored black has 6 points over near the
#       slate blue and tomato red classes, around (2.2, -1.5). (The location of the black
#       centroid (near (-2, -3)) relative to most of the black points suggests that there
#       must be some black points far off to the right of most of the black points.)
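# (Optional, a quick check of that observation: overplot the class 1 training points, the
#  ones I colored black, with filled black circles so they stand out from the other classes.)
points(canvar.trn[trndat$y == 1, 1], canvar.trn[trndat$y == 1, 2], pch=16, col="black")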
# Finally, I'll put the canonical variable representations of the data into a data frame.
canvartrn <- data.frame( cbind( trndat[,1], canvar.trn ) )
names(canvartrn) <- c("y","cv1","cv2","cv3","cv4","cv5","cv6","cv7","cv8","cv9","cv10")

#####

# Now for a 3-d plot, based on the first 3 canonical variables, that can be rotated.
open3d()
spheres3d(canvartrn[,2:4], radius=.15, color=pointcolors)

#####

# Before doing classification, I'd like to further explore the differences between the
# training and test data. For the training data, there are six cases from each of eight
# subjects for each class. The description of the data provided on HTF's web site doesn't
# tell us which cases are from which subject, but I have a hunch that the 1st six cases
# for class 1 are from the first subject, the 2nd six cases for class 1 are from the
# second subject, and so on, and similarly for the other classes.
# Below I will explore the class 4 cases, plotting each subject's cases in a different color.
ind4 <- 4 + 11*c(0:47)
class4 <- canvartrn[ind4,]
plot(class4[1:6,2], class4[1:6,3], col="darkgray", main='Class 4 Training Cases',
     xlab='1st can var', ylab='2nd can var', xlim=c(-4,0), ylim=c(-1,4))
points(class4[7:12,2], class4[7:12,3], col="purple")
points(class4[13:18,2], class4[13:18,3], col="dodgerblue")
points(class4[19:24,2], class4[19:24,3], col="darkorange")
points(class4[25:30,2], class4[25:30,3], col="goldenrod")
points(class4[31:36,2], class4[31:36,3], col="limegreen")
points(class4[37:42,2], class4[37:42,3], col="tomato")
points(class4[43:48,2], class4[43:48,3], col="deeppink")
# Clumping can definitely be seen. When the nearest neighbor is used (in a c-v scheme)
# to classify a training point using the other 527 training cases, the nearest neighbor
# to a point will usually be a point from the same subject and the same class ... and
# thus the correct class will be chosen. This is due to the tightness of the clumps.
# But when a test case is classified using the nearest neighbor from the training set,
# it's impossible for that neighbor to be a case from the same subject. So the nearest
# neighbor will generally be farther away in this case, and there is a greater chance
# of it being from a different class. (It might be better to represent the cases from
# the same subject and the same class as a single centroid, instead of with six separate
# cases. With such centroids, when a (centroid) case from the training data is classified
# using the other training data (centroid) cases, the relationship will be much more like
# it is when a (centroid) case from the test data is classified using the (centroid) cases
# from the training data, and so what is learned from the training data during c-v is more
# pertinent to the classification of the test set cases. Alternatively, if we want a
# classifier for just a single case at a time, it may work a lot better to use a different
# cross-validation scheme in place of the n-fold (leave-one-out) method used by train.kknn:
# one could leave out groups of cases, each corresponding to a different subject, to create,
# in the case of this data set, an 8-fold c-v (since eight subjects contributed cases to the
# training set). I may explore these ideas a bit later.)
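# (A minimal sketch of the subject-centroid idea just mentioned; it relies on the guessed
#  subject structure of the training rows (each block of 66 consecutive rows assumed to
#  come from one subject, as I'll also assume further below). It just builds the 88
#  subject-by-class centroids of the canonical variables; I haven't carried the idea
#  through to classification here.)
subj.trn <- rep(1:8, each=66)
cent.trn <- aggregate(canvartrn[,-1], by=list(y=canvartrn$y, subj=subj.trn), FUN=mean)
dim(cent.trn)   # 88 rows: 11 classes x 8 subjects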
#####

# Now I want to compare the class 4 training cases to the class 4 test cases. The training
# cases will be plotted in various shades of blue, and the test cases will be plotted in
# various shades of red and orange.
# I first need to read in the test data and convert it to canonical variables (using the
# info from the training cases to convert the test cases the same way the training cases
# were converted).
fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt", sep=',', header=T)
tstdat <- fulltstdat[,-1]
sctst.vars <- scale(tstdat[,-1], center=attr(sctrn.vars,"scaled:center"),
                    scale=attr(sctrn.vars,"scaled:scale"))
# (Note: There might be a slicker way to scale the test set the same way the training set
#  was scaled, but this way works.)
sctstdat <- data.frame( cbind( tstdat[,1], sctst.vars ) )
names(sctstdat) <- names(sctrndat)
canvar.tst <- as.matrix(sctstdat[,-1]) %*% scldafit$scaling
canvartst <- data.frame( cbind( tstdat[,1], canvar.tst ) )
names(canvartst) <- names(canvartrn)
# (Note: One could certainly "streamline" some of the code above, but doing it this way
#  makes it parallel to what was done above for the training data.)
class4tst <- canvartst[ind4[1:42],]
plot(class4tst[1:6,2], class4tst[1:6,3], col="firebrick", main='Class 4 Training and Test Cases',
     xlab='1st can var', ylab='2nd can var', xlim=c(-3.8,-0.2), ylim=c(-0.6,3.8))
points(class4tst[7:12,2], class4tst[7:12,3], col="coral")
points(class4tst[13:18,2], class4tst[13:18,3], col="indianred")
points(class4tst[19:24,2], class4tst[19:24,3], col="darkorange")
points(class4tst[25:30,2], class4tst[25:30,3], col="red")
points(class4tst[31:36,2], class4tst[31:36,3], col="orangered")
points(class4tst[37:42,2], class4tst[37:42,3], col="deeppink")
points(class4[1:6,2], class4[1:6,3], col="darkcyan")
points(class4[7:12,2], class4[7:12,3], col="cyan")
points(class4[13:18,2], class4[13:18,3], col="aquamarine")
points(class4[19:24,2], class4[19:24,3], col="mediumblue")
points(class4[25:30,2], class4[25:30,3], col="turquoise")
points(class4[31:36,2], class4[31:36,3], col="darkblue")
points(class4[37:42,2], class4[37:42,3], col="dodgerblue")
points(class4[43:48,2], class4[43:48,3], col="cornflowerblue")
# Note that blue points are generally much closer to other blue points than red points
# are to blue points.

#####

############################
# Classification using KNN #
############################

# Let's try KNN classification three different ways:
#   (1) using the original unscaled data,
#   (2) using the original data scaled,
#   (3) using the first two canonical variables.
# For each way, we'll try to pick a good value of k using c-v, and then we'll see how
# well we classify the test set cases with the chosen value of k.

# First, the original unscaled data.
cvknn <- train.kknn(as.factor(y) ~ ., trndat, kmax=25, kernel="rectangular", distance=2)
plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Let's see how using just one nearest neighbor does on the test data.
pred <- knn( trndat[,-1], tstdat[,-1], cl=trndat[,1], k=1 )
unscaled.rate <- mean( pred != tstdat[,1] )
unscaled.rate
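# (Optional: a quick cross-tabulation of the 1-NN predictions against the true test
#  classes, to see which vowels get confused with which.)
table(predicted=pred, actual=tstdat[,1])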
#####

# Now, the original data scaled.
cvknn <- train.kknn(as.factor(y) ~ ., sctrndat, kmax=25, kernel="rectangular", distance=2)
plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Again, let's see how using just one nearest neighbor does on the test data.
pred <- knn( sctrndat[,-1], sctstdat[,-1], cl=sctrndat[,1], k=1 )
scaled.rate <- mean( pred != sctstdat[,1] )
scaled.rate
# Although scaling the data is typically a good idea (since otherwise one predictor having
# a variance appreciably larger than the others can have too large an influence in the
# distance determinations, and scaling the data tends to make the various predictors
# contribute more equally in the distance determinations), it could be that the unscaled
# version effectively gives more weight to the predictors which are the best ones, and
# scaling the data reduces the influence of the best predictors. Let's look at the scaling
# factors to see what the case is here.
attr(sctrn.vars,"scaled:scale")
# Scaling would tend to reduce the effect of x.1 and x.2 the most, but from past experience
# with this data I know that these two variables are the best predictors! So it's not such
# a mystery why scaling wasn't effective here.

#####

# Now, using the first two canonical variables.
cvknn <- train.kknn(as.factor(y) ~ cv1 + cv2, canvartrn, kmax=25, kernel="rectangular", distance=2)
plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# This time, using 7 nearest neighbors seems appropriate.
pred <- knn( canvartrn[,2:3], canvartst[,2:3], cl=canvartrn[,1], k=7 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Now I am surprised at this result. By design, and as can be seen from Fig. 4.8 on p. 115
# of HTF2, the first few canonical variables should be the best predictors, and many of the
# other canonical variables should be pretty much useless as predictors. So by just using
# the first two canonical variables, we're giving increased weight to what should be the
# good predictors (since we're not giving any weight to the others). By lowering the
# dimension, the nearest neighbors should actually be near (which is not always the case
# when the dimension (number of predictors) is large). So I'm truly puzzled.

#####

# Just to be thorough, let's try using three canonical variables instead of just two.
cvknn <- train.kknn(as.factor(y) ~ cv1 + cv2 + cv3, canvartrn, kmax=25, kernel="rectangular", distance=2)
plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Now just using one nearest neighbor seems appropriate.
pred <- knn( canvartrn[,2:4], canvartst[,2:4], cl=canvartrn[,1], k=1 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Worse! But before quitting (for now), I wonder if just one nearest neighbor should have
# been used with the first two canonical variables (since 1 was the choice for k in all of
# the other cases).
pred <- knn( canvartrn[,2:3], canvartst[,2:3], cl=canvartrn[,1], k=1 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Nope, worse still. I really thought the canonical variables would work well with
# nearest neighbors classification.

#####

##########################################
# Classification using LDA, QDA, and RDA #
##########################################

# Let's try stepwise LDA classification three different ways:
#   (1) using the original unscaled data,
#   (2) using the original data scaled (not really necessary),
#   (3) using the first two canonical variables.

# First, the original unscaled data.
ldafit <- stepclass(y ~ ., data=trndat, method="lda")
ldafit
# Interesting. When I did this one other time, the c-v indicated that just using x.1 and
# x.2 was best. When using c-v in a situation like this, it may be best to try it several
# times.
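# (For instance, a minimal sketch along those lines: rerun the stepwise selection a few
#  times and see which subsets come up. Each run takes a little while, since stepclass()
#  does c-v at every step, and the selected subsets can differ from run to run.)
for (r in 1:3) print( stepclass(y ~ ., data=trndat, method="lda") )
# In any case, I'll go with the subset indicated by the run above (x.1, x.2, and x.8).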
ldafit <- lda(y ~ x.1 + x.2 + x.8, trndat)
pred <- predict(ldafit, newdata=tstdat)
lda.rate <- mean( pred$class != tstdat[,1] )
lda.rate
# Note: From past experience with the data, I know that just using x.1 and x.2 results in
# a test error rate of only about 48%. So this time the c-v results didn't indicate the
# best subset of predictors.

# Next, the original data scaled.
ldafit <- stepclass(y ~ ., data=sctrndat, method="lda")
ldafit
# C-v indicates that just using x.1 and x.2 is best.
ldafit <- lda(y ~ x.1 + x.2, sctrndat)
pred <- predict(ldafit, newdata=sctstdat)
lda.rate <- mean( pred$class != sctstdat[,1] )
lda.rate
# Let's try just x.1 and x.2 with the unscaled data.
ldafit <- lda(y ~ x.1 + x.2, trndat)
pred <- predict(ldafit, newdata=tstdat)
lda.rate <- mean( pred$class != tstdat[,1] )
lda.rate
# I didn't really need to do both versions, since with LDA it doesn't matter whether the
# data are scaled or not.

# Now, using the canonical variables.
ldafit <- stepclass(y ~ ., data=canvartrn, method="lda")
ldafit
# C-v indicates that just using cv1 and cv2 is best.
ldafit <- lda(y ~ cv1 + cv2, canvartrn)
pred <- predict(ldafit, newdata=canvartst)
lda.rate <- mean( pred$class != canvartst[,1] )
lda.rate
# Well, once again the original variables did better (if c-v indicates the best set of
# variables to use).

# While I've got the canonical variables handy, I'm going to try QDA and RDA with them
# (using the stepwise procedure to select variables with QDA, but just using cv1 and cv2
# with RDA, since the stepwise procedure takes too long (due to doing nested c-v)).
qdafit <- stepclass(y ~ ., data=canvartrn, method="qda")
qdafit
# C-v indicates that just using cv1 and cv2 is best.
qdafit <- qda(y ~ cv1 + cv2, canvartrn)
pred <- predict(qdafit, newdata=canvartst)
qda.rate <- mean( pred$class != canvartst[,1] )
qda.rate
# Better than all but one of the results obtained above. (One nearest neighbors result
# was better.)
rdafit <- rda(y ~ cv1 + cv2, canvartrn)
rdafit
pred <- predict(rdafit, newdata=canvartst)
rda.rate <- mean( pred$class != canvartst[,1] )
rda.rate
# The parameters selected make RDA similar to QDA, but the QDA result is just a tad better.

# So, before quitting, let's see what happens when we do QDA with the original variables
# (using the stepwise procedure to select the variables to use).
qdafit <- stepclass(y ~ ., data=trndat, method="qda")
qdafit
# Using the indicated variables.
qdafit <- qda(y ~ x.1 + x.2 + x.3 + x.8 + x.9, trndat)
pred <- predict(qdafit, newdata=tstdat)
qda.rate <- mean( pred$class != tstdat[,1] )
qda.rate
# So, all in all, using nearest neighbors on the original data did best (43.7% errors),
# and using stepwise QDA with the canonical variables did second best (46.1% errors).

#####

###############################
# Cross-Validation Done Right #
# (given the nature of the    #
# training and test samples)  #
###############################

# Now I will use a designed (nonrandom) 8-fold c-v which leaves out the cases for one
# training set subject at a time (so that a training set case cannot have other cases
# from the same subject as its neighbors during the c-v).
# I'll assume the first 66 rows in the training data set are from the first subject,
# the second 66 rows are from the second subject, and so on. I'll store the numbers of
# wrong classifications for the various values of k in the vector errors as I cycle
# through the eight folds.
# Clearly I could write a function to do what I'm doing below, but I think doing it this
# way makes it easier to follow.
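# (A minimal sketch of what such a function might look like, assuming the same 66-row
#  subject blocks; it returns the vector of 8-fold c-v error rates for k = 1, ..., kmax.
#  The written-out copies of the loop below are what I'll actually run.)
subjcv.knn <- function(X, y, kmax=25) {
  errors <- rep(0, kmax)
  for (fold in 1:8) {
    block <- 66*(fold - 1) + c(1:66)        # the rows assumed to belong to one subject
    for (i in 1:kmax) {
      pred <- knn(X[-block,], X[block,], cl=y[-block], k=i)
      errors[i] <- errors[i] + sum(pred != y[block])
    }
  }
  errors/length(y)
}
# E.g., subjcv.knn(trndat[,-1], trndat[,1]) should give the same error rates as the
# first loop below.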
# I'll first use the original data, and then I'll just copy the same chunk of code over,
# changing the data set, to get results for the scaled original data and for the first
# two canonical variables.
errors <- rep(0, 25)
for (fold in 1:8) {
  block <- 66*(fold - 1) + c(1:66)
  for (i in 1:25) {
    pred <- knn(trndat[-block,-1], trndat[block,-1], cl=trndat[-block,1], k=i)
    errors[i] <- errors[i] + sum(pred != trndat[block,1])
  }
}
error.rate <- errors/528
plot(c(1:25), error.rate, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Let's take a closer look at the error rates for the smaller values of k.
error.rate[1:8]
# The rates for k = 1 and k = 6 are tied as being the lowest. Since the overall pattern
# indicates that the rate gets lower as k gets smaller, and since using an even value for
# k may tend to produce more ties (this is definitely the case when there are two classes,
# and I suspect it's also the case in a lot of multiclass situations as well), I'd choose
# to use k = 1 based on these results.
# We know from before that k = 1 results in a test error rate of about 0.437. Notice how
# now the c-v results give a fairly accurate estimate (0.455) of the test error rate,
# whereas that was not the case before.

#####

# Now I'll use the scaled original data.
errors <- rep(0, 25)
for (fold in 1:8) {
  block <- 66*(fold - 1) + c(1:66)
  for (i in 1:25) {
    pred <- knn(sctrndat[-block,-1], sctrndat[block,-1], cl=sctrndat[-block,1], k=i)
    errors[i] <- errors[i] + sum(pred != sctrndat[block,1])
  }
}
error.rate <- errors/528
plot(c(1:25), error.rate, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Again, using k = 1 seems best. We know from before that the test error rate is about
# 0.494. Once again this better way of doing c-v gave a decent estimate of the test
# error rate. (I'll print the estimate below.)
error.rate[1]

#####

# Next, I'll try using the first two canonical variables. I'm going to consider k up
# to 75 (instead of just 25).
errors <- rep(0, 75)
for (fold in 1:8) {
  block <- 66*(fold - 1) + c(1:66)
  for (i in 1:75) {
    pred <- knn(canvartrn[-block,2:3], canvartrn[block,2:3], cl=canvartrn[-block,1], k=i)
    errors[i] <- errors[i] + sum(pred != canvartrn[block,1])
  }
}
error.rate <- errors/528
plot(c(1:75), error.rate, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Wow! This time we get quite different results from the c-v. If the c-v results are
# accurate, we ought to get our best test set performance using the first two canonical
# variables with a large value for k.
# Let's see what k minimizes the estimated error rate, and what the value of the lowest
# estimated rate is.
which.min(error.rate)
error.rate[which.min(error.rate)]

#####

# I'll go with k = 42.
pred <- knn( canvartrn[,2:3], canvartst[,2:3], cl=canvartrn[,1], k=42 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# This is lower than all of the test set rates obtained above (although only a tad lower
# than what was obtained using one nearest neighbor with the original data).
# Note that when the simplistic c-v was used with the first two canonical variables,
# k = 7 was indicated and the error rate was not nearly as good (0.532).
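# (Optional: a confusion matrix for this best-so-far classifier (42 nearest neighbors on
#  the first two canonical variables), for comparison with the 1-NN table shown earlier.)
table(predicted=pred, actual=canvartst[,1])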
# To be thorough, I'll see what happens if the first three canonical variables are used.
errors <- rep(0, 75)
for (fold in 1:8) {
  block <- 66*(fold - 1) + c(1:66)
  for (i in 1:75) {
    pred <- knn(canvartrn[-block,2:4], canvartrn[block,2:4], cl=canvartrn[-block,1], k=i)
    errors[i] <- errors[i] + sum(pred != canvartrn[block,1])
  }
}
error.rate <- errors/528
plot(c(1:75), error.rate, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
which.min(error.rate)
error.rate[which.min(error.rate)]
# Let's travel down Route 66. (Note: The c-v estimate of the error rate is promising
# (lower than before).)
pred <- knn( canvartrn[,2:4], canvartst[,2:4], cl=canvartrn[,1], k=66 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Given that this is not the best rate so far, it's not unreasonable to go back to the
# plot and decide to try a smaller value of k ... something just a bit bigger than 20.
error.rate[21:25]
# I'll go with k = 21.
pred <- knn( canvartrn[,2:4], canvartst[,2:4], cl=canvartrn[,1], k=21 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Note that when the simplistic c-v was used with the first 3 canonical variables, k = 1
# was indicated and the error rate was not nearly as good (0.574).

###########
# Summary #
###########

# Recap of the best results from above:
#   0.435 (42 nearest neighbors, 1st 2 canonical variables),
#   0.437 (1 nearest neighbor, original data (all variables)),
#   0.446 (21 nearest neighbors, 1st 3 canonical variables),
#   0.461 (stepwise QDA, canonical variables (1st 2 chosen)),
#   0.478 (RDA, first two canonical variables),
#   0.481 (stepwise LDA, original data (but the good set of variables may not be chosen
#          using the c-v routine)).
# From the description of the data available on the authors' web site, one can see that
# the best neural network classifiers resulted in error rates of about 45%. (The
# description also gives the 44% error rate obtained using one nearest neighbor.)

# Some lessons learned:
#   (1) understand the nature of the training and test samples
#       (and be careful if they aren't composed of iid observations),
#   (2) try many different classification methods
#       (both simple ones and out-of-the-ordinary ones).

##### la fin #####