# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2.

      # Note: Developed with vesion 2.9.0.
      # Note: I designed this so that it'll work well to paste the commands into R in chunks.
          #   Each time, go down to right before a heading of the form 
          #        ###   **********
          #        ###   * Plot i *
          #        ###   **********
          #   starting with i = 2, and enter all of the commands up until that next plot.  This
          # way you'll have the next time-consuming cross-validation runs started while you're
          # talking time to read the comments and study the plots.                   


# I'll load some libraries and set the random number seed.

library(class)
library(MASS)
library(e1071) 
library(kknn)
library(klaR)
library(rgl)

set.seed(7)


#######################################################
# Reading in Data and Creation of Canonical Variables #
#######################################################

# I'll read in the training data and delete the unnecessary case id column.

fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T)
trndat <- fulltrndat[,-1]

# I'll scale the data and create canonical variables. 

sctrn.vars <- scale(trndat[,-1], center=TRUE, scale=TRUE)
sctrndat <- data.frame( cbind( trndat[,1], sctrn.vars ) )
names(sctrndat) <- names(trndat)
scldafit <- lda(y~., sctrndat)
canvar.trn <- as.matrix(sctrndat[,-1]) %*% scldafit$scaling
canvartrn <- data.frame( cbind( trndat[,1], canvar.trn ) )
names(canvartrn) <- c("y","cv1","cv2","cv3","cv4","cv5","cv6","cv7","cv8","cv9","cv10")

# Now read in test data and create canonical variables.

fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T)
tstdat <- fulltstdat[,-1]

sctst.vars <- scale(tstdat[,-1],center=attr(sctrn.vars,"scaled:center"),
                                scale=attr(sctrn.vars,"scaled:scale"))
sctstdat <- data.frame( cbind(tstdat[,1], sctst.vars ) )
names(sctstdat) <- names(sctrndat)

canvar.tst <- as.matrix(sctstdat[,-1]) %*% scldafit$scaling
canvartst <- data.frame( cbind( tstdat[,1], canvar.tst ) )
names(canvartst) <- names(canvartrn)

##################################

# *** possibly skip this chunk ***
# Now for a 3-d  plot, based on first 3 canonical variables, 
# that can be rotated.

colorcycle <- c("black","purple","brown","plum","darkorange","dodgerblue","darkgray","goldenrod","slateblue",
                 "tomato","limegreen")
pointcolors=rep(colorcycle,48)
open3d()
spheres3d(canvartrn[,2:4], radius=.15, color=pointcolors)

##################################


########################################################################
# Classification using KNN (including weights and alternative metrics) #
########################################################################

# There are at least two R functions that do nearest neighbors classification,
#      knn
#   &  kknn.

# knn is basic "vanilla" nearest neighbors classification ... you just specify
# the number of nearest neighbors that you want to use.  But it does allow you 
# to give it either the original data, a scaled verseion of the data, or really
# any transformation/version of the data that you want to give it.  

# kknn has a lot *more options*; not only can you alter k, but you can use different 
# metrics to determine the distances, and you can do weighted nearest neighbors, using
# different weight functions that give larger votes to the nearer neighbors when
# determining the predicted class using the k nearest neighbors.  (The "rectangular"
# weight function provides the option of giving equal weight to all k of the nearest
# neighbors.)  But kknn *always scales whatever data you give it* before determing the
# distances (so if you  give it both scaled and unscaled versions of the data you'll
# get the same results (except for possibly ties being randomly broken in different
# ways)).  Usually it's good practice to scale the data before applying nearest
# neighbors classification, but in some cases an unscaled version of the data might
# be such that the most important variables for classification just happen to have
# greater weight (and so classification performance could actually be better using
# unscaled data).  

# There is also a train.kknn f'n that uses cross-validation on the training data 
# to try to determine what value works best for k.  In addition to searching for 
# a good k, it can simultaneously search for a good metric and a good weight 
# function.  


# Since the blocked nature of the vowel data makes leave-one-out c-v inapproprite, 
# I'll use my own c-v routine, which leaves out one training set subject at a time 
# (creating an 8-fold c-v).  I'll first write a f'n to do the desired c-v.

cv.vowel <- function(data, k, dist, kern)   # kern is for the weight function
{
  errors <- rep(0, k)
  for (fold in 1:8)
  {
    block <- 66*(fold - 1) + c(1:66)     
    for (i in 1:k)
    {
      out <- kknn(as.factor(y)~.,train=data[-block,],test=data[block,],k=i,distance=dist,kernel=kern)
      errors[i] <- errors[i] + sum(out$fit != data[block,1])
    }
  }
  error.rate <- errors/528
  error.rate
}  


# I'll use the original unscaled data (knowing that kknn will scale it).
# I'll first consider different metrics.

err.35.h.rec <- cv.vowel(trndat, 35, 0.5, "rectangular")
err.35.1.rec <- cv.vowel(trndat, 35, 1, "rectangular")
err.25.2.rec <- cv.vowel(trndat, 25, 2, "rectangular")
err.25.3.rec <- cv.vowel(trndat, 25, 3, "rectangular")

###   **********
###   * Plot 1 *
###   **********

plot(c(1:25),err.35.h.rec[1:25],type="l",col="darkviolet",
  ylim=c(0.45,0.62),main="C-V Results (rect. kernel, various metrics)",xlab="k",ylab="Misclass")
points(c(1:25),err.35.1.rec[1:25],type="l",col="dodgerblue")
points(c(1:25),err.25.2.rec,type="l",col="limegreen")
points(c(1:25),err.25.3.rec,type="l",col="darkgoldenrod")
legend(18,0.5,legend=c("L_3","L_2","L_1","L_0.5"),
       fill=c("darkgoldenrod","limegreen","dodgerblue","darkviolet"))

# It appears L_1 distance with small k small will be best.


# I'll now try different weight f'ns (kernels) with L_1 distance.

err.35.1.tri <- cv.vowel(trndat, 35, 1, "triangular")
err.35.1.biw <- cv.vowel(trndat, 35, 1, "biweight")
err.35.1.gau <- cv.vowel(trndat, 35, 1, "gaussian")

###   **********
###   * Plot 2 *
###   **********

plot(c(1:35),err.35.1.rec,type="l",col="dodgerblue",
  ylim=c(0.435,0.6),main="C-V Results (L_1)",xlab="k",ylab="Misclass")
points(c(1:35),err.35.1.tri,type="l",col="seagreen")
points(c(1:35),err.35.1.biw,type="l",col="gold1")
points(c(1:35),err.35.1.gau,type="l",col="darkorange")
legend(1,0.6,legend=c("rectangular","triangular","biweight","gaussian"),
       fill=c("dodgerblue","seagreen","gold1","darkorange"))

# The weighted nearest neighbors seems to work better 
# than ordinary (rectangular kernel) nearest neighbor.
# One can also note that the biweight f'n is much less  
# sensitive to the choice of k than is the rectangular   
# f'n.  This suggests that it won't matter so much if  
# the c-v doesn't identify the best choice for k.


# Just to be thorough, I'll try weights with L_0.5 metric.

err.35.h.tri <- cv.vowel(trndat, 35, 0.5, "triangular")
err.35.h.biw <- cv.vowel(trndat, 35, 0.5, "biweight")
err.35.h.gau <- cv.vowel(trndat, 35, 0.5, "gaussian")

###   **********
###   * Plot 3 *
###   **********

plot(c(1:35),err.35.h.rec,type="l",col="dodgerblue",
  ylim=c(0.45,0.6),main="C-V Results (L_0.5)",xlab="k",ylab="Misclass")
points(c(1:35),err.35.h.tri,type="l",col="seagreen")
points(c(1:35),err.35.h.biw,type="l",col="gold1")
points(c(1:35),err.35.h.gau,type="l",col="darkorange")
legend(1,0.6,legend=c("rectangular","triangular","biweight","gaussian"),
       fill=c("dodgerblue","seagreen","gold1","darkorange"))

# It looks like the L_1 distance works better.  (The lowest
# estimated error rates were nearer 0.45 on the previous plot.)
# Specifically, based on the last two plots, it appears L_1 
# distance with the biweight kernel and k = 23 works the best.  
# Let's see how well the c-v results do in picking the actual 
# winner and estimating it's accuracy by looking at test set 
# prediction results.


test.vowel <- function(train, test, k, dist, kern)
 {
   errors <- rep(0, k)
   for (i in 1:k)
    {
      out <- kknn(as.factor(y)~.,train=train,test=test,k=i,distance=dist,kernel=kern)
      errors[i] <-  mean(out$fit != test[,1])
     }
   errors
 }  

tsterr.35.1.tri <- test.vowel(trndat, tstdat, 35, 1, "triangular")
tsterr.35.1.biw <- test.vowel(trndat, tstdat, 35, 1, "biweight")
tsterr.35.2.rec <- test.vowel(trndat, tstdat, 35, 2, "rectangular")

###   **********
###   * Plot 4 *
###   **********

plot(c(1:35),err.35.1.tri,type="l",col="seagreen",
     ylim=c(0.44,0.56),main="C-V & Test Results (L_1 metric)",xlab="k",ylab="Misclass",
    sub="Note: Above 'ordinary' knn refers to the L_2 distance and the rectangular kernel.")
points(c(1:35),tsterr.35.2.rec,type="l",col="skyblue1")
points(c(1:35),tsterr.35.1.tri,type="l",col="limegreen")
points(c(1:35),tsterr.35.1.biw,type="l",col="gold")
points(c(1:35),err.35.1.biw,type="l",col="darkgoldenrod")
legend(1,0.56,legend=c("triangular c-v","triangular test","biweight c-v","biweight test","'ordinary' knn"),
       fill=c("seagreen","limegreen","darkgoldenrod","gold","skyblue1"))

# The L_1, biweight, and k=23 choice did okay, but overall
# there isn't a strong correspondence between the c-v estimates
# of the error rates and the error rates experienced with the 
# test data.


#####################################################################

# Now I'll repeat the above process just using x.1 and x.2 (resetting
# the random number seed in case I make changes above later).

set.seed(12)

errsm.45.h.rec <- cv.vowel(trndat[1:3], 45, 0.5, "rectangular")
errsm.45.1.rec <- cv.vowel(trndat[1:3], 45, 1, "rectangular")
errsm.45.2.rec <- cv.vowel(trndat[1:3], 45, 2, "rectangular")
errsm.85.3.rec <- cv.vowel(trndat[1:3], 85, 3, "rectangular")

###   **********
###   * Plot 5 *
###   **********

plot(c(1:45),errsm.45.h.rec,type="l",col="darkviolet",
  ylim=c(0.46,0.6),main="C-V Results (rect. kernel, various metrics)",xlab="k",ylab="Misclass")
points(c(1:45),errsm.45.1.rec,type="l",col="dodgerblue")
points(c(1:45),errsm.45.2.rec,type="l",col="limegreen")
points(c(1:45),errsm.85.3.rec[1:45],type="l",col="darkgoldenrod")
legend(1,0.5,legend=c("L_3","L_2","L_1","L_0.5"),
       fill=c("darkgoldenrod","limegreen","dodgerblue","darkviolet"))

# This time L_3 looks like best choice, with L_2 second best.


# I'll try different weight f'ns (kernels) with L_3 distance.


errsm.45.3.tri <- cv.vowel(trndat[1:3], 45, 3, "triangular")
errsm.45.3.biw <- cv.vowel(trndat[1:3], 45, 3, "biweight")
errsm.45.3.gau <- cv.vowel(trndat[1:3], 45, 3, "gaussian")

###   **********
###   * Plot 6 *
###   **********

plot(c(1:45),errsm.85.3.rec[1:45],type="l",col="dodgerblue",
  ylim=c(0.49,0.595),main="C-V Results (L_3)",xlab="k",ylab="Misclass")
points(c(1:45),errsm.45.3.tri,type="l",col="seagreen")
points(c(1:45),errsm.45.3.biw,type="l",col="gold1")
points(c(1:45),errsm.45.3.gau,type="l",col="darkorange")
legend(32,0.595,legend=c("rectangular","triangular","biweight","gaussian"),
       fill=c("dodgerblue","seagreen","gold1","darkorange"))

# It looks like the rectangular kernel, with k somewhere
# in the 20s might be best.


# Just to check more thorughly, let's try some more metrics with 
# the rectangular kernel.

errsm.85.2h.rec <- cv.vowel(trndat[1:3], 85, 2.5, "rectangular")
errsm.35.3h.rec <- cv.vowel(trndat[1:3], 35, 3.5, "rectangular")
errsm.35.4.rec <- cv.vowel(trndat[1:3], 35, 4, "rectangular")

###   **********
###   * Plot 7 *
###   **********

plot(c(1:35),errsm.85.2h.rec[1:35],type="l",col="darkviolet",
  ylim=c(0.49,0.6),main="C-V Results (rect. kernel, various metrics)",xlab="k",ylab="Misclass")
points(c(1:35),errsm.85.3.rec[1:35],type="l",col="dodgerblue")
points(c(1:35),errsm.35.3h.rec,type="l",col="limegreen")
points(c(1:35),errsm.35.4.rec,type="l",col="darkgoldenrod")
legend(27.5,0.6,legend=c("L_2.5","L_3","L_3.5","L_4"),
       fill=c("darkviolet","dodgerblue","limegreen","darkgoldenrod"))

# In the neighborhood considered, the exact choice of metric
# doesn't seem that crucial.


# Let's see how well the c-v results estimate the test sample 
# error rates, and see if one of the alternative metrics beats
# out the usual L_2 choice.

tsterrsm.85.2h.rec <- test.vowel(trndat[1:3], tstdat[1:3], 85, 2.5, "rectangular")
tsterrsm.85.3.rec <- test.vowel(trndat[1:3], tstdat[1:3], 85, 3, "rectangular")
tsterrsm.85.2.rec <- test.vowel(trndat[1:3], tstdat[1:3], 85, 2, "rectangular")

###   **********
###   * Plot 8 *
###   **********

plot(c(15:85),errsm.85.2h.rec[15:85],type="l",col="seagreen",
     ylim=c(0.49,0.57),main="C-V & Test Results (rectangular kernel)",
     xlab="k",ylab="Misclass")
points(c(15:85),tsterrsm.85.2.rec[15:85],type="l",col="skyblue1")
points(c(15:85),tsterrsm.85.2h.rec[15:85],type="l",col="limegreen")
points(c(15:85),tsterrsm.85.3.rec[15:85],type="l",col="gold")
points(c(15:85),errsm.85.3.rec[15:85],type="l",col="darkgoldenrod")
legend(52.5,0.57,legend=c("L_2 test","L_2.5 test","L_3 test","L_2.5 c-v","L_3 c-v"),
       fill=c("skyblue1","limegreen","gold","seagreen","darkgoldenrod"))

# We can see that c-v led us astray by indicating too small 
# a value for k.  It could be that the main problem is that
# the sample sizes are inadequate to insure decent performance
# with high probability.  (Recall that only 8 subjects contributed
# to the training sample, and only 7 to the test sample.  The c-v 
# results might be a lot less misleading if we had 10 times as many
# subjects since two small samples from the same population are more
# likely to be appreciably different than two larger samples are.)

# We can also see that on the test set it doesn't matter a lot if one 
# uses L_2, L_2.5, or L_3 ... and that whatever choice is made the 
# test sample error rate isn't good compared to what other classifiers
# yield.


#############################################################

# Now I'll use the first two canonical variables.  Since kknn 
# scales the predictors, the  2nd c.v. will now get more weight
# than it did when the knn f'n (which doesn't scale) was used,
# because the 1st c.v. has a larger variance than the 2nd c.v.
# (as can be seen below), and so it will be downweighted more
# by scaling.

set.seed(12)

# variance of 1st c.v.:
var(canvartrn[,2])

# variance of 2nd c.v.:
var(canvartrn[,3])

cverr.115.h.rec <- cv.vowel(canvartrn[1:3], 115, 0.5, "rectangular")
cverr.115.1.rec <- cv.vowel(canvartrn[1:3], 115, 1, "rectangular")
cverr.115.2.rec <- cv.vowel(canvartrn[1:3], 115, 2, "rectangular")
cverr.115.3.rec <- cv.vowel(canvartrn[1:3], 115, 3, "rectangular")

###   **********
###   * Plot 9 *
###   **********

plot(c(1:115),cverr.115.h.rec,type="l",col="darkviolet",
  ylim=c(0.37,0.54),main="C-V Results (rect. kernel, various metrics)",xlab="k",ylab="Misclass")
points(c(1:115),cverr.115.1.rec,type="l",col="dodgerblue")
points(c(1:115),cverr.115.2.rec,type="l",col="limegreen")
points(c(1:115),cverr.115.3.rec,type="l",col="darkgoldenrod")
legend(85,0.54,legend=c("L_3","L_2","L_1","L_0.5"),
       fill=c("darkgoldenrod","limegreen","dodgerblue","darkviolet"))

# L_1 with k around the mid 30s looks good. But the estimated error
# rate for L_2 and L_3 with larger k (50s and 60s) are also good for 
# this data set.

# I'll now try different weight f'ns (kernels) with L_1 distance.

cverr.115.1.tri <- cv.vowel(canvartrn[1:3], 115, 1, "triangular")
cverr.115.1.biw <- cv.vowel(canvartrn[1:3], 115, 1, "biweight")
cverr.115.1.gau <- cv.vowel(canvartrn[1:3], 115, 1, "gaussian")

###   ***********
###   * Plot 10 *
###   ***********

plot(c(1:115),cverr.115.1.rec,type="l",col="dodgerblue",
  ylim=c(0.37,0.55),main="C-V Results (L_1)",xlab="k",ylab="Misclass")
points(c(1:115),cverr.115.1.tri,type="l",col="seagreen")
points(c(1:115),cverr.115.1.biw,type="l",col="gold1")
points(c(1:115),cverr.115.1.gau,type="l",col="darkorange")
legend(80,0.55,legend=c("rectangular","triangular","biweight","gaussian"),
       fill=c("dodgerblue","seagreen","gold1","darkorange"))

which.min(cverr.115.1.rec)
cverr.115.1.rec[which.min(cverr.115.1.rec)]

# k = 36, L_1, and rectangular gives lowest value, but
# notice how the triangular kernel seems to keep getting 
# better as k increases.  (With the triangular kernel, as
# one increases k, the weight given to the newly added 
# "voting neighbors" is small and the heavy "voters" are
# of course voting as they did for smaller k.  So it tends 
# to take fairly large increases of k to have an appreciable 
# effect.) 


# I'll now try different weight f'ns (kernels) with L_3 distance.

cverr.115.3.tri <- cv.vowel(canvartrn[1:3], 115, 3, "triangular")
cverr.115.3.biw <- cv.vowel(canvartrn[1:3], 115, 3, "biweight")
cverr.115.3.gau <- cv.vowel(canvartrn[1:3], 115, 3, "gaussian")

###   ***********
###   * Plot 11 *
###   ***********

plot(c(1:115),cverr.115.3.rec,type="l",col="dodgerblue",
  ylim=c(0.37,0.55),main="C-V Results (L_3)",xlab="k",ylab="Misclass")
points(c(1:115),cverr.115.3.tri,type="l",col="seagreen")
points(c(1:115),cverr.115.3.biw,type="l",col="gold1")
points(c(1:115),cverr.115.3.gau,type="l",col="darkorange")
legend(80,0.55,legend=c("rectangular","triangular","biweight","gaussian"),
       fill=c("dodgerblue","seagreen","gold1","darkorange"))

which.min(cverr.115.3.tri)

# The rectangular kernel looks best, but again the 
# triangular is interesting.

# Let's look at some test sample results for L_3.

tstcverr.115.3.rec <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 3, "rectangular")
tstcverr.115.3.tri <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 3, "triangular")
tstcverr.115.2.rec <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 2, "rectangular")

###   ***********
###   * Plot 12 *
###   ***********

plot(c(25:115),cverr.115.3.tri[25:115],type="l",col="seagreen",
     ylim=c(0.385,0.5),main="C-V & Test Results (L_3 metric)",xlab="k",ylab="Misclass")
points(c(25:115),tstcverr.115.2.rec[25:115],type="l",col="skyblue1")
points(c(25:115),tstcverr.115.3.tri[25:115],type="l",col="limegreen")
points(c(25:115),tstcverr.115.3.rec[25:115],type="l",col="gold")
points(c(25:115),cverr.115.3.rec[25:115],type="l",col="darkgoldenrod")
legend(53,0.45,legend=c("L2 rec test","tri test","rect test","tri c-v","rect c-v"),
       fill=c("skyblue1","limegreen","gold","seagreen","darkgoldenrod"))

# The c-v estimates of the error rates are overoptimistic.
# The L_3, triangular, and large k combination works pretty 
# good (compared to other classification methods), with test
# sample error rates close to 44 and 45 percent.  (Note: The
# sky blue rates on the plot are test sample results for ordinary
# knn (i.e., rectangular kernel and L_2 metric).  THe other rates
# are all for L_3.)


# And now some test sample results for L_1.

tstcverr.115.1.rec <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 1, "rectangular")
tstcverr.115.1.tri <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 1, "triangular")

###   ***********
###   * Plot 13 *
###   ***********

plot(c(25:115),cverr.115.1.tri[25:115],type="l",col="seagreen",
     ylim=c(0.375,0.5),main="C-V & Test Results (L_1 metric)",xlab="k",ylab="Misclass")
points(c(25:115),tstcverr.115.2.rec[25:115],type="l",col="skyblue1")
points(c(25:115),tstcverr.115.1.tri[25:115],type="l",col="limegreen")
points(c(25:115),tstcverr.115.1.rec[25:115],type="l",col="gold")
points(c(25:115),cverr.115.1.rec[25:115],type="l",col="darkgoldenrod")
legend(44,0.456,legend=c("L2 rec test","tri test","rect test","tri c-v","rect c-v"),
       fill=c("skyblue1","limegreen","gold","seagreen","darkgoldenrod"))

# Once again, the c-v estimates of error are too low.


# Lessons learned:
#  (1)  Cross-validation estimates based on inadequate data 
#       can be misleading;
#  (2)  When working with weighted KNN classification, consider
#       some large enough values of k in order to make sure the
#       error rate doesn't keep getting lower as k is increased
#       past where a first impression might suggest.


##### TTFN #####