# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2.
   #  Focus here is on some classification methods from Ch. 13 of HTF2.         

      # Note: Developed with vesion 2.9.0.
      # Note: I designed this so that it'll work well to paste the commands into R in chunks.
          #   Each time, go down to the next line with just ##### on it and paste all of the 
          #   commands in at one time.  Then after the commands are executed, look at the 
          #   comments and output and look at the graph before pasting in the next chunk of
          #   commands.  (If you paste everything in all at once you won't have much time to 
          #   look at the graphics.)


# I'll load a package and set the random number seed

library(MASS)
library(class)

set.seed(7)


#######################################################
# Reading in Data and Creation of Canonical Variables #                            
#######################################################

# I'll read in the training data and replace the unnecessary 
# case id column with a subject id column.

trndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T)
trndat[,1] <- c(rep(1,66),rep(2,66),rep(3,66),rep(4,66),
                rep(5,66),rep(6,66),rep(7,66),rep(8,66))

# Next I'll scale the data. 

sctrn.vars <- scale(trndat[,-c(1:2)], center=TRUE, scale=TRUE)
sctrndat <- data.frame( cbind( trndat[,c(1:2)], sctrn.vars ) )
names(sctrndat) <- c("sub","class","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10")

# Now I will compute the canonical variables.
# (Note: Some classification will be done using original variables, 
# and some will be done using canonical variables.)

scldafit <- lda(class~., sctrndat[,-1])
canvar.trn <- as.matrix(sctrndat[,-c(1:2)]) %*% scldafit$scaling
canvartrn <- data.frame( cbind( trndat[,c(1:2)], canvar.trn ) )
names(canvartrn) <- c("sub","class","cv1","cv2","cv3","cv4","cv5","cv6","cv7","cv8","cv9","cv10")

# Now read in test data, create subject id, and create canonical variables.

tstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T)
tstdat[,1] <- c(rep(1,66),rep(2,66),rep(3,66),rep(4,66),
                rep(5,66),rep(6,66),rep(7,66))

sctst.vars <- scale(tstdat[,-c(1:2)],center=attr(sctrn.vars,"scaled:center"),
                                     scale=attr(sctrn.vars,"scaled:scale"))
sctstdat <- data.frame( cbind(tstdat[,c(1:2)], sctst.vars ) )
names(sctstdat) <- names(sctrndat)

canvar.tst <- as.matrix(sctstdat[,-c(1:2)]) %*% scldafit$scaling
canvartst <- data.frame( cbind( tstdat[,c(1:2)], canvar.tst ) )
names(canvartst) <- names(canvartrn)


############################################################
# Classification Using K-means Clustering (p. 460 of HTF2) #
############################################################

# With this data, rather than use a random start, I'll specify eight
# initial means per class, each the centroid of one subject's cases.

# I'll write a function that will first use K-means clustering to
# obtain eight new prototypes for each class, and then classify
# the test set cases using the class of the nearest prototype.
# (I can use the knn function to do the last step, letting it 
# compare the test set cases to the 88 prototypes.)

KMeansClassFixed <- function(traindata,testdata)
# 1st col. of each data set should be group/subject id
# 2nd col. of each data set should be class variable
# remaining col's of each set should be predictor var's
{
  prototypes <- NULL
  for (g in 1:11)
  {
    cases <- traindata[,2] == g
    classdata <- traindata[cases,]
    n.var <- length(classdata[1,]) - 2
    varmeans <- matrix(0, nrow=8, ncol=n.var) 
    for (var in 1:n.var) 
      varmeans[,var] <- tapply(classdata[,var+2], classdata[,1], mean)
       # varmeans contains 8 starting points for class g
      plot(classdata[,3],classdata[,4],col="gray",pch=20,
        xlim=c(range(classdata[,3])),ylim=c(range(classdata[,4])),
        xlab="1st variable",ylab="2nd variable",
        main="Initial & Final Centers for a Class")
      points(varmeans[,1],varmeans[,2],col="darkorange",pch=18)
      newmeans <- kmeans(classdata[,-c(1:2)],varmeans)
       # newmeans$centers are the final cluster centers for class g
      points(newmeans$centers[,1],newmeans$centers[,2],col="blue",pch=5)
      prototypes <- rbind(prototypes,newmeans$centers)
  }
  prototype.class <- c(rep(1,8),rep(2,8),rep(3,8),rep(4,8),rep(5,8),rep(6,8),
                       rep(7,8),rep(8,8),rep(9,8),rep(10,8),rep(11,8))
  prototype.sub <- rep((1:8),11)
  prototypes <- cbind(prototype.sub,prototype.class, prototypes)
  names(prototypes) <- names(traindata)
  predict.class <- knn(prototypes[,-c(1:2)], testdata[,-c(1:2)], prototypes[,2], k=1)
  test.error <-  mean( predict.class != testdata[,2] )
  test.error
}

KMeansClassFixed(trndat,tstdat)
# Above is test error rate for original data 
# (all variables).  
# ***  It's a low rate compared to others 
#      we've obtained with this data.

##### 

KMeansClassFixed(trndat[,1:4],tstdat[,1:4])
# Above is test error rate for original data 
# (first two variables).

KMeansClassFixed(sctrndat,sctstdat)
# Above is test error rate for scaled data 
# (all variables).

KMeansClassFixed(sctrndat[,1:4],sctstdat[,1:4])
# Above is test error rate for scaled data 
# (first two variables).

KMeansClassFixed(canvartrn,canvartst)
# Above is test error rate for
# all the canonical variables.

KMeansClassFixed(canvartrn[,1:4],canvartst[,1:4])
# Above is test error rate for
# first two canonical variables.

# With each of the three data sets, using all of
# the variables worked better than using just the 
# first two.  The lowest rate was obtained using
# the original (unscaled) data.

#####


######################################################################
# Classification Using Learning Vector Quantization (p. 462 of HTF2) #
######################################################################

# With this data, rather than use a random start, I'll specify eight
# initial means per class, each the centroid of one subject's cases.

# I'll write a function that uses Algorithm 13.1 on p. 462 of HTF2.

LVQfixed <- function(traindata,testdata,iterations,initial.epsilon)
# 1st col. of each data set should be group/subject id
# 2nd col. of each data set should be class variable
# remaining col's of each set should be predictor var's
{
  prototypes <- NULL
  for (g in 1:11)
  {
    cases <- traindata[,2] == g
    classdata <- traindata[cases,]
    n.var <- length(classdata[1,]) - 2
    varmeans <- matrix(0, nrow=8, ncol=n.var) 
    for (var in 1:n.var) 
      varmeans[,var] <- tapply(classdata[,var+2], classdata[,1], mean)
       # varmeans contains 8 starting points for class g
      prototypes <- rbind(prototypes,varmeans)
       # prototypes contains the starting p'types for the
       # classes processed so far
  }
  class <- c(rep(1,8),rep(2,8),rep(3,8),rep(4,8),rep(5,8),rep(6,8),
                       rep(7,8),rep(8,8),rep(9,8),rep(10,8),rep(11,8))
  sub <- rep((1:8),11)
  prototypes <- data.frame(cbind(sub,class,prototypes))
   # prototypes contains all of the initial p'types
   # (with class labels attached)
  names(prototypes) <- names(traindata)
  plot(prototypes[,3],prototypes[,4],col="gray",main="Drifting Prototypes",
         xlab="1st variable",ylab="2nd variable")
  for (i in 1:iterations)
  {
    epsilon <- initial.epsilon/i
    case <- sample.int(528, 1)
    distances <- as.matrix(dist(rbind(traindata[case,3:4],prototypes[,3:4])))
    closest <- as.integer(which.min(distances[-1,1]))
    ifelse(prototypes[closest,2]==traindata[case,2],
      prototypes[closest,-c(1:2)] <- (1 - epsilon)*prototypes[closest,-c(1:2)] + epsilon*traindata[case,-c(1:2)],
      prototypes[closest,-c(1:2)] <- (1 - epsilon)*prototypes[closest,-c(1:2)] - epsilon*traindata[case,-c(1:2)])
    points(prototypes[closest,3],prototypes[closest,4],col="cornflowerblue")
  }   
  predict.class <- knn(prototypes[,-c(1:2)], testdata[,-c(1:2)], prototypes[,2], k=1)
  test.error <-  mean( predict.class != testdata[,2] )
  test.error
}

LVQfixed(trndat,tstdat,100,0.5)
# Above is test error rate for original data 
# (all variables).  
# ***  It's a low rate compared to others 
#      we've obtained with this data.

##### 

LVQfixed(trndat[,1:4],tstdat[,1:4],100,0.5)
# Above is test error rate for original data 
# (first two variables).

LVQfixed(sctrndat,sctstdat,100,0.5)
# Above is test error rate for scaled data 
# (all variables).

LVQfixed(sctrndat[,1:4],sctstdat[,1:4],100,0.5)
# Above is test error rate for scaled data 
# (first two variables).

LVQfixed(canvartrn,canvartst,100,0.5)
# Above is test error rate for
# all the canonical variables.

LVQfixed(canvartrn[,1:4],canvartst[,1:4],100,0.5)
# Above is test error rate for
# first two canonical variables.

# With each of the three data sets, using all of
# the variables worked better than using just the 
# first two.  The lowest rate was obtained using
# the original (unscaled) data.

#####

# Now I'll try changing the number of iterations 
# and the initial epsilon.

LVQfixed(trndat,tstdat,400,0.5)

LVQfixed(trndat,tstdat,100,0.75)
# Best so far!

LVQfixed(trndat,tstdat,400,0.75)
# Better!!

LVQfixed(trndat,tstdat,400,1)

# I'll go back to epsilon = 0.75 and
# increase the number of iterations.
LVQfixed(trndat,tstdat,900,0.75)

# I'll try it once more with the same settings.
LVQfixed(trndat,tstdat,900,0.75)

# I find it peculiar that the error rate is exactly the same.
# I'll try it once more.
LVQfixed(trndat,tstdat,900,0.75)

# Based on the error rate we're getting with the initial
# epsilon being 0.75 and using a large number of iterations,
# I'd say this method seems pretty well suited for the data.


# I just won't quit.
LVQfixed(trndat,tstdat,1600,0.5)


########## Now I'll Quit ##########