# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2. # Focus here is on some classification methods from Ch. 13 of HTF2. # Note: Developed with vesion 2.9.0. # Note: I designed this so that it'll work well to paste the commands into R in chunks. # Each time, go down to the next line with just ##### on it and paste all of the # commands in at one time. Then after the commands are executed, look at the # comments and output and look at the graph before pasting in the next chunk of # commands. (If you paste everything in all at once you won't have much time to # look at the graphics.) # I'll load a package and set the random number seed library(MASS) library(class) set.seed(7) ####################################################### # Reading in Data and Creation of Canonical Variables # ####################################################### # I'll read in the training data and replace the unnecessary # case id column with a subject id column. trndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T) trndat[,1] <- c(rep(1,66),rep(2,66),rep(3,66),rep(4,66), rep(5,66),rep(6,66),rep(7,66),rep(8,66)) # Next I'll scale the data. sctrn.vars <- scale(trndat[,-c(1:2)], center=TRUE, scale=TRUE) sctrndat <- data.frame( cbind( trndat[,c(1:2)], sctrn.vars ) ) names(sctrndat) <- c("sub","class","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10") # Now I will compute the canonical variables. # (Note: Some classification will be done using original variables, # and some will be done using canonical variables.) scldafit <- lda(class~., sctrndat[,-1]) canvar.trn <- as.matrix(sctrndat[,-c(1:2)]) %*% scldafit$scaling canvartrn <- data.frame( cbind( trndat[,c(1:2)], canvar.trn ) ) names(canvartrn) <- c("sub","class","cv1","cv2","cv3","cv4","cv5","cv6","cv7","cv8","cv9","cv10") # Now read in test data, create subject id, and create canonical variables. tstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T) tstdat[,1] <- c(rep(1,66),rep(2,66),rep(3,66),rep(4,66), rep(5,66),rep(6,66),rep(7,66)) sctst.vars <- scale(tstdat[,-c(1:2)],center=attr(sctrn.vars,"scaled:center"), scale=attr(sctrn.vars,"scaled:scale")) sctstdat <- data.frame( cbind(tstdat[,c(1:2)], sctst.vars ) ) names(sctstdat) <- names(sctrndat) canvar.tst <- as.matrix(sctstdat[,-c(1:2)]) %*% scldafit$scaling canvartst <- data.frame( cbind( tstdat[,c(1:2)], canvar.tst ) ) names(canvartst) <- names(canvartrn) ############################################################ # Classification Using K-means Clustering (p. 460 of HTF2) # ############################################################ # With this data, rather than use a random start, I'll specify eight # initial means per class, each the centroid of one subject's cases. # I'll write a function that will first use K-means clustering to # obtain eight new prototypes for each class, and then classify # the test set cases using the class of the nearest prototype. # (I can use the knn function to do the last step, letting it # compare the test set cases to the 88 prototypes.) KMeansClassFixed <- function(traindata,testdata) # 1st col. of each data set should be group/subject id # 2nd col. of each data set should be class variable # remaining col's of each set should be predictor var's { prototypes <- NULL for (g in 1:11) { cases <- traindata[,2] == g classdata <- traindata[cases,] n.var <- length(classdata[1,]) - 2 varmeans <- matrix(0, nrow=8, ncol=n.var) for (var in 1:n.var) varmeans[,var] <- tapply(classdata[,var+2], classdata[,1], mean) # varmeans contains 8 starting points for class g plot(classdata[,3],classdata[,4],col="gray",pch=20, xlim=c(range(classdata[,3])),ylim=c(range(classdata[,4])), xlab="1st variable",ylab="2nd variable", main="Initial & Final Centers for a Class") points(varmeans[,1],varmeans[,2],col="darkorange",pch=18) newmeans <- kmeans(classdata[,-c(1:2)],varmeans) # newmeans$centers are the final cluster centers for class g points(newmeans$centers[,1],newmeans$centers[,2],col="blue",pch=5) prototypes <- rbind(prototypes,newmeans$centers) } prototype.class <- c(rep(1,8),rep(2,8),rep(3,8),rep(4,8),rep(5,8),rep(6,8), rep(7,8),rep(8,8),rep(9,8),rep(10,8),rep(11,8)) prototype.sub <- rep((1:8),11) prototypes <- cbind(prototype.sub,prototype.class, prototypes) names(prototypes) <- names(traindata) predict.class <- knn(prototypes[,-c(1:2)], testdata[,-c(1:2)], prototypes[,2], k=1) test.error <- mean( predict.class != testdata[,2] ) test.error } KMeansClassFixed(trndat,tstdat) # Above is test error rate for original data # (all variables). # *** It's a low rate compared to others # we've obtained with this data. ##### KMeansClassFixed(trndat[,1:4],tstdat[,1:4]) # Above is test error rate for original data # (first two variables). KMeansClassFixed(sctrndat,sctstdat) # Above is test error rate for scaled data # (all variables). KMeansClassFixed(sctrndat[,1:4],sctstdat[,1:4]) # Above is test error rate for scaled data # (first two variables). KMeansClassFixed(canvartrn,canvartst) # Above is test error rate for # all the canonical variables. KMeansClassFixed(canvartrn[,1:4],canvartst[,1:4]) # Above is test error rate for # first two canonical variables. # With each of the three data sets, using all of # the variables worked better than using just the # first two. The lowest rate was obtained using # the original (unscaled) data. ##### ###################################################################### # Classification Using Learning Vector Quantization (p. 462 of HTF2) # ###################################################################### # With this data, rather than use a random start, I'll specify eight # initial means per class, each the centroid of one subject's cases. # I'll write a function that uses Algorithm 13.1 on p. 462 of HTF2. LVQfixed <- function(traindata,testdata,iterations,initial.epsilon) # 1st col. of each data set should be group/subject id # 2nd col. of each data set should be class variable # remaining col's of each set should be predictor var's { prototypes <- NULL for (g in 1:11) { cases <- traindata[,2] == g classdata <- traindata[cases,] n.var <- length(classdata[1,]) - 2 varmeans <- matrix(0, nrow=8, ncol=n.var) for (var in 1:n.var) varmeans[,var] <- tapply(classdata[,var+2], classdata[,1], mean) # varmeans contains 8 starting points for class g prototypes <- rbind(prototypes,varmeans) # prototypes contains the starting p'types for the # classes processed so far } class <- c(rep(1,8),rep(2,8),rep(3,8),rep(4,8),rep(5,8),rep(6,8), rep(7,8),rep(8,8),rep(9,8),rep(10,8),rep(11,8)) sub <- rep((1:8),11) prototypes <- data.frame(cbind(sub,class,prototypes)) # prototypes contains all of the initial p'types # (with class labels attached) names(prototypes) <- names(traindata) plot(prototypes[,3],prototypes[,4],col="gray",main="Drifting Prototypes", xlab="1st variable",ylab="2nd variable") for (i in 1:iterations) { epsilon <- initial.epsilon/i case <- sample.int(528, 1) distances <- as.matrix(dist(rbind(traindata[case,3:4],prototypes[,3:4]))) closest <- as.integer(which.min(distances[-1,1])) ifelse(prototypes[closest,2]==traindata[case,2], prototypes[closest,-c(1:2)] <- (1 - epsilon)*prototypes[closest,-c(1:2)] + epsilon*traindata[case,-c(1:2)], prototypes[closest,-c(1:2)] <- (1 - epsilon)*prototypes[closest,-c(1:2)] - epsilon*traindata[case,-c(1:2)]) points(prototypes[closest,3],prototypes[closest,4],col="cornflowerblue") } predict.class <- knn(prototypes[,-c(1:2)], testdata[,-c(1:2)], prototypes[,2], k=1) test.error <- mean( predict.class != testdata[,2] ) test.error } LVQfixed(trndat,tstdat,100,0.5) # Above is test error rate for original data # (all variables). # *** It's a low rate compared to others # we've obtained with this data. ##### LVQfixed(trndat[,1:4],tstdat[,1:4],100,0.5) # Above is test error rate for original data # (first two variables). LVQfixed(sctrndat,sctstdat,100,0.5) # Above is test error rate for scaled data # (all variables). LVQfixed(sctrndat[,1:4],sctstdat[,1:4],100,0.5) # Above is test error rate for scaled data # (first two variables). LVQfixed(canvartrn,canvartst,100,0.5) # Above is test error rate for # all the canonical variables. LVQfixed(canvartrn[,1:4],canvartst[,1:4],100,0.5) # Above is test error rate for # first two canonical variables. # With each of the three data sets, using all of # the variables worked better than using just the # first two. The lowest rate was obtained using # the original (unscaled) data. ##### # Now I'll try changing the number of iterations # and the initial epsilon. LVQfixed(trndat,tstdat,400,0.5) LVQfixed(trndat,tstdat,100,0.75) # Best so far! LVQfixed(trndat,tstdat,400,0.75) # Better!! LVQfixed(trndat,tstdat,400,1) # I'll go back to epsilon = 0.75 and # increase the number of iterations. LVQfixed(trndat,tstdat,900,0.75) # I'll try it once more with the same settings. LVQfixed(trndat,tstdat,900,0.75) # I find it peculiar that the error rate is exactly the same. # I'll try it once more. LVQfixed(trndat,tstdat,900,0.75) # Based on the error rate we're getting with the initial # epsilon being 0.75 and using a large number of iterations, # I'd say this method seems pretty well suited for the data. # I just won't quit. LVQfixed(trndat,tstdat,1600,0.5) ########## Now I'll Quit ##########