# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2. # Note: Developed with vesion 2.9.0. # Note: I designed this so that it'll work well to paste the commands into R in chunks. # Each time, go down to right before a heading of the form # ### ********** # ### * Plot i * # ### ********** # starting with i = 2, and enter all of the commands up until that next plot. This # way you'll have the next time-consuming cross-validation runs started while you're # talking time to read the comments and study the plots. # I'll load some libraries and set the random number seed. library(class) library(MASS) library(e1071) library(kknn) library(klaR) library(rgl) set.seed(7) ####################################################### # Reading in Data and Creation of Canonical Variables # ####################################################### # I'll read in the training data and delete the unnecessary case id column. fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T) trndat <- fulltrndat[,-1] # I'll scale the data and create canonical variables. sctrn.vars <- scale(trndat[,-1], center=TRUE, scale=TRUE) sctrndat <- data.frame( cbind( trndat[,1], sctrn.vars ) ) names(sctrndat) <- names(trndat) scldafit <- lda(y~., sctrndat) canvar.trn <- as.matrix(sctrndat[,-1]) %*% scldafit$scaling canvartrn <- data.frame( cbind( trndat[,1], canvar.trn ) ) names(canvartrn) <- c("y","cv1","cv2","cv3","cv4","cv5","cv6","cv7","cv8","cv9","cv10") # Now read in test data and create canonical variables. fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T) tstdat <- fulltstdat[,-1] sctst.vars <- scale(tstdat[,-1],center=attr(sctrn.vars,"scaled:center"), scale=attr(sctrn.vars,"scaled:scale")) sctstdat <- data.frame( cbind(tstdat[,1], sctst.vars ) ) names(sctstdat) <- names(sctrndat) canvar.tst <- as.matrix(sctstdat[,-1]) %*% scldafit$scaling canvartst <- data.frame( cbind( tstdat[,1], canvar.tst ) ) names(canvartst) <- names(canvartrn) ################################## # *** possibly skip this chunk *** # Now for a 3-d plot, based on first 3 canonical variables, # that can be rotated. colorcycle <- c("black","purple","brown","plum","darkorange","dodgerblue","darkgray","goldenrod","slateblue", "tomato","limegreen") pointcolors=rep(colorcycle,48) open3d() spheres3d(canvartrn[,2:4], radius=.15, color=pointcolors) ################################## ######################################################################## # Classification using KNN (including weights and alternative metrics) # ######################################################################## # There are at least two R functions that do nearest neighbors classification, # knn # & kknn. # knn is basic "vanilla" nearest neighbors classification ... you just specify # the number of nearest neighbors that you want to use. But it does allow you # to give it either the original data, a scaled verseion of the data, or really # any transformation/version of the data that you want to give it. # kknn has a lot *more options*; not only can you alter k, but you can use different # metrics to determine the distances, and you can do weighted nearest neighbors, using # different weight functions that give larger votes to the nearer neighbors when # determining the predicted class using the k nearest neighbors. (The "rectangular" # weight function provides the option of giving equal weight to all k of the nearest # neighbors.) But kknn *always scales whatever data you give it* before determing the # distances (so if you give it both scaled and unscaled versions of the data you'll # get the same results (except for possibly ties being randomly broken in different # ways)). Usually it's good practice to scale the data before applying nearest # neighbors classification, but in some cases an unscaled version of the data might # be such that the most important variables for classification just happen to have # greater weight (and so classification performance could actually be better using # unscaled data). # There is also a train.kknn f'n that uses cross-validation on the training data # to try to determine what value works best for k. In addition to searching for # a good k, it can simultaneously search for a good metric and a good weight # function. # Since the blocked nature of the vowel data makes leave-one-out c-v inapproprite, # I'll use my own c-v routine, which leaves out one training set subject at a time # (creating an 8-fold c-v). I'll first write a f'n to do the desired c-v. cv.vowel <- function(data, k, dist, kern) # kern is for the weight function { errors <- rep(0, k) for (fold in 1:8) { block <- 66*(fold - 1) + c(1:66) for (i in 1:k) { out <- kknn(as.factor(y)~.,train=data[-block,],test=data[block,],k=i,distance=dist,kernel=kern) errors[i] <- errors[i] + sum(out$fit != data[block,1]) } } error.rate <- errors/528 error.rate } # I'll use the original unscaled data (knowing that kknn will scale it). # I'll first consider different metrics. err.35.h.rec <- cv.vowel(trndat, 35, 0.5, "rectangular") err.35.1.rec <- cv.vowel(trndat, 35, 1, "rectangular") err.25.2.rec <- cv.vowel(trndat, 25, 2, "rectangular") err.25.3.rec <- cv.vowel(trndat, 25, 3, "rectangular") ### ********** ### * Plot 1 * ### ********** plot(c(1:25),err.35.h.rec[1:25],type="l",col="darkviolet", ylim=c(0.45,0.62),main="C-V Results (rect. kernel, various metrics)",xlab="k",ylab="Misclass") points(c(1:25),err.35.1.rec[1:25],type="l",col="dodgerblue") points(c(1:25),err.25.2.rec,type="l",col="limegreen") points(c(1:25),err.25.3.rec,type="l",col="darkgoldenrod") legend(18,0.5,legend=c("L_3","L_2","L_1","L_0.5"), fill=c("darkgoldenrod","limegreen","dodgerblue","darkviolet")) # It appears L_1 distance with small k small will be best. # I'll now try different weight f'ns (kernels) with L_1 distance. err.35.1.tri <- cv.vowel(trndat, 35, 1, "triangular") err.35.1.biw <- cv.vowel(trndat, 35, 1, "biweight") err.35.1.gau <- cv.vowel(trndat, 35, 1, "gaussian") ### ********** ### * Plot 2 * ### ********** plot(c(1:35),err.35.1.rec,type="l",col="dodgerblue", ylim=c(0.435,0.6),main="C-V Results (L_1)",xlab="k",ylab="Misclass") points(c(1:35),err.35.1.tri,type="l",col="seagreen") points(c(1:35),err.35.1.biw,type="l",col="gold1") points(c(1:35),err.35.1.gau,type="l",col="darkorange") legend(1,0.6,legend=c("rectangular","triangular","biweight","gaussian"), fill=c("dodgerblue","seagreen","gold1","darkorange")) # The weighted nearest neighbors seems to work better # than ordinary (rectangular kernel) nearest neighbor. # One can also note that the biweight f'n is much less # sensitive to the choice of k than is the rectangular # f'n. This suggests that it won't matter so much if # the c-v doesn't identify the best choice for k. # Just to be thorough, I'll try weights with L_0.5 metric. err.35.h.tri <- cv.vowel(trndat, 35, 0.5, "triangular") err.35.h.biw <- cv.vowel(trndat, 35, 0.5, "biweight") err.35.h.gau <- cv.vowel(trndat, 35, 0.5, "gaussian") ### ********** ### * Plot 3 * ### ********** plot(c(1:35),err.35.h.rec,type="l",col="dodgerblue", ylim=c(0.45,0.6),main="C-V Results (L_0.5)",xlab="k",ylab="Misclass") points(c(1:35),err.35.h.tri,type="l",col="seagreen") points(c(1:35),err.35.h.biw,type="l",col="gold1") points(c(1:35),err.35.h.gau,type="l",col="darkorange") legend(1,0.6,legend=c("rectangular","triangular","biweight","gaussian"), fill=c("dodgerblue","seagreen","gold1","darkorange")) # It looks like the L_1 distance works better. (The lowest # estimated error rates were nearer 0.45 on the previous plot.) # Specifically, based on the last two plots, it appears L_1 # distance with the biweight kernel and k = 23 works the best. # Let's see how well the c-v results do in picking the actual # winner and estimating it's accuracy by looking at test set # prediction results. test.vowel <- function(train, test, k, dist, kern) { errors <- rep(0, k) for (i in 1:k) { out <- kknn(as.factor(y)~.,train=train,test=test,k=i,distance=dist,kernel=kern) errors[i] <- mean(out$fit != test[,1]) } errors } tsterr.35.1.tri <- test.vowel(trndat, tstdat, 35, 1, "triangular") tsterr.35.1.biw <- test.vowel(trndat, tstdat, 35, 1, "biweight") tsterr.35.2.rec <- test.vowel(trndat, tstdat, 35, 2, "rectangular") ### ********** ### * Plot 4 * ### ********** plot(c(1:35),err.35.1.tri,type="l",col="seagreen", ylim=c(0.44,0.56),main="C-V & Test Results (L_1 metric)",xlab="k",ylab="Misclass", sub="Note: Above 'ordinary' knn refers to the L_2 distance and the rectangular kernel.") points(c(1:35),tsterr.35.2.rec,type="l",col="skyblue1") points(c(1:35),tsterr.35.1.tri,type="l",col="limegreen") points(c(1:35),tsterr.35.1.biw,type="l",col="gold") points(c(1:35),err.35.1.biw,type="l",col="darkgoldenrod") legend(1,0.56,legend=c("triangular c-v","triangular test","biweight c-v","biweight test","'ordinary' knn"), fill=c("seagreen","limegreen","darkgoldenrod","gold","skyblue1")) # The L_1, biweight, and k=23 choice did okay, but overall # there isn't a strong correspondence between the c-v estimates # of the error rates and the error rates experienced with the # test data. ##################################################################### # Now I'll repeat the above process just using x.1 and x.2 (resetting # the random number seed in case I make changes above later). set.seed(12) errsm.45.h.rec <- cv.vowel(trndat[1:3], 45, 0.5, "rectangular") errsm.45.1.rec <- cv.vowel(trndat[1:3], 45, 1, "rectangular") errsm.45.2.rec <- cv.vowel(trndat[1:3], 45, 2, "rectangular") errsm.85.3.rec <- cv.vowel(trndat[1:3], 85, 3, "rectangular") ### ********** ### * Plot 5 * ### ********** plot(c(1:45),errsm.45.h.rec,type="l",col="darkviolet", ylim=c(0.46,0.6),main="C-V Results (rect. kernel, various metrics)",xlab="k",ylab="Misclass") points(c(1:45),errsm.45.1.rec,type="l",col="dodgerblue") points(c(1:45),errsm.45.2.rec,type="l",col="limegreen") points(c(1:45),errsm.85.3.rec[1:45],type="l",col="darkgoldenrod") legend(1,0.5,legend=c("L_3","L_2","L_1","L_0.5"), fill=c("darkgoldenrod","limegreen","dodgerblue","darkviolet")) # This time L_3 looks like best choice, with L_2 second best. # I'll try different weight f'ns (kernels) with L_3 distance. errsm.45.3.tri <- cv.vowel(trndat[1:3], 45, 3, "triangular") errsm.45.3.biw <- cv.vowel(trndat[1:3], 45, 3, "biweight") errsm.45.3.gau <- cv.vowel(trndat[1:3], 45, 3, "gaussian") ### ********** ### * Plot 6 * ### ********** plot(c(1:45),errsm.85.3.rec[1:45],type="l",col="dodgerblue", ylim=c(0.49,0.595),main="C-V Results (L_3)",xlab="k",ylab="Misclass") points(c(1:45),errsm.45.3.tri,type="l",col="seagreen") points(c(1:45),errsm.45.3.biw,type="l",col="gold1") points(c(1:45),errsm.45.3.gau,type="l",col="darkorange") legend(32,0.595,legend=c("rectangular","triangular","biweight","gaussian"), fill=c("dodgerblue","seagreen","gold1","darkorange")) # It looks like the rectangular kernel, with k somewhere # in the 20s might be best. # Just to check more thorughly, let's try some more metrics with # the rectangular kernel. errsm.85.2h.rec <- cv.vowel(trndat[1:3], 85, 2.5, "rectangular") errsm.35.3h.rec <- cv.vowel(trndat[1:3], 35, 3.5, "rectangular") errsm.35.4.rec <- cv.vowel(trndat[1:3], 35, 4, "rectangular") ### ********** ### * Plot 7 * ### ********** plot(c(1:35),errsm.85.2h.rec[1:35],type="l",col="darkviolet", ylim=c(0.49,0.6),main="C-V Results (rect. kernel, various metrics)",xlab="k",ylab="Misclass") points(c(1:35),errsm.85.3.rec[1:35],type="l",col="dodgerblue") points(c(1:35),errsm.35.3h.rec,type="l",col="limegreen") points(c(1:35),errsm.35.4.rec,type="l",col="darkgoldenrod") legend(27.5,0.6,legend=c("L_2.5","L_3","L_3.5","L_4"), fill=c("darkviolet","dodgerblue","limegreen","darkgoldenrod")) # In the neighborhood considered, the exact choice of metric # doesn't seem that crucial. # Let's see how well the c-v results estimate the test sample # error rates, and see if one of the alternative metrics beats # out the usual L_2 choice. tsterrsm.85.2h.rec <- test.vowel(trndat[1:3], tstdat[1:3], 85, 2.5, "rectangular") tsterrsm.85.3.rec <- test.vowel(trndat[1:3], tstdat[1:3], 85, 3, "rectangular") tsterrsm.85.2.rec <- test.vowel(trndat[1:3], tstdat[1:3], 85, 2, "rectangular") ### ********** ### * Plot 8 * ### ********** plot(c(15:85),errsm.85.2h.rec[15:85],type="l",col="seagreen", ylim=c(0.49,0.57),main="C-V & Test Results (rectangular kernel)", xlab="k",ylab="Misclass") points(c(15:85),tsterrsm.85.2.rec[15:85],type="l",col="skyblue1") points(c(15:85),tsterrsm.85.2h.rec[15:85],type="l",col="limegreen") points(c(15:85),tsterrsm.85.3.rec[15:85],type="l",col="gold") points(c(15:85),errsm.85.3.rec[15:85],type="l",col="darkgoldenrod") legend(52.5,0.57,legend=c("L_2 test","L_2.5 test","L_3 test","L_2.5 c-v","L_3 c-v"), fill=c("skyblue1","limegreen","gold","seagreen","darkgoldenrod")) # We can see that c-v led us astray by indicating too small # a value for k. It could be that the main problem is that # the sample sizes are inadequate to insure decent performance # with high probability. (Recall that only 8 subjects contributed # to the training sample, and only 7 to the test sample. The c-v # results might be a lot less misleading if we had 10 times as many # subjects since two small samples from the same population are more # likely to be appreciably different than two larger samples are.) # We can also see that on the test set it doesn't matter a lot if one # uses L_2, L_2.5, or L_3 ... and that whatever choice is made the # test sample error rate isn't good compared to what other classifiers # yield. ############################################################# # Now I'll use the first two canonical variables. Since kknn # scales the predictors, the 2nd c.v. will now get more weight # than it did when the knn f'n (which doesn't scale) was used, # because the 1st c.v. has a larger variance than the 2nd c.v. # (as can be seen below), and so it will be downweighted more # by scaling. set.seed(12) # variance of 1st c.v.: var(canvartrn[,2]) # variance of 2nd c.v.: var(canvartrn[,3]) cverr.115.h.rec <- cv.vowel(canvartrn[1:3], 115, 0.5, "rectangular") cverr.115.1.rec <- cv.vowel(canvartrn[1:3], 115, 1, "rectangular") cverr.115.2.rec <- cv.vowel(canvartrn[1:3], 115, 2, "rectangular") cverr.115.3.rec <- cv.vowel(canvartrn[1:3], 115, 3, "rectangular") ### ********** ### * Plot 9 * ### ********** plot(c(1:115),cverr.115.h.rec,type="l",col="darkviolet", ylim=c(0.37,0.54),main="C-V Results (rect. kernel, various metrics)",xlab="k",ylab="Misclass") points(c(1:115),cverr.115.1.rec,type="l",col="dodgerblue") points(c(1:115),cverr.115.2.rec,type="l",col="limegreen") points(c(1:115),cverr.115.3.rec,type="l",col="darkgoldenrod") legend(85,0.54,legend=c("L_3","L_2","L_1","L_0.5"), fill=c("darkgoldenrod","limegreen","dodgerblue","darkviolet")) # L_1 with k around the mid 30s looks good. But the estimated error # rate for L_2 and L_3 with larger k (50s and 60s) are also good for # this data set. # I'll now try different weight f'ns (kernels) with L_1 distance. cverr.115.1.tri <- cv.vowel(canvartrn[1:3], 115, 1, "triangular") cverr.115.1.biw <- cv.vowel(canvartrn[1:3], 115, 1, "biweight") cverr.115.1.gau <- cv.vowel(canvartrn[1:3], 115, 1, "gaussian") ### *********** ### * Plot 10 * ### *********** plot(c(1:115),cverr.115.1.rec,type="l",col="dodgerblue", ylim=c(0.37,0.55),main="C-V Results (L_1)",xlab="k",ylab="Misclass") points(c(1:115),cverr.115.1.tri,type="l",col="seagreen") points(c(1:115),cverr.115.1.biw,type="l",col="gold1") points(c(1:115),cverr.115.1.gau,type="l",col="darkorange") legend(80,0.55,legend=c("rectangular","triangular","biweight","gaussian"), fill=c("dodgerblue","seagreen","gold1","darkorange")) which.min(cverr.115.1.rec) cverr.115.1.rec[which.min(cverr.115.1.rec)] # k = 36, L_1, and rectangular gives lowest value, but # notice how the triangular kernel seems to keep getting # better as k increases. (With the triangular kernel, as # one increases k, the weight given to the newly added # "voting neighbors" is small and the heavy "voters" are # of course voting as they did for smaller k. So it tends # to take fairly large increases of k to have an appreciable # effect.) # I'll now try different weight f'ns (kernels) with L_3 distance. cverr.115.3.tri <- cv.vowel(canvartrn[1:3], 115, 3, "triangular") cverr.115.3.biw <- cv.vowel(canvartrn[1:3], 115, 3, "biweight") cverr.115.3.gau <- cv.vowel(canvartrn[1:3], 115, 3, "gaussian") ### *********** ### * Plot 11 * ### *********** plot(c(1:115),cverr.115.3.rec,type="l",col="dodgerblue", ylim=c(0.37,0.55),main="C-V Results (L_3)",xlab="k",ylab="Misclass") points(c(1:115),cverr.115.3.tri,type="l",col="seagreen") points(c(1:115),cverr.115.3.biw,type="l",col="gold1") points(c(1:115),cverr.115.3.gau,type="l",col="darkorange") legend(80,0.55,legend=c("rectangular","triangular","biweight","gaussian"), fill=c("dodgerblue","seagreen","gold1","darkorange")) which.min(cverr.115.3.tri) # The rectangular kernel looks best, but again the # triangular is interesting. # Let's look at some test sample results for L_3. tstcverr.115.3.rec <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 3, "rectangular") tstcverr.115.3.tri <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 3, "triangular") tstcverr.115.2.rec <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 2, "rectangular") ### *********** ### * Plot 12 * ### *********** plot(c(25:115),cverr.115.3.tri[25:115],type="l",col="seagreen", ylim=c(0.385,0.5),main="C-V & Test Results (L_3 metric)",xlab="k",ylab="Misclass") points(c(25:115),tstcverr.115.2.rec[25:115],type="l",col="skyblue1") points(c(25:115),tstcverr.115.3.tri[25:115],type="l",col="limegreen") points(c(25:115),tstcverr.115.3.rec[25:115],type="l",col="gold") points(c(25:115),cverr.115.3.rec[25:115],type="l",col="darkgoldenrod") legend(53,0.45,legend=c("L2 rec test","tri test","rect test","tri c-v","rect c-v"), fill=c("skyblue1","limegreen","gold","seagreen","darkgoldenrod")) # The c-v estimates of the error rates are overoptimistic. # The L_3, triangular, and large k combination works pretty # good (compared to other classification methods), with test # sample error rates close to 44 and 45 percent. (Note: The # sky blue rates on the plot are test sample results for ordinary # knn (i.e., rectangular kernel and L_2 metric). THe other rates # are all for L_3.) # And now some test sample results for L_1. tstcverr.115.1.rec <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 1, "rectangular") tstcverr.115.1.tri <- test.vowel(canvartrn[1:3], canvartst[1:3], 115, 1, "triangular") ### *********** ### * Plot 13 * ### *********** plot(c(25:115),cverr.115.1.tri[25:115],type="l",col="seagreen", ylim=c(0.375,0.5),main="C-V & Test Results (L_1 metric)",xlab="k",ylab="Misclass") points(c(25:115),tstcverr.115.2.rec[25:115],type="l",col="skyblue1") points(c(25:115),tstcverr.115.1.tri[25:115],type="l",col="limegreen") points(c(25:115),tstcverr.115.1.rec[25:115],type="l",col="gold") points(c(25:115),cverr.115.1.rec[25:115],type="l",col="darkgoldenrod") legend(44,0.456,legend=c("L2 rec test","tri test","rect test","tri c-v","rect c-v"), fill=c("skyblue1","limegreen","gold","seagreen","darkgoldenrod")) # Once again, the c-v estimates of error are too low. # Lessons learned: # (1) Cross-validation estimates based on inadequate data # can be misleading; # (2) When working with weighted KNN classification, consider # some large enough values of k in order to make sure the # error rate doesn't keep getting lower as k is increased # past where a first impression might suggest. ##### TTFN #####