# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2. # Note: Developed with vesion 2.9.0. # Note: I designed this so that it'll work well to paste the commands into R in chunks. # Each time, go down to the next line with just ##### on it and paste all of the # commands in at one time. Then after the commands are executed, look at the # comments and output and look at the graph before pasting in the next chunk of # commands. (If you paste everything in all at once you won't have much time to # look at the graphics.) # First I'll load some libraries that may be needed. library(class) library(MASS) library(e1071) library(kknn) library(klaR) # I'll also set the random number seed since the c-v routines use random numbers # and having a fixed seed tends to make trouble-shooting easier. set.seed(7) # Next I'll read in the training data. fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T) # Since I don't see a real need for the ID variable in the first column, I'll delete it. trndat <- fulltrndat[,-1] # Let's look at the data, just using first two predictor variables. ind1 <- 1 + 11*c(0:47) ind2 <- 2 + 11*c(0:47) ind3 <- 3 + 11*c(0:47) ind4 <- 4 + 11*c(0:47) ind5 <- 5 + 11*c(0:47) ind6 <- 6 + 11*c(0:47) ind7 <- 7 + 11*c(0:47) ind8 <- 8 + 11*c(0:47) ind9 <- 9 + 11*c(0:47) ind10 <- 10 + 11*c(0:47) ind11 <- 11*c(1:48) plot(trndat[ind1,2],trndat[ind1,3],col="violet",main='2-D View',xlab='x.1',ylab='x.2', xlim=c(-6,-0.5),ylim=c(-2,5.75)) points(trndat[ind2,2], trndat[ind2,3], col="cyan") points(trndat[ind3,2], trndat[ind3,3], col="darkorange") points(trndat[ind4,2], trndat[ind4,3], col="olivedrab") points(trndat[ind5,2], trndat[ind5,3], col="dodgerblue") points(trndat[ind6,2], trndat[ind6,3], col="deeppink") points(trndat[ind7,2], trndat[ind7,3], col="sienna") points(trndat[ind8,2], trndat[ind8,3], col="purple3") points(trndat[ind9,2], trndat[ind9,3], col="yellow") points(trndat[ind10,2], trndat[ind10,3], col="chartreuse") points(trndat[ind11,2], trndat[ind11,3], col="darkslategray") legend(-1.2,6,legend=c("1","2","3","4","5","6","7","8","9","10","11"), fill=c("violet","cyan","darkorange","olivedrab","dodgerblue","deeppink", "sienna","purple3","yellow","chartreuse","darkslategray")) # Based on this plot, one might guess that curved decision boundaries may # be better than linear ones. ##### # To get another 2-d view, let's plot the 1st and 2nd principal components. # (I was trying to get something like Fig. 4.4 on p. 107 of HTF2, but they # used some other variables to get their 2-d plot (see p. 114 of HTF2).) pc.trn <- princomp(~ ., data=trndat[,-1], cor=T) summary(pc.trn) plot(pc.trn) loadings(pc.trn) pcomp.trn <- data.frame( cbind(trndat[,1],pc.trn$score) ) names(pcomp.trn) <- c("y","pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9","pc10") ##### plot(pcomp.trn[ind1,2],pcomp.trn[ind1,3],col="violet",main='2-D View',xlab='pc1',ylab='pc2', xlim=c(-4.5,5.5),ylim=c(-3.9,4.4)) points(pcomp.trn[ind2,2], pcomp.trn[ind2,3], col="cyan") points(pcomp.trn[ind3,2], pcomp.trn[ind3,3], col="darkorange") points(pcomp.trn[ind4,2], pcomp.trn[ind4,3], col="olivedrab") points(pcomp.trn[ind5,2], pcomp.trn[ind5,3], col="dodgerblue") points(pcomp.trn[ind6,2], pcomp.trn[ind6,3], col="deeppink") points(pcomp.trn[ind7,2], pcomp.trn[ind7,3], col="sienna") points(pcomp.trn[ind8,2], pcomp.trn[ind8,3], col="purple3") points(pcomp.trn[ind9,2], pcomp.trn[ind9,3], col="yellow") points(pcomp.trn[ind10,2], pcomp.trn[ind10,3], col="chartreuse") points(pcomp.trn[ind11,2], pcomp.trn[ind11,3], col="darkslategray") legend(4,4.65,legend=c("1","2","3","4","5","6","7","8","9","10","11"), fill=c("violet","cyan","darkorange","olivedrab","dodgerblue","deeppink", "sienna","purple3","yellow","chartreuse","darkslategray")) # There appears to be a lot of overlap. A similar plot of the 1st and 3rd pcs didn't look any better. ##### # First let's try LDA (after getting the test set ready). fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T) tstdat <- fulltstdat[,-1] trnlda <- lda(y ~ . , trndat) pred.trnlda <- predict(trnlda, newdata=tstdat) lda.rate <- mean( pred.trnlda$class != tstdat[,1] ) lda.rate # From the description of the datafound on the authors' web site, it seems # like the best neural network classifiers get about 45% wrong and that # using one nearest neighbor gets about 44% wrong. So LDA didn't do that badly. # Since a plot of just the first two variables looked somewhat promising, I'm # curious as to what will happen if just they are used. trnlda <- lda(y ~ . , trndat[,1:3]) pred.trnlda <- predict(trnlda, newdata=tstdat) lda.rate <- mean( pred.trnlda$class != tstdat[,1] ) lda.rate # Better! But perhaps other variables would do better still. # Let's see what stepwise LDA does. trnswlda <- stepclass(y ~ ., data=trndat, method="lda") trnswlda # It picks the first two variables (the same ones I had tried). # The output indicates a c-v estimate of the error rate is about # 54%. But we know from before the test data resulted in only # about 48% errors. ##### # Now I'll try stepwise LDA with the principal components. trnswldapc <- stepclass(y ~ ., data=pcomp.trn, method="lda") trnswldapc # The indication is that using just the 2nd, 4th, and 7th pcs # is best. Let's see how that does. I need to first set up # the test data for principal components. pcomp.tst <- predict(pc.trn, newdata=tstdat[,-1]) pcomp.tst <- data.frame( cbind(tstdat[,1],pcomp.tst) ) names(pcomp.tst) <- c("y","pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9","pc10") trnswldapc <- lda(y ~ pc2 + pc4 + pc7, pcomp.trn) pred.trnlda <- predict(trnswldapc, newdata=pcomp.tst) lda.rate <- mean( pred.trnlda$class != tstdat[,1] ) lda.rate # A big disappointment. ##### # Let's try QDA. trnqda <- qda(y ~ . , trndat) pred.trnqda <- predict(trnqda, newdata=tstdat) qda.rate <- mean( pred.trnqda$class != tstdat[,1] ) qda.rate # (Note: Both my LDA and QDA rates match those given for the test data in # Table 4.1 on p. 107 of HTF2.) # Since a plot of just the first two variables looked somewhat promising, # and these variables were choen using stepwise LDA, I'm curious as to what # will happen if just they are used with QDA. trnqda <- qda(y ~ . , trndat[,1:3]) pred.trnqda <- predict(trnqda, newdata=tstdat) qda.rate <- mean( pred.trnqda$class != tstdat[,1] ) qda.rate # A tad better. Perhaps other variables would do better still. # Let's see what stepwise QDA does. trnswqda <- stepclass(y ~ ., data=trndat, method="qda") trnswqda # The indication is that we may get great results using # QDA with just the 1st, 2nd, 3rd, 8th, and 9th predictors. # Let's see is this works well for predicting the classes # for the test set. trnqda <- qda(y ~ x.1 + x.2 + x.3 + x.8 + x.9, trndat) pred.trnqda <- predict(trnqda, newdata=tstdat) qda.rate <- mean( pred.trnqda$class != tstdat[,1] ) qda.rate # The c-v indication of only about 19% errors was way too high. # I'm curious as to what will happen if I just use three # of the predictors. trnqda <- qda(y ~ x.1 + x.2 + x.8, trndat) pred.trnqda <- predict(trnqda, newdata=tstdat) qda.rate <- mean( pred.trnqda$class != tstdat[,1] ) qda.rate # Not as good. # By accident (really) I stumbled upon a way to get a 44% # error rate (which is great for this data set). trnqda <- qda(y ~ x.1 + x.2 + x.3 + x.4 + x.8, trndat) pred.trnqda <- predict(trnqda, newdata=tstdat) qda.rate <- mean( pred.trnqda$class != tstdat[,1] ) qda.rate # This is the best result so far (but it's not really legit ... # since if I didn't have the test set to confirm that this # works best, I wouldn't know or suspect that it does). ##### # Let's try RDA. I'll try it four times in a row to see # how stable the fitting of the parameters seems to be. # (The c-v divisions will be different each time.) trnrda <- rda(y~., trndat, crossval=TRUE, fold=10) trnrda pred.trnrda <- predict(trnrda, newdata=tstdat) rda.rate <- mean( pred.trnrda$class != tstdat[,1] ) rda.rate # The first parameters (both lambda and gamma are near 0) # result in RDA being similar to QDA. I tried it three # more times and got very similar results each time. # Note: With the parametrization on p. 112 of HTF2, both parameters # being 0 makes RDA equivalent to DDA (diagonal discriminant analysis), # but the R function is different, and the two parameters being 0 makes # it equivalent to QDA. But with this data, having the parameters not # exactly 0 makes RDA perform better than both QDA and LDA on the test # set. Using all of the variables, # 0.556 is the test error for LDA, # 0.528 is the test error for QDA, # & 0.487 is the test error for RDA. # So now I'll try it using just the variables that stepwise LDA # found to be the most useful (since stepwise LDA did better than # stepwise QDA). trnrda <- rda(y~., trndat[,1:3], crossval=TRUE, fold=10) trnrda pred.trnrda <- predict(trnrda, newdata=tstdat) rda.rate <- mean( pred.trnrda$class != tstdat[,1] ) rda.rate # I'll now try it with the top two variables from both LDA and QDA # (x.1 and x.2), along with the 3rd variable chosen for QDA (x.8). trnrda <- rda(y~x.1+x.2+x.8, trndat, crossval=TRUE, fold=10) trnrda pred.trnrda <- predict(trnrda, newdata=tstdat) rda.rate <- mean( pred.trnrda$class != tstdat[,1] ) rda.rate # Neither of these subsets worked better than using all of the variables. # I decided to see if the rda function would work with the stepclass function. # It did, but it took a *very* long time (due to nested cross-validation). So # I won't do it again here. Instead I'll report that the variables selected, # in order, were # x.2, x.1, x.8, x.3, and x.4. # Below, I'll use the rda function with these variables. trnrda <- rda(y~x.1+x.2+x.3+x.4+x.8, trndat, crossval=TRUE, fold=10) trnrda pred.trnrda <- predict(trnrda, newdata=tstdat) rda.rate <- mean( pred.trnrda$class != tstdat[,1] ) rda.rate # A great result! Below is a summary of the results using the variables # indicated by stepclass, for LDA, QDA, and RDA: # 0.481 is the test error for LDA, # 0.498 is the test error for QDA, # 0.448 is the test error for RDA. # Note: With the variables selected by stepclass for RDA, on another run of the # rda function I got a test error of only 0.392. (Since rda uses c-v, the results # can differ due to a different starting point in the random number sequence.) # I think the parameter values that led to the 0.392 error rate were about 0.09 and # 0.01, so I'll fix these values and try the rda function again. trnrda <- rda(y~x.1+x.2+x.3+x.4+x.8, trndat, gamma=0.09, lambda=0.01, fold=10) trnrda pred.trnrda <- predict(trnrda, newdata=tstdat) rda.rate <- mean( pred.trnrda$class != tstdat[,1] ) rda.rate # This is the best result so far. Once again we see that it's not wise to just run # rda one time, since c-v may lead to parameter values which can be appreciably # improved upon. ##### # Now let's try using nearest neighbors classification. I'll use # c-v to select a value of k for KNN. cvknn <- train.kknn(as.factor(y) ~ ., trndat, kmax=25, kernel="rectangular", distance=2) cvknn$MISCLASS summary(cvknn) plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass") ##### # So let's see how using just one nearest neighbor does on the test data. pred.1nn <- knn( trndat[,-1], tstdat[,-1], cl=trndat[,1], k=1 ) onenn.rate <- mean( pred.1nn != tstdat[,1] ) onenn.rate # Except for that one rda result (that took a lot of time to obtain, since # using the stepclass function with the rda function took a long time, and # even after stepclass indicated which variables to use, it may take several # runs of the rda function to find good parameter values), this is the best # result so far. # Let's check to see if the c-v routine actually found the best value for k. knn.rate <- numeric(25) for (i in 1:25) { pred.knn <- knn(trndat[,-1], tstdat[,-1], cl=trndat[,1], k=i) knn.rate[i] <- mean(pred.knn != tstdat[,1]) } points(c(1:25),knn.rate,type="l",col="sienna") legend(1,0.4,legend=c("C-V","Observed"),fill=c("blue","sienna")) # Although some of the numerical values of the c-v estimates or error are way off # (too low for the smaller values of k), this time c-v did indicate a decent value # to use for k. # One might wonder why one nearest neighbor appears to work so much better based on # the training data. Well, data was collected from 8 subjects for the training set, # and 7 different subjects for the test set. With the training data, the nearest # neighbor to a case might be another case from the same person. (Looking at plots # of the data does suggest clumping ... possibly due to data from the same subject.) # When the test points are classified using neighbors from the training set, none of # the neighbors can be cases from the same subject, and I suspect the nearest neighbors # are farther away than when classifying training cases using the training data in a c-v # scheme. All in all, it's harder to correctly classify a test case using the training # data than it is to classify a training case using the training data. ##### # Now I'll try nearest neighbors just using x.1 and x.2. cvknn <- train.kknn(as.factor(y) ~ x.1 + x.2, trndat, kmax=75, kernel="rectangular", distance=2) cvknn$MISCLASS summary(cvknn) plot(c(1:75), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass", ylim=c(0.3,0.6)) # Looks worse than before ... but could be better if the c-v estimates of error are accurate. ##### # So let's see how using seven nearest neighbors does on the test data. pred.1nn <- knn( trndat[,2:3], tstdat[,2:3], cl=trndat[,1], k=7 ) sevennn.rate <- mean( pred.1nn != tstdat[,1] ) sevennn.rate # No good. # I'm really interested in how some sort of variable selection can be # using nearest neighbors. Ch. 13 of HTF2 covers some methods related # to this idea. # Let's see if c-v pointed to the best value of k. knn.rate <- numeric(75) for (i in 1:75) { pred.knn <- knn(trndat[,2:3], tstdat[,2:3], cl=trndat[,1], k=i) knn.rate[i] <- mean(pred.knn != tstdat[,1]) } points(c(1:75),knn.rate,type="l",col="sienna") legend(1,0.525,legend=c("Observed","C-V"),fill=c("sienna","blue")) # We can see that larger values for k do better than 7, but none of the # possibilities considered result in an error rate less than 50%. # Once again, by accident, I stumbled upon a better error rate. # If 7 nearest neighbors are used with all of the data, the error # rate drops below 40%. pred.1nn <- knn( trndat[,-1], tstdat[,-1], cl=trndat[,1], k=7 ) sevennn.rate <- mean( pred.1nn != tstdat[,1] ) sevennn.rate # Summary of best test sample error rates: # # 0.481 (LDA using just x.1 and x.2 (var's sel. by c-v)), # 0.498 (QDA using just x.1, x.2, x.3, x.8, and x.9 (var's sel. by c-v)), # 0.442 (QDA using just x.1, x.2, x.3, x.4, and x.8 (found by accident)), # 0.487 (RDA using all variables), # 0.498 (RDA using just x.1 and x.2 (Var's sel. by c-v)), # 0.448 (RDA using just x.1, x.2, x.3, x.4, and x.8 (var's sel. by lengthy c-v)), *** # 0.392 (RDA using just x.1, x.2, x.3, x.4, and x.8 (var's sel. by lengthy c-v)), *** # 0.437 (1NN using all variables (k found using c-v)), # 0.394 (7NN using all variables (k found by accident)). # # (Note: The two RDA results marked with *** differ due to different c-v folds, due # to different random number streams. Although the values differ, both are good.) ##### THE END (for now) #####