# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2. # Note: Developed with vesion 2.9.0. # Note: I designed this so that it'll work well to paste the commands into R in chunks. # Each time, go down to the next line with just ##### on it and paste all of the # commands in at one time. Then after the commands are executed, look at the # comments and output and look at the graph before pasting in the next chunk of # commands. (If you paste everything in all at once you won't have much time to # look at the graphics.) # First I'll load some libraries that may be needed. library(class) library(MASS) library(e1071) library(kknn) library(klaR) # I'll also set the random number seed since the c-v routines use random numbers # and having a fixed seed tends to make trouble-shooting easier. set.seed(7) # Next I'll read in the training data. fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T) # Since I don't see a real need for the ID variable in the first column, I'll delete it # and then print out the first 25 rows of the data set. trndat <- fulltrndat[,-1] trndat[1:25,] # Instead of starting with all 11 classes, I'll first work with a reduced data set just # using the cases for the first three classes. Since every 11th case is a class 1 case, # we can pick them out using the following index. ind1 <- 1 + 11*c(0:47) # The next two indices will pick out the class 2 and class 3 cases, and the last index will # pick out all of the class 1, class 2, and class 3 cases. ind2 <- 2 + 11*c(0:47) ind3 <- 3 + 11*c(0:47) index <- c(ind1,ind2,ind3) index # Now I'll use the index to create the desired reduced data set, having 48 class 1 # cases followed by 48 class 2 cases, followed by 48 class 3 cases. trndat <- trndat[index,] trndat # Let's look at the data, just using first two predictor variables. plot(trndat[1:48,2], trndat[1:48,3], col="darkslategray",main='2-D View of Classes 1-3', xlab='x.1',ylab='x.2') points(trndat[49:96,2], trndat[49:96,3], col="darkorange") points(trndat[97:144,2], trndat[97:144,3], col="darkcyan") legend(-2.5,2.25,legend=c("Class 1","Class 2","Class 3"),fill=c("darkslategray","darkorange","darkcyan")) # Class 1 seems easily separable from classes 2 and 3 if we can get a curved boundary from a # classifier, but using just these two variables it's not clear that Classes 2 and 3 will be # as easy to separate. # To make things more interesting while playing with just two classes for a while, let's focus # on classes 2 and 3. (I'll further reduce the data set to now just have the class 2 and 3 cases.) # I'll also create the corresponding test set for these classes. trndat <- trndat[49:144,] fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T) tstdat <- fulltstdat[,-1] ind2 <- 2 + 11*c(0:41) ind3 <- 3 + 11*c(0:41) index <- c(ind2,ind3) tstdat <- tstdat[index,] ##### # First let's try LDA. (Note: With just two classes, and equal sample sizes, lines 2-6 on # p. 110 of HTF2 indicate that the regression scheme considered in Ch. 2 (and again in Sec. # 4.2) is equivalent to LDA.) trnlda <- lda(y ~ . , trndat) trnlda pred.trnlda <- predict(trnlda, newdata=tstdat) lda.rate <- mean( pred.trnlda$class != tstdat[,1] ) lda.rate # Since we have less than 5 cases per variable for each class, # it could be that we're trying to estimate too much with too # little and performance is suffering. Let's see what happens # if we just use two principal component variables as predictors. pc.trn <- princomp(~ ., data=trndat[,-1], cor=T) summary(pc.trn) plot(pc.trn) loadings(pc.trn) # The p.c. variable values from the training data are in # pc.trn$scores # (we could also get them using predict). Based on # the plot, I'll just keep the first three of the # new variables. I'll bind these together with # the class variable. pcomp.trn <- data.frame( cbind(trndat[,1],pc.trn$score[,1:3]) ) names(pcomp.trn) <- c("y","pc1","pc2","pc3") ##### # Now I'll try LDA using these variables as predictors. trnldapc <- lda(y ~ . , pcomp.trn) trnldapc # To see what performance we get with the test set, the # test set variables have to be altered the same way # (i.e., using the loadings determined from the training # data). pcomp.tst <- predict(pc.trn, newdata=tstdat[,-1]) pcomp.tst <- data.frame( cbind(tstdat[,1],pcomp.tst[,1:3]) ) names(pcomp.tst) <- c("y","pc1","pc2","pc3") pred.trnldapc <- predict(trnldapc, newdata=pcomp.tst) ldapc.rate <- mean( pred.trnldapc$class != tstdat[,1] ) ldapc.rate # This made things worse! Nevertheless, I want to look at a scatterplot. plot(pcomp.tst[1:48,2], pcomp.tst[1:48,4], col="darkorange",main='2-D View of Classes 2 & 3', xlab='pc1',ylab='pc3') points(pcomp.tst[49:96,2], pcomp.tst[49:96,4], col="darkcyan") legend(-4,-1.5,legend=c("Class 2","Class 3"),fill=c("darkorange","darkcyan")) # We have decent separation of the two classes. # The trouble is that a linear boundary isn't good! ##### # I'll use 2nd-order variables as additional predictors. # I'll just use the 1st and 3rd principal component # variables as a starting point. pc11 <- pcomp.trn[,2]^2 pc33 <- pcomp.trn[,4]^2 pc13 <- pcomp.trn[,2]*pcomp.trn[,4] pc2.trn <- data.frame(cbind(pcomp.trn,pc11,pc33,pc13)) # Note: I've got a 1st-order term for the 2nd pc, but # no 2nd-order terms involving the 2nd pc. # Now I'll fix the test data similarly. pc11 <- pcomp.tst[,2]^2 pc33 <- pcomp.tst[,4]^2 pc13 <- pcomp.tst[,2]*pcomp.tst[,4] pc2.tst <- data.frame(cbind(pcomp.tst,pc11,pc33,pc13)) # Now I'll try LDA on this data set. trnldapc2 <- lda(y ~ . , pc2.trn) pred.trnldapc2 <- predict(trnldapc2, newdata=pc2.tst) ldapc2.rate <- mean( pred.trnldapc2$class != tstdat[,1] ) ldapc2.rate # It helped quite a bit, but the original 10 variables # worked better. ##### # Now I'll try QDA, first with the first three principal components. trnqdapc <- qda(y ~ . , pcomp.trn) pred.trnqdapc <- predict(trnqdapc, newdata=pcomp.tst) qdapc.rate <- mean( pred.trnqdapc$class != tstdat[,1] ) qdapc.rate # Yuck! Now I'll just use the 1st and 3rd pcs. trnqdapc <- qda(y ~ pc1 + pc3 , pcomp.trn) pred.trnqdapc <- predict(trnqdapc, newdata=pcomp.tst) qdapc.rate <- mean( pred.trnqdapc$class != tstdat[,1] ) qdapc.rate # What about the original data? trnqda <- qda(y ~ . , trndat) pred.trnqda <- predict(trnqda, newdata=tstdat) qda.rate <- mean( pred.trnqda$class != tstdat[,1] ) qda.rate # So far LDA wih the original data has worked best. # Now I'll try RDA. The R f'n has a gamma parameter and a lambda # parameter which are different from the alpha and gamma parameters # given on p. 112 of HTF2. C-v can be used to (hopefully) determine # decent values for these parameters. (Enter ?rda to get the help # page that describes the R version of RDA. It seems to me that if # the predictors aren't standardized, the gamma parameter in the R # version could do odd things!) trnrda <- rda(y~., trndat, crossval=TRUE, fold=10) trnrda pred.trnrda <- predict(trnrda, newdata=tstdat) rda.rate <- mean( pred.trnrda$class != tstdat[,1] ) rda.rate # Wow! Some of this fancy s**t actually works (we got a lower rate) # ... or does it? The first time I tried it I got an error rate of # about 9.5% ... about half that of the 19% that was the previous # champion's rate. But I didn't have the random number seed fixed # in advance. When I tried it again, using a variety of seeds, I got # rates larger than 30% a bunch of times. I noticed that the parameter # values were bouncing around A LOT! Fortunately I remembered what # ballpark the values were in that produced the 9.5%. # Next I'll do the c-v thing again to show how unstable the results are. # Then I'll set the parameter values close to what I noticed worked well # on one run. trnrda <- rda(y~., trndat, crossval=TRUE, fold=10) trnrda pred.trnrda <- predict(trnrda, newdata=tstdat) rda.rate <- mean( pred.trnrda$class != tstdat[,1] ) rda.rate # C-v choose parameter values that made RDA behave similarly to QDA. trnrda <- rda(y~., trndat, lambda=0.7, gamma=0.2) trnrda pred.trnrda <- predict(trnrda, newdata=tstdat) rda.rate <- mean( pred.trnrda$class != tstdat[,1] ) rda.rate # The moral of the story is that you shouldn't always trust c-v to # find the best rate. Try it a few times with different seeds. # If the results vary a lot and the parameter values bounce around # a lot, it may be worthwhile to experiment with various parameter # values on your own and search for some good ones. ##### # Now let's try using nearest neighbors classification. I'll use # c-v to select a value of k for KNN. cvknn <- train.kknn(as.factor(y) ~ ., trndat, kmax=25, kernel="rectangular", distance=2) cvknn$MISCLASS summary(cvknn) plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass") # So let's see how using just one nearest neighbor does on the test data. pred.1nn <- knn( trndat[,-1], tstdat[,-1], cl=trndat[,1], k=1 ) onenn.rate <- mean( pred.1nn != tstdat[,1] ) onenn.rate # Not bad (but not quite as good as what I got with RDA) # Let's check to see if the c-v routine actually found the best value for k. knn.rate <- numeric(25) for (i in 1:25) { pred.knn <- knn(trndat[,-1], tstdat[,-1], cl=trndat[,1], k=i) knn.rate[i] <- mean(pred.knn != tstdat[,1]) } points(c(1:25),knn.rate,type="l",col="sienna") legend(1,0.4,legend=c("C-V","Observed"),fill=c("blue","sienna")) # Although some of the numerical values of the c-v estimates are off # (too optimisitc), this time c-v did indicate a decent value to use for k. # (Note: The c-v estimate are too low. I think this is largely due to the # fact that the training sample contains blocks of cases from a small # number of subjects. So when c-v is used, a withheld training case can # be matched to a case in the remaning training data from the same subject. # When we classify a test set case using the training set cases, none of the # cases are from the same subject as the test set case, and so the neighbors # will generally be farther away (not as similar).) ##### That's all folks! #####