# Classification with the vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2.
#
# Note: Developed with R version 2.9.0.
# Note: I designed this so that it'll work well to paste the commands into R in chunks.
#       Each time, go down to the next line with just ##### on it and paste all of the
#       commands in at one time. Then, after the commands are executed, look at the
#       comments and output, and look at the graph, before pasting in the next chunk of
#       commands. (If you paste everything in all at once, you won't have much time to
#       look at the graphics.)

# I'll load some libraries and set the random number seed.
library(class)
library(MASS)
library(e1071)
library(kknn)
library(klaR)
library(rgl)
set.seed(7)

###########################################################
# Creation of Canonical Variables and Exploration of Data #
# (with emphasis on clusters of observations in training  #
# and test samples)                                       #
###########################################################

# I'll read in the training data and delete the unnecessary case id column.
fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt", sep=',', header=T)
trndat <- fulltrndat[,-1]

# Next I'll scale the data. (Note: With the "toy" example from Ch. 2 that I considered the
# first week, there was no real need to scale it, due to the way it was constructed. But with
# most "real" data sets, scaling makes some methods (e.g., nearest neighbors) work better.
# With other classification methods, scaling doesn't matter ... you get the same result
# either way. With the vowel data, scaling it first will make one of the plots below
# better match what's in HTF2.)
sctrn.vars <- scale(trndat[,-1], center=TRUE, scale=TRUE)
sctrndat <- data.frame( cbind( trndat[,1], sctrn.vars ) )
names(sctrndat) <- names(trndat)

# Now I will compute the canonical variables as described on p. 114 of HTF2. (Note:
# Rather than doing it step by step as indicated there, I can instead make use of some
# objects I create when I do LDA.)
scldafit <- lda(y ~ ., sctrndat)
centroids <- scldafit$means        # the matrix M from p. 114 (based on scaled data)
canvarcent <- centroids %*% scldafit$scaling
# canvarcent is an 11 by 10 matrix of the 11 centroids represented using the canonical variables.

# I'll plot these centroids. (Compare with the large disks shown in Fig. 4.4 on p. 107 of HTF2.)
# (Note: Just the first two canonical variables are being used for the plot.)
colorcycle <- c("black","purple","brown","plum","darkorange","dodgerblue","darkgray","goldenrod",
                "slateblue","tomato","limegreen")
plot(canvarcent[,1], canvarcent[,2], pch=15, col=colorcycle,
     main='2-D View (Canonical Variables)', xlab='1st can var', ylab='2nd can var',
     xlim=c(-4.75,4.75), ylim=c(-6.5,4))

#####

# Now I'll convert the scaled data to canonical variables and plot the individual
# cases on the plot with the centroids.
canvar.trn <- as.matrix(sctrndat[,-1]) %*% scldafit$scaling
pointcolors <- rep(colorcycle, 48)
points(canvar.trn[,1], canvar.trn[,2], col=pointcolors)
# I think I've reproduced the plot on p. 107 of HTF2 (except I chose colors that create
# a bit more contrast, while being similar to the ones in the book).
# Note: Since the plot on p. 107 of HTF2 seems to have two of the classes colored black,
#       one cannot notice that the class I have colored black has 6 points over near the
#       slate blue and tomato red classes, around (2.2, -1.5). (The location of the black
#       centroid (near (-2, -3)) relative to most of the black points suggests that there
#       must be some black points far off to the right of most of the black points.)
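# (Optional, a quick check of that observation: overplot the class 1 training points, the
#  ones I colored black, with filled black circles so they stand out from the other classes.)
points(canvar.trn[trndat$y == 1, 1], canvar.trn[trndat$y == 1, 2], pch=16, col="black")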
# Finally, I'll put the canonical variable representations of the data into a data frame.
canvartrn <- data.frame( cbind( trndat[,1], canvar.trn ) )
names(canvartrn) <- c("y","cv1","cv2","cv3","cv4","cv5","cv6","cv7","cv8","cv9","cv10")

#####

# Now for a 3-d plot, based on the first 3 canonical variables, that can be rotated.
open3d()
spheres3d(canvartrn[,2:4], radius=.15, color=pointcolors)

#####

# Before doing classification, I'd like to further explore the differences between the
# training and test data. For the training data, there are six cases from each of eight
# subjects for each class. The description of the data provided on HTF's web site doesn't
# tell us which cases are from which subject, but I have a hunch that the 1st six cases
# for class 1 are from the first subject, the 2nd six cases for class 1 are from the
# second subject, and so on, and similarly for the other classes.
# Below I will explore the class 4 cases, plotting each subject's cases in a different color.
ind4 <- 4 + 11*c(0:47)
class4 <- canvartrn[ind4,]
plot(class4[1:6,2], class4[1:6,3], col="darkgray", main='Class 4 Training Cases',
     xlab='1st can var', ylab='2nd can var', xlim=c(-4,0), ylim=c(-1,4))
points(class4[7:12,2], class4[7:12,3], col="purple")
points(class4[13:18,2], class4[13:18,3], col="dodgerblue")
points(class4[19:24,2], class4[19:24,3], col="darkorange")
points(class4[25:30,2], class4[25:30,3], col="goldenrod")
points(class4[31:36,2], class4[31:36,3], col="limegreen")
points(class4[37:42,2], class4[37:42,3], col="tomato")
points(class4[43:48,2], class4[43:48,3], col="deeppink")
# Clumping can definitely be seen. When the nearest neighbor is used (in a c-v scheme)
# to classify a training point using the other 527 training cases, the nearest neighbor
# to a point will usually be a point from the same subject and the same class ... and
# thus the correct class will be chosen. This is due to the tightness of the clumps.
# But when a test case is classified using the nearest neighbor from the training set,
# it's impossible for that neighbor to be a case from the same subject. So the nearest
# neighbor will generally be farther away in this case, and there is a greater chance
# of it being from a different class. (It might be better to represent the cases from
# the same subject and the same class as a single centroid, instead of with six separate
# cases. With such centroids, when a (centroid) case from the training data is classified
# using the other training data (centroid) cases, the relationship will be much more like
# it is when a (centroid) case from the test data is classified using the (centroid) cases
# from the training data, and so what is learned from the training data during c-v is more
# pertinent to the classification of the test set cases. Alternatively, if we want a
# classifier for just a single case at a time, it may work a lot better to use a different
# cross-validation scheme in place of the n-fold (leave-one-out) method used by train.kknn:
# one could leave out groups of cases, each corresponding to a different subject, to create,
# in the case of this data set, an 8-fold c-v (since eight subjects contributed cases to the
# training set). I may explore these ideas a bit later.)
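# (A minimal sketch of the subject-centroid idea just mentioned; it relies on the guessed
#  subject structure of the training rows (each block of 66 consecutive rows assumed to
#  come from one subject, as I'll also assume further below). It just builds the 88
#  subject-by-class centroids of the canonical variables; I haven't carried the idea
#  through to classification here.)
subj.trn <- rep(1:8, each=66)
cent.trn <- aggregate(canvartrn[,-1], by=list(y=canvartrn$y, subj=subj.trn), FUN=mean)
dim(cent.trn)   # 88 rows: 11 classes x 8 subjects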
#####

# Now I want to compare the class 4 training cases to the class 4 test cases. The training
# cases will be plotted in various shades of blue, and the test cases will be plotted in
# various shades of red and orange.
# I first need to read in the test data and convert it to canonical variables (using the
# info from the training cases to convert the test cases the same way the training cases
# were converted).
fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt", sep=',', header=T)
tstdat <- fulltstdat[,-1]
sctst.vars <- scale(tstdat[,-1], center=attr(sctrn.vars,"scaled:center"),
                    scale=attr(sctrn.vars,"scaled:scale"))
# (Note: There might be a slicker way to scale the test set the same way the training set
#  was scaled, but this way works.)
sctstdat <- data.frame( cbind( tstdat[,1], sctst.vars ) )
names(sctstdat) <- names(sctrndat)
canvar.tst <- as.matrix(sctstdat[,-1]) %*% scldafit$scaling
canvartst <- data.frame( cbind( tstdat[,1], canvar.tst ) )
names(canvartst) <- names(canvartrn)
# (Note: One could certainly "streamline" some of the code above, but doing it this way
#  makes it parallel to what was done above for the training data.)
class4tst <- canvartst[ind4[1:42],]
plot(class4tst[1:6,2], class4tst[1:6,3], col="firebrick", main='Class 4 Training and Test Cases',
     xlab='1st can var', ylab='2nd can var', xlim=c(-3.8,-0.2), ylim=c(-0.6,3.8))
points(class4tst[7:12,2], class4tst[7:12,3], col="coral")
points(class4tst[13:18,2], class4tst[13:18,3], col="indianred")
points(class4tst[19:24,2], class4tst[19:24,3], col="darkorange")
points(class4tst[25:30,2], class4tst[25:30,3], col="red")
points(class4tst[31:36,2], class4tst[31:36,3], col="orangered")
points(class4tst[37:42,2], class4tst[37:42,3], col="deeppink")
points(class4[1:6,2], class4[1:6,3], col="darkcyan")
points(class4[7:12,2], class4[7:12,3], col="cyan")
points(class4[13:18,2], class4[13:18,3], col="aquamarine")
points(class4[19:24,2], class4[19:24,3], col="mediumblue")
points(class4[25:30,2], class4[25:30,3], col="turquoise")
points(class4[31:36,2], class4[31:36,3], col="darkblue")
points(class4[37:42,2], class4[37:42,3], col="dodgerblue")
points(class4[43:48,2], class4[43:48,3], col="cornflowerblue")
# Note that blue points are generally much closer to other blue points than red points
# are to blue points.

#####

############################
# Classification using KNN #
############################

# Let's try KNN classification three different ways:
#   (1) using the original unscaled data,
#   (2) using the original data scaled,
#   (3) using the first two canonical variables.
# For each way, we'll try to pick a good value of k using c-v, and then we'll see how
# well we classify the test set cases with the chosen value of k.

# First, the original unscaled data.
cvknn <- train.kknn(as.factor(y) ~ ., trndat, kmax=25, kernel="rectangular", distance=2)
plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Let's see how using just one nearest neighbor does on the test data.
pred <- knn( trndat[,-1], tstdat[,-1], cl=trndat[,1], k=1 )
unscaled.rate <- mean( pred != tstdat[,1] )
unscaled.rate
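# (Optional: a quick cross-tabulation of the 1-NN predictions against the true test
#  classes, to see which vowels get confused with which.)
table(predicted=pred, actual=tstdat[,1])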
#####

# Now, the original data scaled.
cvknn <- train.kknn(as.factor(y) ~ ., sctrndat, kmax=25, kernel="rectangular", distance=2)
plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Again, let's see how using just one nearest neighbor does on the test data.
pred <- knn( sctrndat[,-1], sctstdat[,-1], cl=sctrndat[,1], k=1 )
scaled.rate <- mean( pred != sctstdat[,1] )
scaled.rate
# Although scaling the data is typically a good idea (since otherwise one predictor having
# a variance appreciably larger than the others can have too large an influence in the
# distance determinations, and scaling the data tends to make the various predictors
# contribute more equally in the distance determinations), it could be that the unscaled
# version effectively gives more weight to the predictors which are the best ones, and
# scaling the data reduces the influence of the best predictors. Let's look at the scaling
# factors to see what the case is here.
attr(sctrn.vars,"scaled:scale")
# Scaling would tend to reduce the effect of x.1 and x.2 the most, but from past experience
# with this data I know that these two variables are the best predictors! So it's not such
# a mystery why scaling wasn't effective here.

#####

# Now, using the first two canonical variables.
cvknn <- train.kknn(as.factor(y) ~ cv1 + cv2, canvartrn, kmax=25, kernel="rectangular", distance=2)
plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# This time, using 7 nearest neighbors seems appropriate.
pred <- knn( canvartrn[,2:3], canvartst[,2:3], cl=canvartrn[,1], k=7 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Now I am surprised at this result. By design, and as can be seen from Fig. 4.8 on p. 115
# of HTF2, the first few canonical variables should be the best predictors, and many of the
# other canonical variables should be pretty much useless as predictors. So by just using
# the first two canonical variables, we're giving increased weight to what should be the
# good predictors (since we're not giving any weight to the others). By lowering the
# dimension, the nearest neighbors should actually be near (which is not always the case
# when the dimension (number of predictors) is large). So I'm truly puzzled.

#####

# Just to be thorough, let's try using three canonical variables instead of just two.
cvknn <- train.kknn(as.factor(y) ~ cv1 + cv2 + cv3, canvartrn, kmax=25, kernel="rectangular", distance=2)
plot(c(1:25), cvknn$MISCLASS, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Now just using one nearest neighbor seems appropriate.
pred <- knn( canvartrn[,2:4], canvartst[,2:4], cl=canvartrn[,1], k=1 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Worse! But before quitting (for now), I wonder if just one nearest neighbor should have
# been used with the first two canonical variables (since 1 was the choice for k in all of
# the other cases).
pred <- knn( canvartrn[,2:3], canvartst[,2:3], cl=canvartrn[,1], k=1 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Nope, worse still. I really thought the canonical variables would work well with
# nearest neighbors classification.

#####

##########################################
# Classification using LDA, QDA, and RDA #
##########################################

# Let's try stepwise LDA classification three different ways:
#   (1) using the original unscaled data,
#   (2) using the original data scaled (not really necessary),
#   (3) using the first two canonical variables.

# First, the original unscaled data.
ldafit <- stepclass(y ~ ., data=trndat, method="lda")
ldafit
# Interesting. When I did this one other time, the c-v indicated that just using x.1 and
# x.2 was best. When using c-v in a situation like this, it may be best to try it several
# times.
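# (For instance, a minimal sketch along those lines: rerun the stepwise selection a few
#  times and see which subsets come up. Each run takes a little while, since stepclass()
#  does c-v at every step, and the selected subsets can differ from run to run.)
for (r in 1:3) print( stepclass(y ~ ., data=trndat, method="lda") )
# In any case, I'll go with the subset indicated by the run above (x.1, x.2, and x.8).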
ldafit <- lda(y ~ x.1 + x.2 + x.8, trndat)
pred <- predict(ldafit, newdata=tstdat)
lda.rate <- mean( pred$class != tstdat[,1] )
lda.rate
# Note: From past experience with the data, I know that just using x.1 and x.2 results in
# a test error rate of only about 48%. So this time the c-v results didn't indicate the
# best subset of predictors.

# Next, the original data scaled.
ldafit <- stepclass(y ~ ., data=sctrndat, method="lda")
ldafit
# C-v indicates that just using x.1 and x.2 is best.
ldafit <- lda(y ~ x.1 + x.2, sctrndat)
pred <- predict(ldafit, newdata=sctstdat)
lda.rate <- mean( pred$class != sctstdat[,1] )
lda.rate
# Let's try just x.1 and x.2 with the unscaled data.
ldafit <- lda(y ~ x.1 + x.2, trndat)
pred <- predict(ldafit, newdata=tstdat)
lda.rate <- mean( pred$class != tstdat[,1] )
lda.rate
# I didn't really need to do both versions, since with LDA it doesn't matter whether the
# data are scaled or not.

# Now, using the canonical variables.
ldafit <- stepclass(y ~ ., data=canvartrn, method="lda")
ldafit
# C-v indicates that just using cv1 and cv2 is best.
ldafit <- lda(y ~ cv1 + cv2, canvartrn)
pred <- predict(ldafit, newdata=canvartst)
lda.rate <- mean( pred$class != canvartst[,1] )
lda.rate
# Well, once again the original variables did better (if c-v indicates the best set of
# variables to use).

# While I've got the canonical variables handy, I'm going to try QDA and RDA with them
# (using the stepwise procedure to select variables with QDA, but just using cv1 and cv2
# with RDA, since the stepwise procedure takes too long (due to doing nested c-v)).
qdafit <- stepclass(y ~ ., data=canvartrn, method="qda")
qdafit
# C-v indicates that just using cv1 and cv2 is best.
qdafit <- qda(y ~ cv1 + cv2, canvartrn)
pred <- predict(qdafit, newdata=canvartst)
qda.rate <- mean( pred$class != canvartst[,1] )
qda.rate
# Better than all but one of the results obtained above. (One nearest neighbors result
# was better.)
rdafit <- rda(y ~ cv1 + cv2, canvartrn)
rdafit
pred <- predict(rdafit, newdata=canvartst)
rda.rate <- mean( pred$class != canvartst[,1] )
rda.rate
# The parameters selected make RDA similar to QDA, but the QDA result is just a tad better.

# So, before quitting, let's see what happens when we do QDA with the original variables
# (using the stepwise procedure to select the variables to use).
qdafit <- stepclass(y ~ ., data=trndat, method="qda")
qdafit
# Using the indicated variables.
qdafit <- qda(y ~ x.1 + x.2 + x.3 + x.8 + x.9, trndat)
pred <- predict(qdafit, newdata=tstdat)
qda.rate <- mean( pred$class != tstdat[,1] )
qda.rate
# So, all in all, using nearest neighbors on the original data did best (43.7% errors),
# and using stepwise QDA with the canonical variables did second best (46.1% errors).

#####

###############################
# Cross-Validation Done Right #
# (given the nature of the    #
# training and test samples)  #
###############################

# Now I will use a designed (nonrandom) 8-fold c-v which leaves out the cases for one
# training set subject at a time (so that a training set case cannot have other cases
# from the same subject as its neighbors during the c-v).
# I'll assume the first 66 rows in the training data set are from the first subject,
# the second 66 rows are from the second subject, and so on. I'll store the numbers of
# wrong classifications for the various values of k in the vector errors as I cycle
# through the eight folds.
# Clearly I could write a function to do what I'm doing below, but I think doing it this
# way makes it easier to follow.
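# (A minimal sketch of what such a function might look like, assuming the same 66-row
#  subject blocks; it returns the vector of 8-fold c-v error rates for k = 1, ..., kmax.
#  The written-out copies of the loop below are what I'll actually run.)
subjcv.knn <- function(X, y, kmax=25) {
  errors <- rep(0, kmax)
  for (fold in 1:8) {
    block <- 66*(fold - 1) + c(1:66)        # the rows assumed to belong to one subject
    for (i in 1:kmax) {
      pred <- knn(X[-block,], X[block,], cl=y[-block], k=i)
      errors[i] <- errors[i] + sum(pred != y[block])
    }
  }
  errors/length(y)
}
# E.g., subjcv.knn(trndat[,-1], trndat[,1]) should give the same error rates as the
# first loop below.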
# I'll first use the original data, and then I'll just copy the same chunk of code over,
# changing the data set, to get results for the scaled original data and for the first
# two canonical variables.
errors <- rep(0, 25)
for (fold in 1:8) {
  block <- 66*(fold - 1) + c(1:66)
  for (i in 1:25) {
    pred <- knn(trndat[-block,-1], trndat[block,-1], cl=trndat[-block,1], k=i)
    errors[i] <- errors[i] + sum(pred != trndat[block,1])
  }
}
error.rate <- errors/528
plot(c(1:25), error.rate, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Let's take a closer look at the error rates for the smaller values of k.
error.rate[1:8]
# The rates for k = 1 and k = 6 are tied as being the lowest. Since the overall pattern
# indicates that the rate gets lower as k gets smaller, and since using an even value for
# k may tend to produce more ties (this is definitely the case when there are two classes,
# and I suspect it's also the case in a lot of multiclass situations as well), I'd choose
# to use k = 1 based on these results.
# We know from before that k = 1 results in a test error rate of about 0.437. Notice how
# now the c-v results give a fairly accurate estimate (0.455) of the test error rate,
# whereas that was not the case before.

#####

# Now I'll use the scaled original data.
errors <- rep(0, 25)
for (fold in 1:8) {
  block <- 66*(fold - 1) + c(1:66)
  for (i in 1:25) {
    pred <- knn(sctrndat[-block,-1], sctrndat[block,-1], cl=sctrndat[-block,1], k=i)
    errors[i] <- errors[i] + sum(pred != sctrndat[block,1])
  }
}
error.rate <- errors/528
plot(c(1:25), error.rate, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Again, using k = 1 seems best. We know from before that the test error rate is about
# 0.494. Once again this better way of doing c-v gave a decent estimate of the test
# error rate. (I'll print the estimate below.)
error.rate[1]

#####

# Next, I'll try using the first two canonical variables. I'm going to consider k up
# to 75 (instead of just 25).
errors <- rep(0, 75)
for (fold in 1:8) {
  block <- 66*(fold - 1) + c(1:66)
  for (i in 1:75) {
    pred <- knn(canvartrn[-block,2:3], canvartrn[block,2:3], cl=canvartrn[-block,1], k=i)
    errors[i] <- errors[i] + sum(pred != canvartrn[block,1])
  }
}
error.rate <- errors/528
plot(c(1:75), error.rate, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
# Wow! This time we get quite different results from the c-v. If the c-v results are
# accurate, we ought to get our best test set performance using the first two canonical
# variables with a large value for k.
# Let's see what k minimizes the estimated error rate, and what the value of the lowest
# estimated rate is.
which.min(error.rate)
error.rate[which.min(error.rate)]

#####

# I'll go with k = 42.
pred <- knn( canvartrn[,2:3], canvartst[,2:3], cl=canvartrn[,1], k=42 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# This is lower than all of the test set rates obtained above (although only a tad lower
# than what was obtained using one nearest neighbor with the original data).
# Note that when the simplistic c-v was used with the first two canonical variables,
# k = 7 was indicated and the error rate was not nearly as good (0.532).
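# (Optional: a confusion matrix for this best-so-far classifier (42 nearest neighbors on
#  the first two canonical variables), for comparison with the 1-NN table shown earlier.)
table(predicted=pred, actual=canvartst[,1])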
# To be thorough, I'll see what happens if the first three canonical variables are used.
errors <- rep(0, 75)
for (fold in 1:8) {
  block <- 66*(fold - 1) + c(1:66)
  for (i in 1:75) {
    pred <- knn(canvartrn[-block,2:4], canvartrn[block,2:4], cl=canvartrn[-block,1], k=i)
    errors[i] <- errors[i] + sum(pred != canvartrn[block,1])
  }
}
error.rate <- errors/528
plot(c(1:75), error.rate, type="l", col="blue", main="C-V Results", xlab="k", ylab="Misclass")
which.min(error.rate)
error.rate[which.min(error.rate)]
# Let's travel down Route 66. (Note: The c-v estimate of the error rate is promising
# (lower than before).)
pred <- knn( canvartrn[,2:4], canvartst[,2:4], cl=canvartrn[,1], k=66 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Given that this is not the best rate so far, it's not unreasonable to go back to the
# plot and decide to try a smaller value of k ... something just a bit bigger than 20.
error.rate[21:25]
# I'll go with k = 21.
pred <- knn( canvartrn[,2:4], canvartst[,2:4], cl=canvartrn[,1], k=21 )
canvar.rate <- mean( pred != canvartst[,1] )
canvar.rate
# Note that when the simplistic c-v was used with the first 3 canonical variables, k = 1
# was indicated and the error rate was not nearly as good (0.574).

###########
# Summary #
###########

# Recap of the best results from above:
#   0.435 (42 nearest neighbors, 1st 2 canonical variables),
#   0.437 (1 nearest neighbor, original data (all variables)),
#   0.446 (21 nearest neighbors, 1st 3 canonical variables),
#   0.461 (stepwise QDA, canonical variables (1st 2 chosen)),
#   0.478 (RDA, first two canonical variables),
#   0.481 (stepwise LDA, original data (but the good set of variables may not be chosen
#          using the c-v routine)).
# From the description of the data available on the authors' web site, one can see that
# the best neural network classifiers resulted in error rates of about 45%. (The
# description also gives the 44% error rate obtained using one nearest neighbor.)

# Some lessons learned:
#   (1) understand the nature of the training and test samples
#       (and be careful if they aren't composed of iid observations),
#   (2) try many different classification methods
#       (both simple ones and out-of-the-ordinary ones).

##### la fin #####