# This example addresses the classification # setting dealt with in Sec. 2.3 of HTF2. # # (Developed using 2.8.1 (2.9.0 produces different results).) # # # First I'll generate the training data as described on pp. 16-17 of HTF2. # This will be done in a number of steps. # # Step 1: For each class generate 10 observations from # the proper bivariate normal distribution to serve as # means for the components of the mixture distribution. # # I will make use of some functions from various libraries # which may need to be installed if you haven't used them # yet. # library(mvtnorm) # for generating multivariate normal values library(multinomRob) # for generating multinomial dist'n values library(class) # for nearest neighbors methods (knn) library(kknn) # for knn with easy cross-validation ability, and weights # # I'll set the seed for the random number generator. set.seed(3) # Sig1 <- diag(1,2) # Sig1 is a variance-covariance matrix. # Let's look at it. Sig1 ### Bmeans <- rmvnorm(10, mean=c(1,0), sigma=Sig1) # B is used for the Blue class. Omeans <- rmvnorm(10, mean=c(0,1), sigma=Sig1) # O is used for the Orange class. # # I'll plot the means. plot(Bmeans[,1],Bmeans[,2],col=4,main='Component Means',xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4)) points(Omeans[,1], Omeans[,2], col="orange") ### # Step 2: For each class, generate 10 counts which sum to 100, # using a multinomial (100,0.1,0.1,...,0.1) distribution. Bcounts <- rmultinomial(100, rep(0.1,10)) Ocounts <- rmultinomial(100, rep(0.1,10)) # Let's look at the counts. Bcounts Ocounts ### # Step 3: using the component means generated in step 1, # and the counts generated in step 2 as sample sizes (for # the various components of the mixture distributions), # generate 100 training sample obervations for each class. Sig2 <- diag(0.2, 2) Bxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } # Let's look at the training data. plot(Bxvals[,1],Bxvals[,2],col=4,main='Training Data',xlab='x1',ylab='x2',xlim=c(-3.5,4.5),ylim=c(-3.5,4.5)) points(Oxvals[,1], Oxvals[,2], col="orange") ### # Step 4: Give response value of 0 for Blue class observations, # and 1 for Orange class observations, then combine them together # to form the training data. Bclass <- cbind(Bxvals, 0) Oclass <- cbind(Oxvals, 1) trdata <- data.frame(rbind(Bclass, Oclass)) names(trdata) <- c("x1", "x2", "g") # This completes the generation of the training data. # # # Later we'll want to assess the accuracy of various classifiers # constructed from the training data. For this purpose I'll create # another data set, which we can call the generalization set. It'll # consist of 1000 additional observations from each of the two classes. # (Note: The component means are the ones used to create the training data.) Bcounts <- rmultinomial(1000, rep(0.1,10)) Ocounts <- rmultinomial(1000, rep(0.1,10)) Bxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } Bclass <- cbind(Bxvals, 0) Oclass <- cbind(Oxvals, 1) gendata <- data.frame(rbind(Bclass, Oclass)) names(gendata) <- c("x1", "x2", "g") # # Now I'll use a 1-nearest neighbor classifier to classify each case in the # generalization sample. Because we know the true classes for the gen sample # cases, the proportion misclassified can be determined and it will be a good # estimate of the generalization error rate. pred.1nn <- knn(trdata[,-3], gendata[,-3], cl=trdata[,3], k=1) gen.err.1nn <- mean(pred.1nn != gendata[,3]) # Here is the estimated gen error rate for 1-nn: gen.err.1nn ### # Now I'll use a 15-nearest neighbors classifier to classify each case in the # generalization sample. pred.15nn <- knn(trdata[,-3], gendata[,-3], cl=trdata[,3], k=15) gen.err.15nn <- mean(pred.15nn != gendata[,3]) # Here is the estimated gen error rate for 15-nn: gen.err.15nn ### # It appears that using 15 nearest neighbors is better, but in practice we wouldn't # know this since the only observations that we know the true class of are those in # the training sample. But if we use the "resubstitution error rate" it will always # indicate that using just one nearest neighbor is best. # # The following plot shows the resubstitution error rates for k = 1, 3, 5, ..., 99, # along with the corresponding estimates obtained by the unbiased method of using # the proportion of the generalization sample which is misclassified. It can be seen # that the resubstitution estimates of the true error rate are uniformly optimistic. I <- 50 resub.err <- numeric(I) gen.err <- numeric(I) for (i in 1:I) { pred.resub <- knn(trdata[,-3], trdata[,-3], cl=trdata[,3], k = (2*i - 1)) resub.err[i] <- mean(pred.resub != trdata[,3]) pred.gen <- knn(trdata[,-3], gendata[,-3], cl=trdata[,3], k = (2*i - 1)) gen.err[i] <- mean(pred.gen != gendata[,3]) } plot(2*seq(1:I)-1,resub.err,type="l",xlab='k',ylab='proportion misclassified',main='Estimated Error Rates',col=2,ylim=c(0,0.3)) points(2*seq(1:I)-1,gen.err,type="l",col=3) abline(0.13, 0, col=8) legend(10,0.275,legend=c("resubstitution","generalization","cross-val","reference (0.13)","Bayes"),fill=c(2,3,5,8,"purple")) # The gray line is just a reference line to help it be seen # that the error rates generally decrease towards 0.13 as k # increases from 1, and then generally increase as k increases # past 15 or so. # # It should be noted that, as is usually the case with # classifiers, using the same data to both classify points # and estimate the accuracy of the classifier can lead to # overoptimistic estimates of accuracy. This is particularly # evident for the case of k = 1. ### # In practice, if we only knew the true class for the # training set of 200 cases, we could use cross-validation # to determine a good value of k to use when classifying # cases with unknown class. Although for some purposes # using an n-fold c-v may take an excessively long time # and may be worse than using a smaller number of folds, # with k-nearest-neighbors n-fold c-v is simple and # and relatively quick. (It's simple because with n-fold # c-v, to get the predicted class of a case the k nearest # ***other*** cases are the neighbors used ... so it's like # k+1 nearest neighbors can be determined in the usual # way and then the nearest neighbor (the case itself) is # ignored. The train.kknn function, as used below, will # do this.) cvknn <- train.kknn(as.factor(g) ~ ., trdata, kmax=99, kernel="rectangular", distance=2) # (Note: If the response is coded numerically, it must be # converted to make it be recogonized as being nominal # by the kknn function.) i <- seq(1:I) k.to.use <- 2*i - 1 points(k.to.use, cvknn$MISCLASS[k.to.use], type="l", col=5) # To help you perhaps get a better understanding of train.kknn, # let's look at cvknn$MISCLASS and also a summary of cvknn. cvknn$MISCLASS summary(cvknn) # I find it a bit odd that 6 using nearest neighbors appears to work best. # In practice I'd use either 5 or 7 to avoid the arbitrary tie-breaking that # can occur when k = 6 ... if it was the case that using 6 nearest neighbors # was the only choice that yielded the smallest observed estimated error rate. # But an inspection of the MISCLASS values above reveals that the lowest observed # rate also occured with 9, 20, 21, and 25-30 nearest neighbors, and so maybe 27 # or 25 would be a good choice for k. # # It's not surprising that for large values of k the c-v estimate of the error # rate is equal or close to resubsitution estimate, since the same cases are # being classified, and for most of them it may not matter whether the case itself # or one more nearest neighbor is used (since k-1 of the neighbors used will be the # same). What may be a bit surprising is that the error rate estimates based on the # generalization data are larger than the c-v estimates. That different cases are being # classified accounts for different proportions misclassified, but should they be uniformly # larger? ### # I'll do something to shed some light on the last question posed shortly, but right now I # want to determine the Bayes error rate (the error rate of the optimal classifier we could # use if we knew exactly the densities for the two classes and the probabilities that a future # case to be classified comes from each of two classes). # # If we assume that future cases to be classified are equally likely to be from each of the # two classes, the Bayes classifier classifies a point (x1, x2) as Blue if the Blue pdf at # (x1, x2) is greater than the Orange pdf, classifies the point as Orange if the Orange pdf # is greater, and makes an arbitrary decision if the two pdfs are equal at (x1, x2). So # to perform classification using the Bayes classifier we just need the pdf values. We can # determine the needed pdf values if we know the form of the parametric distribution and the # parameter values for the two classes. For this example, having generated data, we know what # we need to know, but in practice we typically would not be able to do classification according # to the Bayes rule. (*** As we cover the classification methods in HTF2, we'll see that many # good classification methods attempt to approximate the Bayes rule in some way.*** How does # nearest neighbors do this?) # # Below I obtain an estimate of the Bayes error rate using the large generalization sample. Bpdf <-rep(0,2000) Opdf <-rep (0,2000) for (i in 1:10) { Bpdf <- Bpdf + dmvnorm(gendata[,1:2], Bmeans[i,], Sig2) Opdf <- Opdf + dmvnorm(gendata[,1:2], Omeans[i,], Sig2) } pred.bayes <- ifelse(Bpdf > Opdf, 0, 1) est.bayes.rate <- mean(pred.bayes != gendata[,3]) abline(est.bayes.rate, 0, col="purple") est.bayes.rate # This is the estimated error rate of the Bayes classifier. # A line showing the Bayes rate has been added to the plot. ### # The Bayes rate looks close to the best k-nearest neighbors generalization # performance (and we know it's really better, since it's the best possible # rate and it seems very unlikely any nearest neighbors classifier will be # exactly optimal). Let's see what the lowest generalization rate for the # nearest neighbors classifiers is, and what value of k produced it. best.k <- 2*which.min(gen.err) - 1 best.k # (possible) best value of k gen.err[which.min(gen.err)] # estimate of gen. error rate (best k) ### # Let's change the random number seed to get new values for everything and see if the # pattern is the same. # (a) Look at the plot one more time and note the green generalization errors are # generally largest. # (b) Change the random number seed by executing the command below. set.seed(13) # (c) Grab code starting right after the first break (the line that starts Bmeans <-) # all of the way through to the break just above and run it. Some of the comments # won't be correct with the new results, but it shouldn't be too hard to figure # things out given the printed results and plots. ### ############ Follow the instructions above and then continue from here. ############# # Quite a different picture! Now c-v estimates are higher than they ought to be. # Good point: Best k-nn results in error rate close to that of Bayes classifier. # Bad point: In practice we wouldn't know this, and wouldn't know what to use for # k to get good knn performance. # Good point: k-nn works decently over a wide range of k values. # Bad point: The c-v estimates point us to a bad choice for k. # # Let's now see what happens if we go from 100 training observations per class to 200. # We'll keep the component means the same as before, redo Step 2 above by running the # code below. First I'll set the random number seed once more. (Doing this makes it # easier to jump into the middle of all of this code and get the same results each time.) set.seed(8) # # Revised Step 2: For each class, generate 10 counts which sum to 200, # using a multinomial (200,0.1,0.1,...,0.1) distribution. Bcounts <- rmultinomial(200, rep(0.1,10)) Ocounts <- rmultinomial(200, rep(0.1,10)) # Also redo Steps 3 and 4. # Revised Step 3: using the component means generated in step 1, # and the counts generated in revised step 2 as sample sizes (for # the various components of the mixture distributions), # generate 200 training sample obervations for each class. Sig2 <- diag(0.2, 2) Bxvals <- matrix(0, 200, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oxvals <- matrix(0, 200, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } # Let's look at the training data. plot(Bxvals[,1],Bxvals[,2],col=4,main='Training Data',xlab='x1',ylab='x2',xlim=c(-3.5,4.5),ylim=c(-3.5,4.5)) points(Oxvals[,1], Oxvals[,2], col="orange") # Revised Step 4: Give response value of 0 for Blue class observations, # and 1 for Orange class observations, then combine them together # to form the training data. Bclass <- cbind(Bxvals, 0) Oclass <- cbind(Oxvals, 1) trdata <- data.frame(rbind(Bclass, Oclass)) names(trdata) <- c("x1", "x2", "g") # This completes the generation of the training data. ### # Now find the line that has # I <- 50 # above and grab everything from it to the finding and printing out of the gen # error rate for the best value of k, and run that chunk of commands. ### ############ Follow the instructions above and then continue from here. ############# # Now there seems to be generally better agreement with the c-v estimates of the errors and # the better estimates obtained using the unbiased scheme based on the large gen sample. # # # Now I will use the training data to fit a # least squares regression model, using the # 0/1 class labels as the response var, to # determine the predicted class for each # observation using (2.7) on p. 12 of HTF2. # ols1 <- lm(g ~ x1 + x2, data=trdata) # # Now let's determine the generalization error rate for this classifier. # pred.ols1 <- predict(ols1, newdata=gendata) ols1.err <- mean( ifelse(pred.ols1 > 0.5, 1, 0) != gendata[,3] ) ols1.err # This is the 1st-order OLS classifier error rate. ### # This is a bit scary! I didn't expect the linear decision boundary to outperform # nearest neighbors classification. Let's look at the data and the linear decision # boundary together. plot(trdata[1:200,1], trdata[1:200,2], col=4, main='1st-order OLS', xlab='x1',ylab='x2', xlim=c(-3,4),ylim=c(-3,4)) points(trdata[201:400,1], trdata[201:400,2], col="orange") abline((0.5 - ols1$coef[1])/ols1$coef[3], - ols1$coef[2]/ols1$coef[3], col=3) ### # It would perhaps be better to look to see how well the linear boundary fit from the training # data separates the classes in the generalization data ... but let's forge ahead instead. # # Now I'll try a 2nd-order regression fit (extending the investigation beyond what's covered in # Sec. 2.3 of HTF2. ols2 <- lm(g ~ x1 + x1 + I(x1^2) + I(x2^2) + I(x1*x2), data=trdata) pred.ols2 <- ifelse(predict(ols2, newdata=gendata) <= 0.5, 0, 1) ols2.err <- mean( pred.ols2 != gendata[,3] ) ols2.err # This is the 2nd-order OLS classifier error rate. # It appears that while a linear boundary worked fine, a curved boundary # overfits the training data. (The curve based on the training data doesn't # match well with the generalization data.) ### # Finally, let's go back to the initial data set with 100 training cases from each class. # Normally if the linear boundary worked better than k-nn with the larger data set, we'd # also expect it to work better with a smaller data set. *But* we have to consider it to # be something of a fluke that the linear boundary did so well before ... after all it appeared # to outperform the optimal Bayes classifier! But with the initial data set considered, the # component means were such that a linear boundary may not work so well ... let's see. # # Starting with # set.seed(3) # near the top of the file, run commands through obtaining estimate of gen error rate for best k # (right after computation of Bayes error rate). ### ##################### after doing above ######################### # Run from # ols1 <- lm(g ~ x1 + x2, data=trdata) # through computation of ols2 error rate. ### # As I expected, the linear boundary didn't work as good as nearest neighbors # classification. Also, it can be noted that using a 2nd-order regression model # didn't help (it made things worse). # # # Finally, to wrap up this exploration for now, let's investigate a simpler # classification setting ... one where the density for each class is a single # bivariate normal pdf instead of being the pdf from a mixture distribution. # In this simple setting, it may well be that a simple linear boundary does # great. (The Bayes classifier will have a linear boundary if the var-cov # matrices are the same for the two normal distributions.) # set.seed(13) Bpoints <- rmvnorm(100, mean=c(1,0), sigma=Sig1) # B is used for the Blue class. Opoints <- rmvnorm(100, mean=c(0,1), sigma=Sig1) # O is used for the Orange class. Bclass <- cbind(Bpoints, 0) Oclass <- cbind(Opoints, 1) newtrdata <- data.frame(rbind(Bclass, Oclass)) names(newtrdata) <- c("x1", "x2", "g") # This completes the generation of the training data. Bpoints <- rmvnorm(1000, mean=c(1,0), sigma=Sig1) # B is used for the Blue class. Opoints <- rmvnorm(1000, mean=c(0,1), sigma=Sig1) # O is used for the Orange class. Bclass <- cbind(Bpoints, 0) Oclass <- cbind(Opoints, 1) newgendata <- data.frame(rbind(Bclass, Oclass)) names(newgendata) <- c("x1", "x2", "g") # This completes the generation of the generalization data. # # The Bayes classifier will classify a case as Blue if x1 > x2, and # Orange otherwise. (The boundary is a line through the origin having # slope 1; i.e., the line x2 = x1.) est.bayes.rate <- mean( ifelse(newgendata[,1] > newgendata[,2], 0, 1) != newgendata[,3] ) est.bayes.rate # This is the estimated Bayes rate. # # # Now let's consider the simple linear classifier. ols1 <- lm(g ~ x1 + x2, data=newtrdata) # # Now let's determine the generalization error rate for this classifier. # pred.ols1 <- predict(ols1, newdata=newgendata) ols1.err <- mean( ifelse(pred.ols1 > 0.5, 1, 0) != newgendata[,3] ) ols1.err # This is the estimated 1st-order OLS classifier error rate. intercept <- (0.5 - ols1$coef[1])/ols1$coef[3] slope <- -ols1$coef[2]/ols1$coef[3] intercept slope # Intercept and slope of boundary line (compare to 0 and 1). plot(newgendata[1:1000,1], newgendata[1:1000,2], col=4, main='1st-order OLS', xlab='x1',ylab='x2', xlim=c(-3,4),ylim=c(-3,4)) points(newgendata[1001:2000,1], newgendata[1001:2000,2], col="orange") abline(intercept, slope, col=3) ### # Let's see how well nearest neighbors does. I <- 50 resub.err <- numeric(I) gen.err <- numeric(I) for (i in 1:I) { pred.resub <- knn(newtrdata[,-3], newtrdata[,-3], cl=newtrdata[,3], k = (2*i - 1)) resub.err[i] <- mean(pred.resub != newtrdata[,3]) pred.gen <- knn(newtrdata[,-3], newgendata[,-3], cl=newtrdata[,3], k = (2*i - 1)) gen.err[i] <- mean(pred.gen != newgendata[,3]) } plot(2*seq(1:I)-1,resub.err,type="l",xlab='k',ylab='proportion misclassified',main='Estimated Error Rates',col=2,ylim=c(0,0.31)) points(2*seq(1:I)-1,gen.err,type="l",col=3) legend(62.5,0.125,legend=c("resubstitution","generalization","cross-val","linear","Bayes"),fill=c(2,3,5,"sienna","purple")) cvknn <- train.kknn(as.factor(g) ~ ., newtrdata, kmax=99, kernel="rectangular", distance=2) i <- seq(1:I) k.to.use <- 2*i - 1 points(k.to.use, cvknn$MISCLASS[k.to.use], type="l", col=5) abline(ols1.err, 0, col="sienna") abline(est.bayes.rate, 0, col="purple") # # While nearest neighbors doesn't work quite as well as the linear classifier, # it's not horrible either. Perhaps in the weeks to come we'll encounter classification # settings for which the choice of method makes a bigger difference. ###