# R Code (and assorted comments) for Overview of Classification Methods library(mvtnorm) # for generating multivariate normal values library(multinomRob ) # for generating multinomial dist'n values library(klaR) # for naive Bayes classification library(kknn) # for kknn and train.kknn f'ns (for KNN) # I'll set the seed for the random number generator. set.seed(17) # Before listing (and grouping) some classification methods, I think # it'll be good to point out that for iid observations # the Bayes classifier # is an optimal classifier. For each query point x it gives the # predicted class to be # argmax f_j ( x ) * pi_j, # where # f_j is the pdf for the jth class # & pi_j is the a priori probability that an obs comes from class j. # For example, in a two-class problem for which an observation is equally # likely to come from either class, the Bayes classifier # predicts class 1 if f_1 ( x ) > f_2 ( x ) # & predicts class 2 if f_2 ( 2 ) > f_1 ( x ) # (and arbitrarily predicts a class if f_1(x) = f_2(x)). But if an obser- # vation is twice as likely to come from class 1 as class 2, i.e., # pi_1 = 2/3 & pi_2 = 1/3, # the Bayes classifier predicts class 1 if # 2 f_1(x) > f_2(x). # For a specific example, let # pi_1 = pi_2 = 1/2, # and let the density underlying each class be a bivariate normal pdf # having covariance matrix I (the identity matrix). For class 1, let # the mean vector be # ( 2, 0 ) # and for class 2 let the mean vector be # ( 0, 2 ). # Let's plot the two pdfs. x1plot <- seq(-3, 8, by=0.1) x2plot <- seq(-3, 8, by=0.1) Sig <- diag(1,2) maxpdf <- function(x1, x2){mat=cbind(x1,x2); pmax(dmvnorm(mat, c(2,0), Sig), dmvnorm(mat, c(0,2), Sig))} pdfplot <- outer(x1plot, x2plot, maxpdf) persp(x1plot, x2plot, pdfplot, xlab="x1", ylab="x2", zlab="pdf", theta=-30, col="gold") ### ## Possibly Skip ## # Below is a simpler example of using the perspective plot function. x1plot <- seq(1, 3, by=0.05) x2plot <- seq(1, 3, by=0.05) mresp <- function(x1,x2) { sqrt(x2/x1) } mrplot <- outer(x1plot, x2plot, mresp) persp(x1plot, x2plot, mrplot, xlab="x1", ylab="x2", zlab="mean response", theta=60, col="gold") ## End of Possible Skip ## # For this simple two-class problem involving bivariate normal populations, # one could determine the error rate of the Bayes classifier (aka the Bayes # rate) analytically. We can also obtain a Monte Carlo estimate (done below). x <- rmvnorm(10000, mean=c(2,0), sigma=Sig) mean( x[,1] < x[,2] ) # Monte Carlo estimate of Bayes rate pnorm( -sqrt( 2 ) ) # Bayes rate (exact) ### # It should be noted that while the Bayes classifier is optimal, # we usually don't have all of the information we need to implement # it; the class densities and/or the prior probabilities are typically # unknown. # The 1984 CART book (p. 15) states # "The three most commonly used classification procedures # # Disriminant analysis # Kernel density estimation # Kth nearest neighbor # # attempt, in different ways, to approximate the Bayes rule". # Classification using density estimation is the most direct way # approximate the Bayesian classifier; one simply substitutes # estimates of the densities and prior probabilities for the # unknowns. # I'll generate data for the simple classification problem # introduced above. I'll first generate a training sample # of 100 observations. n1 <- rbinom(1, 100, 0.5) n1 # Number of Class 1 observations in training sample. X1 <- rmvnorm(n1, mean=c(2,0), sigma=Sig) X2 <- rmvnorm(100-n1, mean=c(0,2), sigma=Sig) X <- rbind(X1, X2) g <- c( rep(1, n1), rep(2, 100-n1) ) trndat <- data.frame( cbind(X, g) ) names(trndat) <- c("x1", "x2", "g") dim(trndat) head(trndat) plot(trndat[,1], trndat[,2], col=4-trndat[,3], main='training data', xlab='x1', ylab='x2') # Now I'll generate some data (the same way, but independently) # to serve as a "generalization sample" (to provide estimates # of the error rates for the various classification methods # considered). I'll put 5000 observations from each class in # the generalization sample. (This seems better than making # the class sample sizes random.) X1 <- rmvnorm(5000, mean=c(2,0), sigma=Sig) X2 <- rmvnorm(5000, mean=c(0,2), sigma=Sig) X <- rbind(X1, X2) g <- c( rep(1, 5000), rep(2, 5000) ) gendat <- data.frame( cbind(X, g) ) names(gendat) <- names(trndat) ### # I'll use two-dimensional kernel density estimates # to classify by directly approximating the Bayes # classifier. # (Note: The kde2d function used below will only work # if there are two predictors. If there is just one # predictor, then R's density function can be used to # obtain a density estimate. Unfortunately, there # doesn't seem to be a function which will do kernel # density estimation for dimensions greater than 2 # (although in a lot of cases I suppose it is the case # that for dimensions greater than 2 the method doesn't # work well compared to other methods). Let me know if # you know of some function that will do density estimation # in dimensions greater than 2.) # Let's look at an estimate of the Class 1 (green) density. x1limL <- min(trndat[,1]) - 2 x1limU <- max(trndat[,1]) + 2 x2limL <- min(trndat[,2]) - 2 x2limU <- max(trndat[,2]) + 2 gden <- kde2d(trndat[1:n1,1], trndat[1:n1,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU)) image(gden) # Let's take a look at the Help info for the function kde2d. ?kde2d ### # Here's the Class 2 (red) density estimate. rden <- kde2d(trndat[(n1+1):100,1], trndat[(n1+1):100,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU)) image(rden) ### # Now let's look at the decision boundary # (on which *weighted* densities are equal). denrat <- (100 - n1)*rden$z/(n1*gden$z) contour(gden$x, gden$y, denrat, levels=c(0.5,1,2)) # The contour labeled with 1 is the decision boundary. # Outside of the wavy band, one estimated density is # more than twice as high as the one one. ### # Here's another representation of the decision boundary # determined from the training data. pred.kern.den <- ifelse(denrat < 1, 2, 1) image(gden$x, gden$y, pred.kern.den) # Now I'll obtain an estimated error rate for this # classification method from the training data. x1index <- round(1 + 300*(trndat[,1] - min(gden$x))/(max(gden$x) - min(gden$x))) x2index <- round(1 + 300*(trndat[,2] - min(gden$y))/(max(gden$y) - min(gden$y))) pred.denrat <- rep(0, 100) for (i in 1:100) pred.denrat[i] <- denrat[x1index[i],x2index[i]] trn.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != trndat[,3] ) trn.err.kern.den # The training error for the kernel density estimation # method is relatively small. # Note: Error rates obtained from the training data are # often optimistic! # Let's now get a better estimate of the error rate for # this density estimation method using the generalization # sample. # Note: One has to take care that the subscript does not # fall outside of the set {1, 2, 3, ..., 300, 301}. x1index <- round(1 + 300*(gendat[,1] - min(gden$x))/(max(gden$x) - min(gden$x))) x1index <- ifelse( x1index < 1, 1, x1index ) x1index <- ifelse( x1index > 301, 301, x1index ) x2index <- round(1 + 300*(gendat[,2] - min(gden$y))/(max(gden$y) - min(gden$y))) x2index <- ifelse( x2index < 1, 1, x2index ) x2index <- ifelse( x2index > 301, 301, x2index ) pred.denrat <- rep(0, 10000) for (i in 1:10000) pred.denrat[i] <- denrat[x1index[i],x2index[i]] gen.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != gendat[,3] ) gen.err.kern.den # A tad larger than the Bayes rate, which seems reasonable. # (The actual error rate definitely cannot be smaller than # the Bayes rate. But perhaps it shouldn't be too much greater # since the density estimation method ought to work decently # in this simple setting.) # A big drawback to this very straightforward approximation of the # Bayes classifier is that density estimates can be very unreliable # when the dimension is not real small. (A practical drawback is # that here doesn't seem to be a good way to estimate densities using # R when the dimension is greater than 2. (Note: I haven't researched # this thoroughly.)) # A "work around" when the dimension is not small is to assume # independence and use a naive Bayes classifier. (Note: I don't # like R's default, which is to assume normality along with independence. # I think in this case one might often be better off using either LDA or # QDA (although maybe not is the dimension is too large).) # Although the method is based on an assumption of independence, # which may be grossly violated, a nice thing is that this # method may be done using R for dimensions greater than 2. # R's NaiveBayes function apparently needs to be given # data with the class variable being a "factor" with # "levels" specified in order for the predict function # to work properly on the object. Therefore, I created # a new data frame to use at this point. alttrndat <- data.frame(cbind(trndat[,1],trndat[,2],factor(trndat[,3]))) names(alttrndat) <- c("x1","x2","g") levels(alttrndat$g) <- c("1","2") # I'll also modify the generalization data the same way. altgendat <- data.frame(cbind(gendat[,1],gendat[,2],factor(gendat[,3]))) names(altgendat) <- c("x1","x2","g") levels(altgendat$g) <- c("1","2") # I'll start with a version of the method that assumes # the marginal dist'ns are normal, and so instead of # using a nonparametric density estimator, a parametric # model is used, and the data is used to estimate the # unknown parameters. nbnorm <- NaiveBayes(g ~ ., data=alttrndat) pred.nbnorm <- predict(nbnorm, newdata=alttrndat) trn.err.nbnorm <- mean( pred.nbnorm$class != alttrndat[,3] ) trn.err.nbnorm # Est. error rate based on training data. pred.nbnorm <- predict(nbnorm, newdata=altgendat) gen.err.nbnorm <- mean( pred.nbnorm$class != altgendat[,3] ) gen.err.nbnorm # Est. error rate based on generalization data. # Next, I will use what I consider to be the standard # way of doing naive Bayes classification, with a kernel # density estimator being used to obtain a nonparametric # density estimate for each predictor. nbkern <- NaiveBayes(g ~ ., data=alttrndat, usekernel=TRUE) pred.nbkern <- predict(nbkern, newdata=alttrndat) tr.err.nbkern <- mean( pred.nbkern$class != alttrndat[,3] ) tr.err.nbkern # Est. error rate based on training data. pred.nbkern <- predict(nbkern, newdata=altgendat) gen.err.nbkern <- mean( pred.nbkern$class != altgendat[,3] ) gen.err.nbkern # Est. error rate based on generalization data. # (A little worse this way ... not surprising since class # densities are in fact normal.) # Since there can be problems with trying to directly approximate # Bayes rule, one might attempt less direct approximations. Assuming # the densities don't vary widely in smallish regions, an indirect # nonparametric approximation is to use a nearest neighbors method. # I'll try ordinary nearest neighbors, using # train.kknn to select a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=50, kernel="rectangular", distance=2) plot(c(1:50),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates',col="maroon") which.min(cvknn$MISCLASS) # Since there are many values of k which tie for the # smallest c-v error rate, I'll use k = 23 (near middle # of range of good values). pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=23,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,3]) gen.err.knn # Error rate for 23 nearest neighbors. # (A tad larger than density estimation rate (and Bayes rate).) # Alternatively, one can assume the density for each class is a # multivariate normal density and use either LDA or QDA. If the # true class densities are multivariate normal, these methods can # provide an excellent approximation of the Bayes classifier. Even # if the densities are rather different from normal, these methods # can often do amazingly well compared to other methods. (In some # nonnormal cases, first transforming the predictors may be helpful.) # I'll try LDA. lda.fit <- lda(g ~ ., trndat) pred.lda <- predict(lda.fit, newdata=gendat) lda.rate <- mean( pred.lda$class != gendat[,3] ) lda.rate # LDA rate # Now I'll try QDA. qda.fit <- qda(g ~ ., trndat) pred.qda <- predict(qda.fit, newdata=gendat) qda.rate <- mean( pred.qda$class != gendat[,3] ) qda.rate # QDA rate # (Lowest so far. (Data was generated so that LDA should # work very well ... better than QDA. But really, a number # of reasonable methods work very well (getting us close to # accuracy of Bayes classifier). Things tend to get more # interesting when class densities are more complex, and when # Bayes error rate isn't so low.)) # Nearest neighbors methods are *local* methods; they only use observations # close to a query point to classify the query point. On the other hand, LDA # and QDA are global methods; they use all of the data to fit a model, and then # all query points are classified using the fitted model. (We can also refer to # LDA and QDA as model-based methods.) # Some classification methods which are more global than local are: # LDA, # QDA, # RDA, # logistic regression, # SVMs, # neural networks. # (We can divide these by whether or not they form linear boundaries, # and whether or not they are based on parametric models for the class # densities.) # Some classification methods which are more local than global are: # nearest neighbors methods (incl. common KNN but also variations like DANN), # methods based on nonparametric density estimates. # (I'd say that CART is more local than global, but it's not as purely # local as the methods listed immediately above.)