# 2nd example for 6/18/10 # (like the 1st, but means closer together) library(mvtnorm) # for generating multivariate normal values library(multinomRob ) # for generating multinomial dist'n values library(klaR) # for naive Bayes classification library(kknn) # for kknn and train.kknn f'ns (for KNN) # I'll set the seed for the random number generator. set.seed(17) # Let # pi_1 = pi_2 = 1/2, # and let the density underlying each class be a bivariate normal pdf # having covariance matrix I (the identity matrix). For class 1, let # the mean vector be # ( 1, 0 ) # and for class 2 let the mean vector be # ( 0, 1 ). # Let's plot the two pdfs. x1plot <- seq(-3, 8, by=0.1) x2plot <- seq(-3, 8, by=0.1) Sig <- diag(1,2) maxpdf <- function(x1, x2){mat=cbind(x1,x2); pmax(dmvnorm(mat, c(1,0), Sig), dmvnorm(mat, c(0,1), Sig))} pdfplot <- outer(x1plot, x2plot, maxpdf) persp(x1plot, x2plot, pdfplot, xlab="x1", ylab="x2", zlab="pdf", theta=-30, col="gold") # For this simple two-class problem involving bivariate normal populations, # one could determine the error rate of the Bayes classifier (aka the Bayes # rate) analytically. We can also obtain a Monte Carlo estimate (done below). x <- rmvnorm(10000, mean=c(1,0), sigma=Sig) mean( x[,1] < x[,2] ) # Monte Carlo estimate of Bayes rate pnorm( -1/sqrt( 2 ) ) # Bayes rate (exact) ### # I'll generate data for the simple classification problem # introduced above. I'll first generate a training sample # of 100 observations. n1 <- rbinom(1, 100, 0.5) n1 # Number of Class 1 observations in training sample. X1 <- rmvnorm(n1, mean=c(1,0), sigma=Sig) X2 <- rmvnorm(100-n1, mean=c(0,1), sigma=Sig) X <- rbind(X1, X2) g <- c( rep(1, n1), rep(2, 100-n1) ) trndat <- data.frame( cbind(X, g) ) names(trndat) <- c("x1", "x2", "g") plot(trndat[,1], trndat[,2], col=4-trndat[,3], main='training data', xlab='x1', ylab='x2') # Now I'll generate some data (the same way, but independently) # to serve as a "generalization sample" (to provide estimates # of the error rates for the various classification methods # considered). I'll put 5000 observations from each class in # the generalization sample. (This seems better than making # the class sample sizes random.) X1 <- rmvnorm(5000, mean=c(1,0), sigma=Sig) X2 <- rmvnorm(5000, mean=c(0,1), sigma=Sig) X <- rbind(X1, X2) g <- c( rep(1, 5000), rep(2, 5000) ) gendat <- data.frame( cbind(X, g) ) names(gendat) <- names(trndat) ### # I'll use two-dimensional kernel density estimates # to classify by directly approximating the Bayes # classifier. x1limL <- min(trndat[,1]) - 2 x1limU <- max(trndat[,1]) + 2 x2limL <- min(trndat[,2]) - 2 x2limU <- max(trndat[,2]) + 2 gden <- kde2d(trndat[1:n1,1], trndat[1:n1,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU)) rden <- kde2d(trndat[(n1+1):100,1], trndat[(n1+1):100,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU)) # Now let's look at the decision boundary # (on which *weighted* densities are equal). denrat <- (100 - n1)*rden$z/(n1*gden$z) contour(gden$x, gden$y, denrat, levels=c(0.5,1,2)) x1index <- round(1 + 300*(trndat[,1] - min(gden$x))/(max(gden$x) - min(gden$x))) x2index <- round(1 + 300*(trndat[,2] - min(gden$y))/(max(gden$y) - min(gden$y))) pred.denrat <- rep(0, 100) for (i in 1:100) pred.denrat[i] <- denrat[x1index[i],x2index[i]] trn.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != trndat[,3] ) trn.err.kern.den # Let's now get a better estimate of the error rate for # this density estimation method using the generalization # sample. # Note: One has to take care that the subscript does not # fall outside of the set {1, 2, 3, ..., 300, 301}. x1index <- round(1 + 300*(gendat[,1] - min(gden$x))/(max(gden$x) - min(gden$x))) x1index <- ifelse( x1index < 1, 1, x1index ) x1index <- ifelse( x1index > 301, 301, x1index ) x2index <- round(1 + 300*(gendat[,2] - min(gden$y))/(max(gden$y) - min(gden$y))) x2index <- ifelse( x2index < 1, 1, x2index ) x2index <- ifelse( x2index > 301, 301, x2index ) pred.denrat <- rep(0, 10000) for (i in 1:10000) pred.denrat[i] <- denrat[x1index[i],x2index[i]] gen.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != gendat[,3] ) gen.err.kern.den # Now I'll try naive Bayes. alttrndat <- data.frame(cbind(trndat[,1],trndat[,2],factor(trndat[,3]))) names(alttrndat) <- c("x1","x2","g") levels(alttrndat$g) <- c("1","2") altgendat <- data.frame(cbind(gendat[,1],gendat[,2],factor(gendat[,3]))) names(altgendat) <- c("x1","x2","g") levels(altgendat$g) <- c("1","2") # I'll start with a version of the method that assumes # the marginal densities are normal. nbnorm <- NaiveBayes(g ~ ., data=alttrndat) pred.nbnorm <- predict(nbnorm, newdata=alttrndat) trn.err.nbnorm <- mean( pred.nbnorm$class != alttrndat[,3] ) trn.err.nbnorm # Est. error rate based on training data. pred.nbnorm <- predict(nbnorm, newdata=altgendat) gen.err.nbnorm <- mean( pred.nbnorm$class != altgendat[,3] ) gen.err.nbnorm # Est. error rate based on generalization data. # Next, I will use what I consider to be the standard # way of doing naive Bayes classification, with a kernel # density estimator being used to obtain a nonparametric # density estimate for each predictor. nbkern <- NaiveBayes(g ~ ., data=alttrndat, usekernel=TRUE) pred.nbkern <- predict(nbkern, newdata=alttrndat) tr.err.nbkern <- mean( pred.nbkern$class != alttrndat[,3] ) tr.err.nbkern # Est. error rate based on training data. pred.nbkern <- predict(nbkern, newdata=altgendat) gen.err.nbkern <- mean( pred.nbkern$class != altgendat[,3] ) gen.err.nbkern # Est. error rate based on generalization data. # I'll try ordinary nearest neighbors, using # train.kknn to select a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=50, kernel="rectangular", distance=2) plot(c(1:50),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates',col="maroon") which.min(cvknn$MISCLASS) # I'll use k = 39. pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=39,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,3]) gen.err.knn # Error rate for 39 nearest neighbors. # I'll try LDA. lda.fit <- lda(g ~ ., trndat) pred.lda <- predict(lda.fit, newdata=gendat) lda.rate <- mean( pred.lda$class != gendat[,3] ) lda.rate # LDA rate # Now I'll try QDA. qda.fit <- qda(g ~ ., trndat) pred.qda <- predict(qda.fit, newdata=gendat) qda.rate <- mean( pred.qda$class != gendat[,3] ) qda.rate # QDA rate ########################### SUMMARY ############################### # 1st Example (from other file) 2nd Example (this file) # Bayes rate 0.07865 0.23975 # kernel density 0.084 (0.02) 0.248 (0.16) # naive Bayes (normal model) 0.085 (0.03) 0.251 (0.20) # naive Bayes (kern. den. est.) 0.087 (0.03) 0.245 (0.18) # KNN (K selected by c-v) 0.084 (0.03) 0.243 (0.19) # LDA 0.085 0.244 # QDA 0.081 0.244 # With the normal class densities, it makes little difference which # method is used. Accuracy doesn't seem to appreciably suffer if we # don't assume normality (even though assumption is true in this case). # It'll be interesting to consider some nonnormal cases.