# This example addresses the classification # setting dealt with in Sec. 2.3 of HTF2. # First I'll generate the training data as described on # pp. 16-17 of HTF2, except sample sizes for the two # classes are random (based on pi_1 = pi_2 = 1/2). # This will be done in a number of steps. # Step 1: For each class generate 10 observations from # the proper bivariate normal distribution to serve as # means for the components of the mixture distribution. # I will make use of some functions from various libraries # which may need to be installed if you haven't used them # yet. library(mvtnorm) # for generating multivariate normal values library(multinomRob) # for generating multinomial dist'n values library(klaR) # for naive Bayes classification library(kknn) # for knn with easy cross-validation ability, and weights # I'll set the seed for the random number generator. set.seed(3) Sig1 <- diag(1,2) # Sig1 is a variance-covariance matrix. # Let's look at it. Sig1 Bmeans <- rmvnorm(10, mean=c(1,0), sigma=Sig1) # B is used for the Blue class. Omeans <- rmvnorm(10, mean=c(0,1), sigma=Sig1) # O is used for the Orange class. # I'll plot the means. plot(Bmeans[,1],Bmeans[,2],col=4,main='Component Means',xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4)) points(Omeans[,1], Omeans[,2], col="orange") ### # Step 2: For each class, generate 10 counts which sum # to the randomly chosen sample sizes using a # multinomial (100,0.1,0.1,...,0.1) # distribution. n1 <- rbinom(1, 200, 0.5) n1 # Number of Class 1 observations in training sample. ### Bcounts <- rmultinomial(n1, rep(0.1,10)) Ocounts <- rmultinomial(200 - n1, rep(0.1,10)) # Step 3: using the component means generated in step 1, # and the counts generated in step 2 as sample sizes (for # the various components of the mixture distributions), # generate 100 training sample obervations for each class. Sig2 <- diag(0.2, 2) Bxvals <- matrix(0, n1, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oxvals <- matrix(0, 200 - n1, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } # Let's look at the training data. plot(Bxvals[,1],Bxvals[,2],col=4,main='Training Data',xlab='x1',ylab='x2',xlim=c(-3.5,4.5),ylim=c(-3.5,4.5)) points(Oxvals[,1], Oxvals[,2], col="orange") ### # Step 4: Give response value of 1 for Blue class observations, # and 2 for Orange class observations, then combine them together # to form the training data. Bclass <- cbind(Bxvals, 1) Oclass <- cbind(Oxvals, 2) trndat <- data.frame(rbind(Bclass, Oclass)) names(trndat) <- c("x1", "x2", "g") # This completes the generation of the training data. # Later we'll want to assess the accuracy of various classifiers # constructed from the training data. For this purpose I'll create # another data set, which we can call the generalization set. It'll # consist of 1000 additional observations from each of the two classes. # (Note: The component means are the ones used to create the training data.) Bcounts <- rmultinomial(1000, rep(0.1,10)) Ocounts <- rmultinomial(1000, rep(0.1,10)) Bxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } Bclass <- cbind(Bxvals, 1) Oclass <- cbind(Oxvals, 2) gendat <- data.frame(rbind(Bclass, Oclass)) names(gendat) <- c("x1", "x2", "g") ### # I'll use two-dimensional kernel density estimates # to classify by directly approximating the Bayes # classifier. x1limL <- min(trndat[,1]) - 2 x1limU <- max(trndat[,1]) + 2 x2limL <- min(trndat[,2]) - 2 x2limU <- max(trndat[,2]) + 2 bden <- kde2d(trndat[1:n1,1], trndat[1:n1,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU)) oden <- kde2d(trndat[(n1+1):200,1], trndat[(n1+1):200,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU)) # Now let's look at the decision boundary # (on which *weighted* densities are equal). denrat <- (200 - n1)*oden$z/(n1*bden$z) contour(bden$x, bden$y, denrat, levels=c(0.5,1,2)) ### x1index <- round(1 + 300*(trndat[,1] - min(bden$x))/(max(bden$x) - min(bden$x))) x2index <- round(1 + 300*(trndat[,2] - min(bden$y))/(max(bden$y) - min(bden$y))) pred.denrat <- rep(0, 200) for (i in 1:200) pred.denrat[i] <- denrat[x1index[i],x2index[i]] trn.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != trndat[,3] ) trn.err.kern.den # Let's now get a better estimate of the error rate for # this density estimation method using the generalization # sample. # Note: One has to take care that the subscript does not # fall outside of the set {1, 2, 3, ..., 300, 301}. x1index <- round(1 + 300*(gendat[,1] - min(bden$x))/(max(bden$x) - min(bden$x))) x1index <- ifelse( x1index < 1, 1, x1index ) x1index <- ifelse( x1index > 301, 301, x1index ) x2index <- round(1 + 300*(gendat[,2] - min(bden$y))/(max(bden$y) - min(bden$y))) x2index <- ifelse( x2index < 1, 1, x2index ) x2index <- ifelse( x2index > 301, 301, x2index ) pred.denrat <- rep(0, 2000) for (i in 1:2000) pred.denrat[i] <- denrat[x1index[i],x2index[i]] gen.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != gendat[,3] ) gen.err.kern.den # Now I'll try naive Bayes. alttrndat <- data.frame(cbind(trndat[,1],trndat[,2],factor(trndat[,3]))) names(alttrndat) <- c("x1","x2","g") levels(alttrndat$g) <- c("1","2") altgendat <- data.frame(cbind(gendat[,1],gendat[,2],factor(gendat[,3]))) names(altgendat) <- c("x1","x2","g") levels(altgendat$g) <- c("1","2") # I'll start with a version of the method that assumes # the marginal densities are normal. nbnorm <- NaiveBayes(g ~ ., data=alttrndat) pred.nbnorm <- predict(nbnorm, newdata=alttrndat) trn.err.nbnorm <- mean( pred.nbnorm$class != alttrndat[,3] ) trn.err.nbnorm # Est. error rate based on training data. pred.nbnorm <- predict(nbnorm, newdata=altgendat) gen.err.nbnorm <- mean( pred.nbnorm$class != altgendat[,3] ) gen.err.nbnorm # Est. error rate based on generalization data. # Next, I will use what I consider to be the standard # way of doing naive Bayes classification, with a kernel # density estimator being used to obtain a nonparametric # density estimate for each predictor. nbkern <- NaiveBayes(g ~ ., data=alttrndat, usekernel=TRUE) pred.nbkern <- predict(nbkern, newdata=alttrndat) tr.err.nbkern <- mean( pred.nbkern$class != alttrndat[,3] ) tr.err.nbkern # Est. error rate based on training data. pred.nbkern <- predict(nbkern, newdata=altgendat) gen.err.nbkern <- mean( pred.nbkern$class != altgendat[,3] ) gen.err.nbkern # Est. error rate based on generalization data. # I'll try ordinary nearest neighbors, using # train.kknn to select a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=75, kernel="rectangular", distance=2) plot(c(1:75),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates',col="maroon") which.min(cvknn$MISCLASS) # I'll use k = 21. pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=21,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,3]) gen.err.knn # Error rate for 21 nearest neighbors. # I'll try LDA. lda.fit <- lda(g ~ ., trndat) pred.lda <- predict(lda.fit, newdata=gendat) lda.rate <- mean( pred.lda$class != gendat[,3] ) lda.rate # LDA rate # Now I'll try QDA. qda.fit <- qda(g ~ ., trndat) pred.qda <- predict(qda.fit, newdata=gendat) qda.rate <- mean( pred.qda$class != gendat[,3] ) qda.rate # QDA rate ########################### SUMMARY ############################### # Bayes rate ?????? (Can you figure out how to do it? (I can!)) # kernel density 0.1350 (0.115) # naive Bayes (normal model) 0.1315 (0.125) # naive Bayes (kern. den. est.) 0.1375 (0.135) # KNN (K selected by c-v) 0.1300 (0.125) # LDA 0.1305 # QDA 0.1295