# Example of Global Dimension Reduction Method for # Nearest Neighbors from Subsection 13.4.2 of HFT2, # and Support Vector Machines from Ch. 12. # Developed using 2.9.0. library(MASS) library(class) library(kknn) # for kknn and train.kknn f'ns (for KNN) library(klaR) # for stepclass f'n (for stepwise LDA) library(e1071) # for svm f'n (for support vector machines) library(mvtnorm) # for rmvnorm and dmvnorm (for generation # of multivariate normal observations and # computation of multivariate normal pdf # values) set.seed(7) # I'll generate data from four classes, using a multivariate # normal distribution for each class. The covariance matrix # will be the same for all of the classes. The means will # differ in the first two dimensions, but not the other six # dimensions. (So six of the eight predictor variables are # pure noise.) Sig <- diag(1,8) Sig[1,2] <- 0.5 Sig[2,1] <- 0.5 Sig # Sig is a variance-covariance matrix. Oxvals <- rmvnorm(100, mean=c(-1.5,-1,0,0,0,0,0,0), sigma=Sig) # O is used for the Orange class. Gxvals <- rmvnorm(100, mean=c(-0.75,-0.5,0,0,0,0,0,0), sigma=Sig) # G is used for the Green class. Mxvals <- rmvnorm(100, mean=c(0.75,1,0,0,0,0,0,0), sigma=Sig) # M is used for the Maroon class. Bxvals <- rmvnorm(100, mean=c(1.5,1.5,0,0,0,0,0,0), sigma=Sig) # B is used for the Blue class. plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data (nonnoise variables)', xlab='x1',ylab='x2',xlim=c(-4.5,4.5),ylim=c(-4.5,4.5)) points(Gxvals[,1], Gxvals[,2], col="limegreen") points(Mxvals[,1], Mxvals[,2], col="maroon") points(Bxvals[,1], Bxvals[,2], col="royalblue") # There's quite a bit of overlap. ##### # I'll give response value of 0 for Orange class observations, # 1 for Green class observations, 2 for the Maroon class # observations, and 3 for the Blue class observations, and then # combine them together to form the training data. Then I'll # generate 1000 observations for each class to serve as the # generalization sample. Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Mclass <- cbind(2, Mxvals) Bclass <- cbind(3, Bxvals) trndat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass)) names(trndat) <- c("g", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8") Oxvals <- rmvnorm(1000, mean=c(-1.5,-1,0,0,0,0,0,0), sigma=Sig) # O is used for the Orange class. Gxvals <- rmvnorm(1000, mean=c(-0.75,-0.5,0,0,0,0,0,0), sigma=Sig) # G is used for the Green class. Mxvals <- rmvnorm(1000, mean=c(0.75,1,0,0,0,0,0,0), sigma=Sig) # M is used for the Maroon class. Bxvals <- rmvnorm(1000, mean=c(1.5,1.5,0,0,0,0,0,0), sigma=Sig) # B is used for the Blue class. Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Mclass <- cbind(2, Mxvals) Bclass <- cbind(3, Bxvals) gendat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass)) names(gendat) <- c("g", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8") ##### # I'll obtain an estimate of the Bayes error # rate using the large generalization sample. Opdf <- dmvnorm(gendat[,2:9], c(-1.5,-1,0,0,0,0,0,0), Sig) Gpdf <- dmvnorm(gendat[,2:9], c(-0.75,-0.5,0,0,0,0,0,0), Sig) Mpdf <- dmvnorm(gendat[,2:9], c(0.75,1,0,0,0,0,0,0), Sig) Bpdf <- dmvnorm(gendat[,2:9], c(1.5,1.5,0,0,0,0,0,0), Sig) pred.bayes <- ifelse(Gpdf > Mpdf, 1, 2) pred.bayes <- ifelse(Opdf > Gpdf & Opdf > Bpdf & Opdf > Mpdf, 0, pred.bayes) pred.bayes <- ifelse(Bpdf > Opdf & Bpdf > Gpdf & Bpdf > Mpdf, 3, pred.bayes) est.bayes.rate <- mean(pred.bayes != gendat[,1]) est.bayes.rate # This is the estimated error rate of the Bayes classifier. # Below is a confusion matrix, with true class for the # rows and predicted class for the columns. The Green # and Maroon classes are the most difficult one to # correctly predict. table(gendat[,1], pred.bayes) ##### # Before trying the method from subsection 13.4.2 of HTF2 # I'll try LDA using all of the variables, and then step- # wise LDA. lda.all <- lda(g ~ ., trndat) pred.lda.all <- predict(lda.all, newdata=gendat) lda.all.rate <- mean( pred.lda.all$class != gendat[,1] ) lda.all.rate # LDA rate (all predictors used). ldafit <- stepclass(g ~ ., data=trndat, method="lda") ldafit # Stepwise LDA selects only x2. lda.step <- lda(g ~ x2, trndat) pred.lda.step <- predict(lda.step, newdata=gendat) lda.step.rate <- mean( pred.lda.step$class != gendat[,1] ) lda.step.rate # LDA rate (using predictor chosen by stepwise procedure). # Not good! Stepwise LDA failed us. ##### # Now I'll try ordinary nearest neighbors, using # train.kknn to selct a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=150, kernel="rectangular", distance=2) plot(c(1:150),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.4,0.65),col="maroon") abline(est.bayes.rate, 0, col="purple") legend(100,0.65,legend=c("cross-val","Bayes"), fill=c("maroon","purple")) which.min(cvknn$MISCLASS) # Not looking good. I'll go with k = 11. pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=11,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1]) gen.err.knn # Error rate for 11 nearest neighbors. # (Not good.) ##### # Now I'll try the method from subsection 13.4.2 of HTF2. GlobDimRed <- function(traindata) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) n.class <- last.class - first.class + 1 n.var <- length(traindata[1,]) - 1 var.means <- matrix(0, nrow=n.var, ncol=n.class) var.sums <- matrix(0, nrow=n.var, ncol=n.class) class.n <- numeric(n.class) for (g in first.class:last.class) { cases <- traindata[,1] == g class.n[g - first.class + 1] <- sum(cases) classdata <- traindata[cases,] var.means[,g - first.class + 1] <- apply(classdata[,-1],2,mean) var.sums[,g - first.class + 1] <- apply(classdata[,-1],2,sum) } # var.means now contains the mean vectors for the # classes in its columns ... now I'll subtract the # grand mean from each column and put the result in # mean.deviations grand.mean <- apply(var.sums,1,sum)/sum(class.n) mean.deviations <- var.means - grand.mean # now I'll put the matrix given by (13.10) on p. 479 # of HTF2, which is also the matrix B on p. 477 of # HTF2, in B B <- matrix(0, nrow=n.var, ncol=n.var) for (g in first.class:last.class) { col.ind <- g - first.class + 1 B <- B + class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind] } B <- B/sum(class.n) # I'll put the eigenvalues and eigned vectors # of B in ev, and return this object ev <- eigen(B) } dim.red <- GlobDimRed(trndat) dim.red # Compared to the first three eignevalues, the others # are extremely small, and the 2nd and 3rd are much smaller # than the first. This suggests that a good approximating # subspace may be the one-dimensional subspace spanned by # the 1st eignevector, or the three-dimensional subspace # spanned by the first three eigenvectors. ##### # I'll create new data sets having predictors created # from the eignevectors. If just the first predictor # is used, it'll be as though we're using a predictor # obtained from projecting the original predictors into # the one-dimensional subspace spanned by the first # eigenvector. new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%dim.red$vector)) names(new.trndat) <- c("g","ev1","ev2","ev3","ev4","ev5","ev6","ev7","ev8") new.gendat <- data.frame(cbind(gendat[,1], as.matrix(gendat[,-1])%*%dim.red$vector)) names(new.gendat) <- c("g","ev1","ev2","ev3","ev4","ev5","ev6","ev7","ev8") # I'll first try the best one-dimensional approximating # subspace, using train.kknn to identify a good choice # of k to use for classifying the test set cases using # the training data. # (NOTE: If we wanted to use a subspace of dimension # greater than 1, train.kknn may not be good to use # since it automatically scales the predictors, which # would give too much influence to the less important # predictors. I.e., I think it'll work better if we # can give the first eigenvector the most influence in # determining the nearest neighbors, but if the predictors # are scaled this won't be the case.) new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], kmax=150, kernel="rectangular", distance=2) plot(c(1:150),new.cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.4,0.7),col="maroon") abline(est.bayes.rate, 0, col="purple") legend(108,0.7,legend=c("cross-val","Bayes"), fill=c("maroon","purple")) which.min(new.cvknn$MISCLASS) # Let's see what the error rate is if we use # 40 nearest neighbors in the one-dimensional # subspace. pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2],test=new.gendat[,1:2], k=43,kernel="rectangular") gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1]) gen.err.new # Error rate for 40 nearest neighbors (in 1-d subspace). # A decent improvement over ordinary nearest neighbors # using all of the variables, but not as good as LDA # (using all of the variables). (Of course the data was # generated in such a way as to make LDA well-suited for it.) ##### # Now I'll try using the best approximating three-dimensional # subspace. From below, it can be noted that the sample # variances of the new variables aren't all the same (even # though each eigenvector is of length 1). apply( new.trndat[,-1], 2, var ) # I won't use train.kknn to select a value of k # because scaling the three predictors would give the one based # on the second and third eigenvalues too much influence. Instead I'll # just keep k at 40 (thinking that the 2nd and 3rd predictors won't # change things that much (but hoping that it changes things # a bit for the better)). I'll use the knn function instead # of the kknn function because knn doesn't scale the predictors. pred.gen.newest <- knn(new.trndat[,2:4],test=new.gendat[,2:4], cl=new.trndat[,1],k=40) gen.err.newest <- mean(pred.gen.newest != new.gendat[,1]) gen.err.newest # Error rate for 40 nearest neighbors (in 3-d subspace). # Just a tad better. ##### # Now I will try some support vector machines. # The default kernel is the radial kernel. # Using the svm function, one has to supply a # value for the gamma parameter, or else rely # on the default, which is the inverse of the # number of predictors, or 1/8 in this case. # The help page stresses that one should tune, # and so a variety of gamma values should be # considered. The svm function has a cross- # validation feature which can be used to obtain # an estimate of generalization error rate from # the training sample for the particular value of # gamma which is currently being used. However, # even if the same random number seed is used, # the c-v estimates for a particular value of # gamma can differ from trial to trial. This # prevents from doing a "trial-and-error" search # for a good gamma and making comments here, # because when this script is run I can't know # in advance what the outcoems will be. So I'll # just try a bunch of different gamma values # and obtain c-v estimates of the error rates, as # well as estimates of the error rates using the # fitted models to make predictions for the large # generalization sample. num.trials <- 8 gamma.val <- c(1/16, 1/8, 3/16, 1/4, 5/16, 3/8, 7/16, 1/2) error.cv <- numeric(8) error.gen <- numeric(8) for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, gamma=gamma.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) ) } error.cv <- 1 - error.cv/100 plot(gamma.val,error.gen,type="l",xlab='gamma', ylab='proportion misclassified',main='Estimated Error Rates (SVM, radial kernel)', ylim=c(0.4,0.70),col="seagreen") points(gamma.val,error.cv,type="l",col="tomato") abline(est.bayes.rate, 0, col="purple") legend(0.15,0.7,legend=c("cross-val","generalization","Bayes"), fill=c("tomato","seagreen","purple")) ##### # Now I'll try using the polynomial kernel. To tune, # I'll start by considering various values for the # degree, keeping the other parameters at their default # settings. svm.poly2 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=2) summary(svm.poly2) svm.poly3 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=3) summary(svm.poly3) svm.poly4 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=4) summary(svm.poly4) svm.poly5 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=5) summary(svm.poly4) # C-v results indicate setting the degree to 3 # works best, but it doesn't look real promising. # Nevertheless, I'll try some different values of # gamma to see if I can get improvement. for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=3, gamma=gamma.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) ) } error.cv <- 1 - error.cv/100 plot(gamma.val,error.gen,type="l",xlab='gamma', ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 3 kernel)', ylim=c(0.4,0.85),col="seagreen") points(gamma.val,error.cv,type="l",col="tomato") abline(est.bayes.rate, 0, col="purple") legend(0.15,0.85,legend=c("cross-val","generalization","Bayes"), fill=c("tomato","seagreen","purple")) ##### # Before quitting, I'll try the linear kernel. svm.lin <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="linear") summary(svm.lin) # The c-v estimate isn't promising, but I'll check the # performance on the generalization set anyway. mean( predict(svm.lin, gendat) != as.factor(gendat[,1]) ) # Better than the c-v indicated it would be. # In fact, except for LDA, this is our best # result. Let's look at a plot to see if we # can figure out why. plot(svm.lin, trndat, x2 ~ x1) # Could it be that the linear kernel leads to a fit # similar to the LDA fit having linear boundaries? ##### End #####