# Example of Global Dimension Reduction Method for # Nearest Neighbors from Subsection 13.4.2 of HFT2, # and Some Home-Grown Variable Selection Methods # Developed using 2.9.0. library(MASS) library(class) library(kknn) # for kknn and train.kknn f'ns (for KNN) library(klaR) # for stepclass f'n (for stepwise LDA) library(mvtnorm) # for rmvnorm and dmvnorm (for generation # of multivariate normal observations and # computation of multivariate normal pdf # values) set.seed(7) # I'll generate data from four classes, using a multivariate # normal distribution for each class. The covariance matrix # will be the same for all of the classes. The means will # differ in the first two dimensions, but not the other two # dimensions. (So two of the four predictor variables are # pure noise.) Sig <- diag(1,4) Sig[1,2] <- 0.5 Sig[2,1] <- 0.5 Sig # Sig is a variance-covariance matrix. Oxvals <- rmvnorm(1200, mean=c(-1.5,-1,0,0), sigma=Sig) # O is used for the Orange class. Gxvals <- rmvnorm(1200, mean=c(-0.75,-0.5,0,0), sigma=Sig) # G is used for the Green class. Mxvals <- rmvnorm(1200, mean=c(0.75,1,0,0), sigma=Sig) # M is used for the Maroon class. Bxvals <- rmvnorm(1200, mean=c(1.5,1.5,0,0), sigma=Sig) # B is used for the Blue class. # I'll give response value of 0 for Orange class observations, # 1 for Green class observations, 2 for the Maroon class # observations, and 3 for the Blue class observations. Then I'll put # the last 200 observations for each class in a data frame to serve # as the training samle, and use the rest of the data as a generalization # sample. Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Mclass <- cbind(2, Mxvals) Bclass <- cbind(3, Bxvals) trndat <- data.frame(rbind(Oclass[1001:1200,], Gclass[1001:1200,], Mclass[1001:1200,], Bclass[1001:1200,])) names(trndat) <- c("g", "x1", "x2", "x3", "x4") gendat <- data.frame(rbind(Oclass[1:1000,], Gclass[1:1000,], Mclass[1:1000,], Bclass[1:1000,])) names(gendat) <- c("g", "x1", "x2", "x3", "x4") plot(Oxvals[1001:1200,1],Oxvals[1001:1200,2],col="darkorange", main='Training Data (nonnoise variables)', xlab='x1',ylab='x2',xlim=c(-4.5,4.5),ylim=c(-4.5,4.5)) points(Gxvals[1001:1200,1], Gxvals[1001:1200,2], col="limegreen") points(Mxvals[1001:1200,1], Mxvals[1001:1200,2], col="maroon") points(Bxvals[1001:1200,1], Bxvals[1001:1200,2], col="royalblue") # There's quite a bit of overlap. ##### # I'll obtain an estimate of the Bayes error # rate using the large generalization sample. Opdf <- dmvnorm(gendat[,2:5], c(-1.5,-1,0,0), Sig) Gpdf <- dmvnorm(gendat[,2:5], c(-0.75,-0.5,0,0), Sig) Mpdf <- dmvnorm(gendat[,2:5], c(0.75,1,0,0), Sig) Bpdf <- dmvnorm(gendat[,2:5], c(1.5,1.5,0,0), Sig) pred.bayes <- ifelse(Gpdf > Mpdf, 1, 2) pred.bayes <- ifelse(Opdf > Gpdf & Opdf > Bpdf & Opdf > Mpdf, 0, pred.bayes) pred.bayes <- ifelse(Bpdf > Opdf & Bpdf > Gpdf & Bpdf > Mpdf, 3, pred.bayes) est.bayes.rate <- mean(pred.bayes != gendat[,1]) est.bayes.rate # This is the estimated error rate of the Bayes classifier. # Below is a confusion matrix, with true class for the # rows and predicted class for the columns. The Green # and Maroon classes are the most difficult one to # correctly predict. table(gendat[,1], pred.bayes) ##### # Before trying the method from subsection 13.4.2 of HTF2 # I'll try LDA using all of the variables, and then step- # wise LDA. lda.all <- lda(g ~ ., trndat) pred.lda.all <- predict(lda.all, newdata=gendat) lda.all.rate <- mean( pred.lda.all$class != gendat[,1] ) lda.all.rate # LDA rate (all predictors used). ldafit <- stepclass(g ~ ., data=trndat, method="lda") ldafit # Stepwise LDA selects only x1. lda.step <- lda(g ~ x1, trndat) pred.lda.step <- predict(lda.step, newdata=gendat) lda.step.rate <- mean( pred.lda.step$class != gendat[,1] ) lda.step.rate # LDA rate (using predictor chosen by stepwise procedure). # Worse than using all predictors. Stepwise LDA failed. ##### # Now I'll try ordinary nearest neighbors, using # train.kknn to selct a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=200, kernel="rectangular", distance=2) plot(c(1:200),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.4,0.65),col="maroon") abline(est.bayes.rate, 0, col="purple") legend(100,0.65,legend=c("cross-val","Bayes"), fill=c("maroon","purple")) which.min(cvknn$MISCLASS) # Not looking great. I'll go with k = 131. pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=131,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1]) gen.err.knn # Error rate for 131 nearest neighbors. # (Not good compared to LDA, but this # data is well-suited for LDA.) ##### # Now I'll try the method from subsection 13.4.2 of HTF2. between.cov <- function(data) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. { first.class <- min(data[,1]) last.class <- max(data[,1]) n.class <- last.class - first.class + 1 n.var <- length(data[1,]) - 1 var.means <- matrix(0, nrow=n.var, ncol=n.class) var.sums <- matrix(0, nrow=n.var, ncol=n.class) class.n <- numeric(n.class) for (g in first.class:last.class) { cases <- data[,1] == g class.n[g-first.class+1] <- sum(cases) classdata <- data[cases,] var.means[,g-first.class+1] <- apply(classdata[,-1],2,mean) var.sums[,g-first.class+1] <- apply(classdata[,-1],2,sum) } # var.means now contains the mean vectors for the # classes in its columns ... now I'll subtract the # grand mean from each column and put the result in # mean.deviations grand.mean <- apply(var.sums,1,sum)/sum(class.n) mean.deviations <- var.means - grand.mean # now I'll put the matrix given by (13.10) on p. 479 # of HTF2, which is also the matrix B on p. 477 of # HTF2, in B B <- matrix(0, nrow=n.var, ncol=n.var) for (g in first.class:last.class) { col.ind <- g - first.class + 1 B <- B + class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind] } B <- B/sum(class.n) B } GlobDimRed2 <- function(data) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. # (Note: This function, GlobDimRed2, requires my between.cov # function. I have another function, GlobDimRed, that has # the between.cov part already incorporated in it.) { B <- between.cov(data) # I'll put the eigenvalues and eigned vectors # of B in ev, and return this object ev <- eigen(B) ev } dim.red <- GlobDimRed2(trndat) dim.red # Compared to the first three eignevalues, the 4th one is # extremely small, and the 2nd and 3rd are much smaller # than the first. This suggests that a good approximating # subspace may be the one-dimensional subspace spanned by # the 1st eignevector, or the three-dimensional subspace # spanned by the first three eigenvectors. (The first # eigenvector gives great weight to the first two predictors # and so it *should* do well as a predictor.) ##### # I'll create new data sets having predictors created # from the eignevectors. If just the first predictor # is used, it'll be as though we're using a predictor # obtained from projecting the original predictors into # the one-dimensional subspace spanned by the first # eigenvector. new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%dim.red$vector)) names(new.trndat) <- c("g","ev1","ev2","ev3","ev4") new.gendat <- data.frame(cbind(gendat[,1], as.matrix(gendat[,-1])%*%dim.red$vector)) names(new.gendat) <- c("g","ev1","ev2","ev3","ev4") # I'll first try the best one-dimensional approximating # subspace, using train.kknn to identify a good choice # of k to use for classifying the test set cases using # the training data. # (NOTE: If we wanted to use a subspace of dimension # greater than 1, train.kknn may not be good to use # since it automatically scales the predictors, which # would give too much influence to the less important # predictors. I.e., I think it'll work better if we # can give the first eigenvector the most influence in # determining the nearest neighbors, but if the predictors # are scaled this won't be the case.) new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], kmax=200, kernel="rectangular", distance=2) plot(c(1:200),new.cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.4,0.7),col="maroon") abline(est.bayes.rate, 0, col="purple") legend(108,0.7,legend=c("cross-val","Bayes"), fill=c("maroon","purple")) which.min(new.cvknn$MISCLASS) # Let's see what the error rate is if we use # 22 nearest neighbors in the one-dimensional # subspace. pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2],test=new.gendat[,1:2], k=22,kernel="rectangular") gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1]) gen.err.new # Error rate for 22 nearest neighbors (in 1-d subspace). # Worse ordinary nearest neighbors using all of the variables, # and also worse than LDA. # Now I'll try using the best approximating three-dimensional # subspace. From below, it can be noted that the sample # variances of the new variables aren't all the same. apply( new.trndat[,-1], 2, var ) # I won't use train.kknn to select a value of k # because scaling the three predictors would give the one based # on the second and third eigenvalues too much influence. Instead I'll # just keep k at 22 (thinking that the 2nd and 3rd predictors won't # change things that much (but hoping that it changes things # a bit for the better)). I'll use the knn function instead # of the kknn function because knn doesn't scale the predictors. pred.gen.newest <- knn(new.trndat[,2:4],test=new.gendat[,2:4], cl=new.trndat[,1],k=22) gen.err.newest <- mean(pred.gen.newest != new.gendat[,1]) gen.err.newest # Error rate for 22 nearest neighbors (in 3-d subspace). # A tad better (the best of the nearest neighbors results # so far). ##### ################################################################ # # # NOTE: This part takes a LOT of time to run. So you might # # want to skip running it and just look at the results # # I got, which are presented below this chunk of code. # # # ################################################################ # Now for the DANN method (see p. 477 of HTF2). # Here is a function to find square root of # the inverse of a matrix. sqrt.inv.mat <- function( mat ) # won't work for all matrices { eigen.mat <- eigen( mat ) q <- eigen.mat$vectors inv.sqrt.lamb <- diag( sqrt( 1/eigen.mat$values ) ) sr <- ( q%*%inv.sqrt.lamb )%*%t(q) sr } # Here is a function to find the the pooled # within-class covariance matrix (see p. 477 # of HTF2). First column of data matrix # should have groups identified with consecutive # integers. pooled.within.cov <- function( data ) { first.class <- min( data[,1] ) last.class <- max( data[,1] ) var.cov <- 0 for ( g in first.class:last.class ) { cases <- data[,1] == g if (sum(cases) < 2) var.cov <- var.cov + 0 else var.cov <- var.cov + ( sum( cases )/length( data[,1] ) )*cov( data[cases,-1] ) } var.cov } # Here are a pair of functions that determine the # Sigma given by (13.9) on p. 477 of HTF2. (Based # on comment in HTF2, the default value of epsilon # is 1.) DANN.Sig <- function( data, epsilon=1 ) { w.neg.half <- sqrt.inv.mat( pooled.within.cov( data ) ) b <- between.cov( data ) b.star <- w.neg.half%*%b%*%w.neg.half n.var <- length( data[1,] ) - 1 sig <- w.neg.half%*%( b.star + diag(epsilon, n.var) )%*%w.neg.half } local.Sig <- function( target.point, data, num.neighbors=num.neighbors ) { # data should have class indicator in 1st column, # and predictor variables in remaining columns. # target.point should just have coordinates of # target point. Based on a comment in HFT2, the # default number of nearest neighbors used is 50. distances <- as.matrix(dist(rbind( target.point, data[,-1]))) closest.training.points <- rank( distances[-1,1] ) < num.neighbors + 1 loc.sig <- DANN.Sig( data[closest.training.points,] ) loc.sig } # This function determines the distances given by # (13.8) on p. 477 of HTF2. local.dist <- function( target.point, data, num.neighbors=num.neighbors ) { x.mat <- t( data[,-1] ) # class variable should # be in 1st col of data diffs <- x.mat - as.vector( t( target.point ) ) distances <- diag( ( t( diffs )%*%local.Sig( target.point, data, num.neighbors ) )%*%diffs ) distances } # This function determines predicted class of a target point. pred.class <- function( target.point, data, num.neighbors=num.neighbors, k=k ) { dists <- local.dist( target.point, data, num.neighbors ) near.neighbors <- rank( dists ) < k + 1 first.class <- min( data[,1] ) last.class <- max( data[,1] ) num.classes <- last.class - first.class + 1 classes <- numeric(num.classes) for (g in first.class:last.class ) classes[g - first.class + 1] <- sum( data[near.neighbors,1] == g ) pred <- which.max( classes ) - first.class - 1 pred } DANN <- function( train.data, test.data, num.neighbors=50, k=15 ) { num.cases <- length( test.data[,1] ) predictions <- numeric(num.cases) for (i in 1:num.cases ) predictions[i] <- pred.class( test.data[i,-1], train.data, num.neighbors=num.neighbors, k=k ) predictions } pred <- DANN( trndat, gendat ) DANN.err <- mean(pred != gendat[,1]) DANN.err # error rate for DANN method # (not good!) # I'll try it again using only 7 nearest neighbors. # (Previous run used default of 15.) pred <- DANN( trndat, gendat, k=7 ) DANN.7.err <- mean(pred != gendat[,1]) DANN.7.err # error rate for DANN method (w/ k = 7) # Worse. # Now I'll try it using 25 nearest neighbors. pred <- DANN( trndat, gendat, k=25 ) DANN.25.err <- mean(pred != gendat[,1]) DANN.25.err # error rate for DANN method (w/ k = 25) # --- best of the DANN results ######################################################## # SUMMARY # # error method # ----- ------ # # 0.462 LDA (all variables) # 0.471 stepwise LDA (x1 chosen) # # 0.476 131 NN (all variables, 131 chosen using c-v) # 0.483 22 NN (using 1-d subspace, 22 chosen by c-v) # 0.475 22 NN (using 3-d subspace) # # 0.504 DANN (k = 7) # 0.526 DANN (k = 15) # 0.480 DANN (k = 25) # # # 0.449 Bayes rate # ######################################################## ##### set.seed(12) # (resetting the seed so it won't matter # whether or not the DANN stuff is run) # Let's try a backwards elimination approach to eliminate # one variable. error.rate <- numeric(4) est.error.rate <- numeric(4) for (v in 1:4) { cvknn <- train.kknn(as.factor(g) ~ ., trndat[,-(v+1)], kmax=150, kernel="rectangular", distance=2) est.error.rate[v] <- cvknn$MISCLASS[which.min(cvknn$MISCLASS)] pred <- kknn(as.factor(g)~.,train=trndat[,-(v+1)],test=gendat[,-(v+1)], k=which.min(cvknn$MISCLASS),kernel="rectangular") error.rate[v] <- mean(pred$fit != gendat[,1]) } results <- cbind(c(1:4),est.error.rate,error.rate) results # Hmm ... c-v indicates that I should take out the 2nd # predictor even though the 3rd and 4th predictors were # the noise variables. The last column shows that # eliminating the 3rd predictor would have really been # more effective. # Before quitting, I'm just going to examine the error rate # for the case of both noise predictors not being used (using # the k selected when all of the predictors were used). pred <- kknn(as.factor(g)~.,train=trndat[,-c(4:5)],test=gendat[,-c(4:5)], k=131,kernel="rectangular") err.rate <- mean(pred$fit != gendat[,1]) err.rate ### End for Now ### ##### END #####