# Example of Global Dimension Reduction Method for # Nearest Neighbors from Subsection 13.4.2 of HFT2. # Developed using 2.9.0. library(MASS) library(class) library(kknn) # for kknn and train.kknn f'ns (for KNN) library(klaR) # for stepclass f'n (for stepwise LDA) library(mvtnorm) # for rmvnorm and dmvnorm (for generation # of multivariate normal observations and # computation of multivariate normal pdf # values) set.seed(7) # I'll generate data from three classes, using a multivariate # normal distribution for each class. The covariance matrix # will be the same for all of the classes. The means will # differ in the first two dimensions, but not the other two # dimensions. (So two of the four predictor variables are # pure noise.) Sig <- diag(1,4) # Sig is a variance-covariance matrix. Oxvals <- rmvnorm(100, mean=c(0,1,0,0), sigma=Sig) # O is used for the Orange class. Gxvals <- rmvnorm(100, mean=c(0,0,0,0), sigma=Sig) # G is used for the Green class. Bxvals <- rmvnorm(100, mean=c(1,0,0,0), sigma=Sig) # B is used for the Blue class. plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data (nonnoise variables)', xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4)) points(Gxvals[,1], Gxvals[,2], col="limegreen") points(Bxvals[,1], Bxvals[,2], col="royalblue") # There's quite a bit of overlap. ##### # I'll give response value of 0 for Orange class observations, # 1 for Green class observations, and 2 for the Blue class # observations, and then combine them together to form the # training data. Then I'll generate 1000 observations for each # class to serve as the generalization sample. Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Bclass <- cbind(2, Bxvals) trndat <- data.frame(rbind(Oclass, Gclass, Bclass)) names(trndat) <- c("g", "x1", "x2", "x3", "x4") Oxvals <- rmvnorm(1000, mean=c(0,1,0,0), sigma=Sig) Gxvals <- rmvnorm(1000, mean=c(0,0,0,0), sigma=Sig) Bxvals <- rmvnorm(1000, mean=c(1,0,0,0), sigma=Sig) Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Bclass <- cbind(2, Bxvals) gendat <- data.frame(rbind(Oclass, Gclass, Bclass)) names(gendat) <- c("g", "x1", "x2", "x3", "x4") ##### # I'll obtain an estimate of the Bayes error # rate using the large generalization sample. Opdf <-rep(0,3000) Gpdf <-rep(0,3000) Bpdf <-rep(0,3000) Opdf <- Opdf + dmvnorm(gendat[,2:5], c(0,1,0,0), Sig) Gpdf <- Gpdf + dmvnorm(gendat[,2:5], c(0,0,0,0), Sig) Bpdf <- Bpdf + dmvnorm(gendat[,2:5], c(1,0,0,0), Sig) pred.bayes <- ifelse(Gpdf > Bpdf, 1, 2) pred.bayes <- ifelse(Opdf > Gpdf & Opdf > Bpdf, 0, pred.bayes) est.bayes.rate <- mean(pred.bayes != gendat[,1]) est.bayes.rate # This is the estimated error rate of the Bayes classifier. # Below is a confusion matrix, with true class for the # rows and predicted class for the columns. The Green # class is the most difficult one to correctly predict. table(gendat[,1], pred.bayes) ##### # Before trying the method from subsection 13.4.2 of HTF2 # I'll try LDA using all of the variables, and then step- # wise LDA. lda.all <- lda(g ~ ., trndat) pred.lda.all <- predict(lda.all, newdata=gendat) lda.all.rate <- mean( pred.lda.all$class != gendat[,1] ) lda.all.rate # LDA rate (all predictors used). ldafit <- stepclass(g ~ ., data=trndat, method="lda") ldafit # Stepwise LDA selects the two nonnoise variables. lda.step <- lda(g ~ x1 + x2, trndat) pred.lda.step <- predict(lda.step, newdata=gendat) lda.step.rate <- mean( pred.lda.step$class != gendat[,1] ) lda.step.rate # LDA rate (using predictors chosen by stepwise procedure). # Eliminating the noise variables had little effect. # Whether or not they are used, LDA gets close to the # optimal Bayes error rate. ##### # Now I'll try ordinary nearest neighbors, using # train.kknn to select a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=150, kernel="rectangular", distance=2) plot(c(1:150),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.4,0.65),col="maroon") abline(est.bayes.rate, 0, col="purple") legend(100,0.65,legend=c("cross-val","Bayes"), fill=c("maroon","purple")) which.min(cvknn$MISCLASS) # We could go with the minimizing value of k, 87, # or we could mentally smooth the curve and select # a value like 105. ##### # Because the indicated value of k is curiously large, # and because the c-v results suggest getting close to # the Bayes rate is possible, I obtained estimates of # the generalization errors to see how KNN performs for # all values of k up to 150. I won't include that part # here because it takes a relatively long time to run, # but I can report that for many large values of k the # estimated error rate was pretty close to the Bayes rate. pred.gen.105nn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=105,kernel="rectangular") gen.err.105nn <- mean(pred.gen.105nn$fit != gendat[,1]) pred.gen.87nn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=187,kernel="rectangular") gen.err.87nn <- mean(pred.gen.87nn$fit != gendat[,1]) gen.err.105nn # Error rate for 105 nearest neighbors. gen.err.87nn # Error rate for 87 nearest neighbors. # (Close to LDA rate and the Bayes rate.) # Note: The estimated error rate for k = 65 # is 0.431 (same as estimated Bayes rate). ##### # Now I'll try the method from subsection 13.4.2 of HTF2. # Given that both LDA and ordinary nearest neighbors using # all of the predictors work so well, it'll be disappointing # if this method doesn't also work well. GlobDimRed <- function(traindata) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) n.class <- last.class - first.class + 1 n.var <- length(traindata[1,]) - 1 var.means <- matrix(0, nrow=n.var, ncol=n.class) var.sums <- matrix(0, nrow=n.var, ncol=n.class) class.n <- numeric(n.class) for (g in first.class:last.class) { cases <- traindata[,1] == g class.n[g - first.class + 1] <- sum(cases) classdata <- traindata[cases,] var.means[,g - first.class + 1] <- apply(classdata[,-1],2,mean) var.sums[,g - first.class + 1] <- apply(classdata[,-1],2,sum) } # var.means now contains the mean vectors for the # classes in its columns ... now I'll subtract the # grand mean from each column and put the result in # mean.deviations grand.mean <- apply(var.sums,1,sum)/sum(class.n) mean.deviations <- var.means - grand.mean # now I'll put the matrix given by (13.10) on p. 479 # of HTF2, which is also the matrix B on p. 477 of # HTF2, in B B <- matrix(0, nrow=n.var, ncol=n.var) for (g in first.class:last.class) { col.ind <- g - first.class + 1 B <- B + class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind] } B <- B/sum(class.n) # I'll put the eigenvalues and eigned vectors # of B in ev, and return this object ev <- eigen(B) } dim.red <- GlobDimRed(trndat) dim.red # Compared to the 1st two eignevalues, the other two # eigenvalues are extremely small. This suggests # that a good approximating subspace is the one- # dimensional subspace spanned by the 1st eignevector # (which is given by the first column in the matrix # shown just above), or the two-dimensional subspace # spanned by the first two eigenvectors. ##### # I'll create new data sets having predictors created # from the eignevectors. If just the first predictor # is used, it'll be as though we're using a predictor # obtained from projecting the original predictors into # the one-dimensional subspace spanned by the first # eigenvector. new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%ndat[,-1])%*%dim.red$vector)) names(new.gendat) <- c("g", "ev1", "ev2", "ev3", "ev4") # I'll use train.kknn to identify a good choice of k # to use for classifying the test set cases using the # training data. # (NOTE: If we wanted to use a subspace of dimension # greater than 1, train.kknn would not be good to use # since it automatically scales the predictors, which # would give too much influence to the less important # predictors. I.e., without scaling, the first eigen- # vector has the most influence in determining the # nearest neighbors, but if the predictors are scaled # this won't be the case.) new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], kmax=150, kernel="rectangular", distance=2) plot(c(1:150),new.cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.4,0.7),col="maroon") abline(est.bayes.rate, 0, col="purple") legend(108,0.52,legend=c("cross-val","Bayes"), fill=c("maroon","purple")) which.min(new.cvknn$MISCLASS) # Let's see what the error rate is if we use # 19 nearest neighbors in the one-dimensional # subspace. pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2], test=new.gendat[,1:2], k=19,kernel="rectangular") gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1]) gen.err.new # Error rate for 19 nearest neighbors (in subspace). # Quite horrible! ##### END #####