# Methods for Variable Selection / Weighting # Part II # Developed using 2.9.0. library(MASS) library(class) library(kknn) # for kknn and train.kknn f'ns (for KNN) library(klaR) # for stepclass f'n (for stepwise LDA) library(e1071) # for svm f'n (for support vector machines) library(mvtnorm) # for rmvnorm and dmvnorm (for generation # of multivariate normal observations and # computation of multivariate normal pdf # values) set.seed(7) # I'll generate data for four classes. The means will # differ in the first two dimensions, but not the other three # dimensions. (So three of the five predictor variables are # pure noise.) Sig <- diag(1,5) Sig[1,2] <- 0.6 Sig[2,1] <- 0.6 Sig[3,4] <- 0.4 Sig[4,3] <- 0.4 Oxvals <- rmvnorm(100, mean=c(3.5,3,5,5,5), sigma=Sig) # O is used for the Orange class. Gxvals <- (0.8*rmvnorm(100, mean=c(3.25,3.75,6.25,6.25,6.25), sigma=Sig)) # G is used for the Green class. Mxvals <- (0.8*rmvnorm(100, mean=c(4.75,4,6.25,6.25,6.25), sigma=Sig)) # M is used for the Maroon class. Bxvals <- rmvnorm(100, mean=c(5.5,5.5,5,5,5), sigma=Sig) # B is used for the Blue class. Gxvals[,2] <- Gxvals[,2]^0.75 Mxvals[,2] <- Mxvals[,2]^1.5 plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data (nonnoise variables)', xlab='x1',ylab='x2',xlim=c(-1,9),ylim=c(-1,12.5)) points(Gxvals[,1], Gxvals[,2], col="limegreen") points(Mxvals[,1], Mxvals[,2], col="maroon") points(Bxvals[,1], Bxvals[,2], col="royalblue") ##### # I'll give response value of 1 for Orange class observations, # 2 for Green class observations, 3 for the Maroon class # observations, and 4 for the Blue class observations, and then # combine them together to form the training data. Then I'll # generate 250 observations for each class to serve as the # generalization sample. Oclass <- cbind(1, Oxvals) Gclass <- cbind(2, Gxvals) Mclass <- cbind(3, Mxvals) Bclass <- cbind(4, Bxvals) trndat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass)) names(trndat) <- c("g", "x1", "x2", "x3", "x4", "x5") trndat[,5] <- trndat[,5]^0.75 trndat[,6] <- trndat[,6]^1.5 Oxvals <- rmvnorm(250, mean=c(3.5,3,5,5,5), sigma=Sig) # O is used for the Orange class. Gxvals <- (0.8*rmvnorm(250, mean=c(3.25,3.75,6.25,6.25,6.25), sigma=Sig)) # G is used for the Green class. Mxvals <- (0.8*rmvnorm(250, mean=c(4.75,4,6.25,6.25,6.25), sigma=Sig)) # M is used for the Maroon class. Bxvals <- rmvnorm(250, mean=c(5.5,5.5,5,5,5), sigma=Sig) # B is used for the Blue class. Oclass <- cbind(1, Oxvals) Gxvals[,2] <- Gxvals[,2]^0.75 Gclass <- cbind(2, Gxvals) Mxvals[,2] <- Mxvals[,2]^1.5 Mclass <- cbind(3, Mxvals) Bclass <- cbind(4, Bxvals) gendat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass)) names(gendat) <- c("g", "x1", "x2", "x3", "x4", "x5") gendat[,5] <- gendat[,5]^0.75 gendat[,6] <- gendat[,6]^1.5 ##### # Before trying the method from subsection 13.4.2 of HTF2 # I'll try LDA using all of the variables, and then step- # wise LDA. lda.all <- lda(g ~ ., trndat) pred.lda.all <- predict(lda.all, newdata=gendat) lda.all.rate <- mean( pred.lda.all$class != gendat[,1] ) lda.all.rate # LDA rate (all predictors used). ldafit <- stepclass(g ~ ., data=trndat, method="lda") ldafit # Stepwise LDA (correctly) selects the two meaningful variables. lda.step <- lda(g ~ x1 + x2, trndat) pred.lda.step <- predict(lda.step, newdata=gendat) lda.step.rate <- mean( pred.lda.step$class != gendat[,1] ) lda.step.rate # LDA rate (using predictor chosen by stepwise procedure). # A tad worse! ##### # I'll now try SVM using the linear kernel. svm.lin <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="linear") summary(svm.lin) mean( predict(svm.lin, gendat) != as.factor(gendat[,1]) ) plot(svm.lin, trndat, x2 ~ x1) ##### # Now I'll try QDA. qda.all <- qda(g ~ ., trndat) pred.qda.all <- predict(qda.all, newdata=gendat) qda.all.rate <- mean( pred.qda.all$class != gendat[,1] ) qda.all.rate # QDA rate (all predictors used). qdafit <- stepclass(g ~ ., data=trndat, method="qda") qdafit # Stepwise QDA (correctly) selects the 2 useful variables. qda.step <- qda(g ~ x1 + x2, trndat) pred.qda.step <- predict(qda.step, newdata=gendat) qda.step.rate <- mean( pred.qda.step$class != gendat[,1] ) qda.step.rate # QDA rate (using predictor chosen by stepwise procedure). # Just a wee tad better (2 more correct out of 1000). ##### # Now I'll try ordinary nearest neighbors, using # train.kknn to selct a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=100, kernel="rectangular", distance=2) plot(c(1:100),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.25,0.45),col="tomato") which.min(cvknn$MISCLASS) # I'll go with k = 19. pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=19,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1]) gen.err.knn # Error rate for 19 nearest neighbors. # Worst so far. # I'll now try using k = 40. pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=40,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1]) gen.err.knn # Error rate for 40 nearest neighbors. # Even worse. ##### # Now I'll try the method from subsection 13.4.2 of HTF2. GlobDimRed <- function(traindata) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) n.class <- last.class - first.class + 1 n.var <- length(traindata[1,]) - 1 var.means <- matrix(0, nrow=n.var, ncol=n.class) var.sums <- matrix(0, nrow=n.var, ncol=n.class) class.n <- numeric(n.class) for (g in first.class:last.class) { cases <- traindata[,1] == g class.n[g - first.class + 1] <- sum(cases) classdata <- traindata[cases,] var.means[,g - first.class + 1] <- apply(classdata[,-1],2,mean) var.sums[,g - first.class + 1] <- apply(classdata[,-1],2,sum) } # var.means now contains the mean vectors for the # classes in its columns ... now I'll subtract the # grand mean from each column and put the result in # mean.deviations grand.mean <- apply(var.sums,1,sum)/sum(class.n) mean.deviations <- var.means - grand.mean # now I'll put the matrix given by (13.10) on p. 479 # of HTF2, which is also the matrix B on p. 477 of # HTF2, in B B <- matrix(0, nrow=n.var, ncol=n.var) for (g in first.class:last.class) { col.ind <- g - first.class + 1 B <- B + class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind] } B <- B/sum(class.n) # I'll put the eigenvalues and eigned vectors # of B in ev, and return this object ev <- eigen(B) } dim.red <- GlobDimRed(trndat) dim.red # Compared to the first two eignevalues, the others # are extremely small, and the 2nd is much smaller # than the first. This suggests that a good approximating # subspace may be the one-dimensional subspace spanned by # the 1st eignevector, or possibly a two-dimensional subspace # spanned by the first two eigenvectors. ##### # I'll create new data sets having predictors created # from the eignevectors. If just the first predictor # is used, it'll be as though we're using a predictor # obtained from projecting the original predictors into # the one-dimensional subspace spanned by the first # eigenvector. new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%dim.red$vector)) names(new.trndat) <- c("g","ev1","ev2","ev3","ev4","ev5") new.gendat <- data.frame(cbind(gendat[,1], as.matrix(gendat[,-1])%*%dim.red$vector)) names(new.gendat) <- c("g","ev1","ev2","ev3","ev4","ev5") # I'll first try the best one-dimensional approximating # subspace, using train.kknn to identify a good choice # of k to use for classifying the test set cases using # the training data. # # (NOTE: If we wanted to use a subspace of dimension # greater than 1, train.kknn may not be good to use # since it automatically scales the predictors, which # would give too much influence to the less important # predictors. I.e., I think it'll work better if we # can give the first eigenvector the most influence in # determining the nearest neighbors, but if the predictors # are scaled this won't be the case.) new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], kmax=150, kernel="rectangular", distance=2) plot(c(1:150),new.cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.35,0.55),col="maroon") which.min(new.cvknn$MISCLASS) # Let's see what the error rate is if we use # 25 nearest neighbors in the one-dimensional # subspace. pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2],test=new.gendat[,1:2], k=25,kernel="rectangular") gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1]) gen.err.new # Error rate for 25 nearest neighbors (in 1-d subspace). # New worst so far. ##### # Now I'll try using the best approximating two-dimensional subspace. # I won't use train.kknn to select a value of k # because scaling the two predictors would give the one based # on the second eigenvalue too much influence. Instead I'll # just keep k at 25 (thinking that the 2nd predictor won't # change things that much (but hoping that it changes things # a bit for the better)). I'll use the knn function instead # of the kknn function because knn doesn't scale the predictors. pred.gen.newest <- knn(new.trndat[,2:3],test=new.gendat[,2:3], cl=new.trndat[,1],k=25) gen.err.newest <- mean(pred.gen.newest != new.gendat[,1]) gen.err.newest # Error rate for 25 nearest neighbors (in 2-d subspace). # Slightly better than KNN based on all variables. ##### # Now for the DANN method (see p. 477 of HTF2). # Here is a function to find square root of # the inverse of a matrix. sqrt.inv.mat <- function( mat ) # won't work for all matrices { eigen.mat <- eigen( mat ) q <- eigen.mat$vectors inv.sqrt.lamb <- diag( sqrt( 1/eigen.mat$values ) ) sr <- ( q%*%inv.sqrt.lamb )%*%t(q) sr } # Here is a function to find the the pooled # within-class covariance matrix (see p. 477 # of HTF2). First column of data matrix # should have groups identified with consecutive # integers. pooled.within.cov <- function( data ) { first.class <- min( data[,1] ) last.class <- max( data[,1] ) var.cov <- 0 for ( g in first.class:last.class ) { cases <- data[,1] == g if (sum(cases) < 2) var.cov <- var.cov + 0 else var.cov <- var.cov + ( sum( cases )/length( data[,1] ) )*cov( data[cases,-1] ) } var.cov } # Here are a pair of functions that determine the # Sigma given by (13.9) on p. 477 of HTF2. (Based # on comment in HTF2, the default value of epsilon # is 1.) between.cov <- function(data) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. { first.class <- min(data[,1]) last.class <- max(data[,1]) n.class <- last.class - first.class + 1 n.var <- length(data[1,]) - 1 var.means <- matrix(0, nrow=n.var, ncol=n.class) var.sums <- matrix(0, nrow=n.var, ncol=n.class) class.n <- numeric(n.class) for (g in first.class:last.class) { cases <- data[,1] == g class.n[g-first.class+1] <- sum(cases) classdata <- data[cases,] var.means[,g-first.class+1] <- apply(classdata[,-1],2,mean) var.sums[,g-first.class+1] <- apply(classdata[,-1],2,sum) } # var.means now contains the mean vectors for the # classes in its columns ... now I'll subtract the # grand mean from each column and put the result in # mean.deviations grand.mean <- apply(var.sums,1,sum)/sum(class.n) mean.deviations <- var.means - grand.mean # now I'll put the matrix given by (13.10) on p. 479 # of HTF2, which is also the matrix B on p. 477 of # HTF2, in B B <- matrix(0, nrow=n.var, ncol=n.var) for (g in first.class:last.class) { col.ind <- g - first.class + 1 B <- B + class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind] } B <- B/sum(class.n) B } DANN.Sig <- function( data, epsilon=1 ) { w.neg.half <- sqrt.inv.mat( pooled.within.cov( data ) ) b <- between.cov( data ) b.star <- w.neg.half%*%b%*%w.neg.half n.var <- length( data[1,] ) - 1 sig <- w.neg.half%*%( b.star + diag(epsilon, n.var) )%*%w.neg.half } local.Sig <- function( target.point, data, num.neighbors=num.neighbors ) { # data should have class indicator in 1st column, # and predictor variables in remaining columns. # target.point should just have coordinates of # target point. Based on a comment in HFT2, the # default number of nearest neighbors used is 50. distances <- as.matrix(dist(rbind( target.point, data[,-1]))) closest.training.points <- rank( distances[-1,1] ) < num.neighbors + 1 loc.sig <- DANN.Sig( data[closest.training.points,] ) loc.sig } # This function determines the distances given by # (13.8) on p. 477 of HTF2. local.dist <- function( target.point, data, num.neighbors=num.neighbors ) { x.mat <- t( data[,-1] ) # class variable should # be in 1st col of data diffs <- x.mat - as.vector( t( target.point ) ) distances <- diag( ( t( diffs )%*%local.Sig( target.point, data, num.neighbors ) )%*%diffs ) distances } # This function determines predicted class of a target point. pred.class <- function( target.point, data, num.neighbors=num.neighbors, k=k ) { dists <- local.dist( target.point, data, num.neighbors ) near.neighbors <- rank( dists ) < k + 1 first.class <- min( data[,1] ) last.class <- max( data[,1] ) num.classes <- last.class - first.class + 1 classes <- numeric(num.classes) for (g in first.class:last.class ) classes[g - first.class + 1] <- sum( data[near.neighbors,1] == g ) pred <- which.max( classes ) + first.class - 1 pred } DANN <- function( train.data, test.data, num.neighbors=50, k=15 ) { num.cases <- length( test.data[,1] ) predictions <- numeric(num.cases) for (i in 1:num.cases ) predictions[i] <- pred.class( test.data[i,-1], train.data, num.neighbors=num.neighbors, k=k ) predictions } pred <- DANN( trndat, gendat ) DANN.err <- mean(pred != gendat[,1]) DANN.err # Horrible! # I'll try adjusting things with DANN, since sometimes default settings can # do poorly. # Altogether I tried 47 additional settings with DANN. The results are shown # in the 2nd of the two tables below. With some settings the results are # better than ordinary nearest neighbors. The problem is: how do we find # good values for the parameters? (Cross-validation I suppose, but that'd # be a lot of work.) # It can be noted that the best DANN results (given below) occurred when # the initial neighborhood was rather large. So in this case DANN isn't # leading to improvement so much due to it's local nature. but rather, it # seems more due to the variable weighting. The best results are near # those I obtain using the variable weighting/selection scheme described # and used below. ################################################################################# ####################### Summary so far ######################## # LDA (all var's) 0.310 # LDA (stepwise var selection) 0.312 Var selection not effective # SVM (all var's (linear kernel) 0.299 # QDA (all var's) 0.268 # QDA (stepwise var selection) 0.266 Var selection mildly effective (maybe) # KNN (all var's, best c-v k) 0.316 # KNN (all var's, 2nd best c-v k) 0.322 # KNN (global dim red (1-d)) 0.413 # KNN (global dim red (2-d)) 0.309 Global dim red possibly effective # DANN (default settings) 0.512 # DANN (tuned settings) 0.281 DANN possibly effective ################################################################################# ##################################################################### ################## Summary of DANN results #################### num neighbors for classification # ------------------ # 3 7 15 21 23 25 30 60 # # num neighbors 25 479 478 # used initially 50 502 512 # 100 411 404 397 # 125 345 # 150 335 318 314 320 329 # 175 307 299 292 295 301 # 200 283 282 287 281 284 # 225 288 283 286 281 289 # 250 284 279 288 285 296 # 275 284 290 290 290 292 # 300 287 285 # 325 284 287 # 350 303 286 284 # 400 296 297 287 ##################################################################### # Now I'll try a variation of Daniel Saxton's variable weighting scheme for # nearest neighbors. # I'll reset the random number seed so results can be easily reproduced if # one skips some of the previous computations. set.seed(17) # Letting MSB be the "between" MS term, and MSE be the "error" MS term, it # can be noted that if there are no differences between class means for a # predictor, MSB and MSE have the same expectation (equal to the error term # variance). If there are differences between the class means # # MSB - MSE # # is a measure if the amount of differences due to differences in means. # Since this difference depends on the within-class differences in addition # to the between class differences, we can divide by MSE to make it scale # invariant. So we can use # # (MSB - MSE)/MSE = F - 1 # # as weights. (If F - 1 is negative, we can set weight to 0.) Noise # variables should get no or little weight with this scheme. # So that we can use the canned NN functions, I propose first scaling the # data, and then multiplying each variable by # sqrt( F - 1 ). # Then we can use R's knn function normally. (Note: Cannot use R's kknn # function since it automatically scales the predictors, and thus the effect # of the weights would be washed out.) # Step 1: Get the weights from training data # (based on one-way ANOVA F statistics). weights <- numeric(5) for (i in 2:6) weights[i-1] <- max(0, summary( lm( trndat[,i] ~ trndat[,1] ) )$fstatistic[1] - 1) weights <- weights/sum(weights) weights # Weights have been rescaled so that they sum to 1. # Step 2: Apply the weights to training data. scal.trn.vars <- scale(trndat[,-1], center=TRUE, scale=TRUE) scal.trndat <- data.frame( cbind( trndat[,1], scal.trn.vars ) ) names(scal.trndat) <- names(trndat) sqrt.wt <- sqrt(weights) for (i in 2:6) scal.trndat[,i] <- scal.trndat[,i]*sqrt.wt[i-1] # Step 3: Apply the weights to generalization data. scal.gen.vars <- scale(gendat[,-1], center=attr(scal.trn.vars,"scaled:center"), scale=attr(scal.trn.vars,"scaled:scale")) scal.gendat <- data.frame( cbind(gendat[,1], scal.gen.vars ) ) names(scal.gendat) <- names(scal.trndat) for (i in 2:6) scal.gendat[,i] <- scal.gendat[,i]*sqrt.wt[i-1] # Step 4: Do the classification using knn function # (using k from original KNN trial (selected using # c-v with all of the variables)). pred <- knn( scal.trndat[,-1], scal.gendat[,-1], cl=scal.trndat[,1], k=19 ) error.rate <- mean( pred != scal.gendat[,1] ) error.rate # Better than other KNN results, and LDA and SVM, # but not QDA (but close to QDA). IMPORTANT THING # is that it's an improvement over ordinary KNN.