# Example of using DANN for Classification # (comparison of DANN with KNN, LDA, SVMs, etc) # Developed using 2.9.0. library(MASS) library(class) library(kknn) # for kknn and train.kknn f'ns (for KNN) library(klaR) # for stepclass f'n (for stepwise LDA) library(e1071) # for svm f'n (for support vector machines) library(mvtnorm) # for rmvnorm and dmvnorm (for generation # of multivariate normal observations and # computation of multivariate normal pdf # values) set.seed(7) # I'll generate data from three classes. # The means will differ in the first two # dimensions, but not the other two dimensions. # (So two of the four predictor variables are # pure noise.) n <- 200 # n is number of training sample cases per class Sig <- diag(1,4) Sig # Sig is a variance-covariance matrix. Oxvals <- rmvnorm(n, mean=c(0,0,0,0), sigma=Sig) # O is used for the Orange class. Gxvals <- rmvnorm(n, mean=c(0,0,0,0), sigma=Sig) # G is used for the Green class. Bxvals <- rmvnorm(n, mean=c(0,0,0,0), sigma=Sig) # B is used for the Blue class. # Above, all three classes were generated the same way, # but now I'll *alter the first two variables* for the # Orange and Blue classes. Oxvals[,1] <- runif(n,-4,0) Oxvals[,2] <- runif(n,-4,4) Bxvals[,1] <- runif(n,0,4) Bxvals[,2] <- runif(n,-4,4) # I'll plot the data using the first two predictor variables. plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data (nonnoise variables)', xlab='x1',ylab='x2',xlim=c(-4,4),ylim=c(-4,4)) points(Bxvals[,1], Bxvals[,2], col="royalblue") points(Gxvals[,1], Gxvals[,2], col="limegreen") ##### # I'll give response value of 0 for Orange class observations, # 1 for Green class observations, 2 for the Blue class observations, # and then combine them together to form the training data. Then # I'll generate 1000 observations for each class to serve as the # generalization sample. Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Bclass <- cbind(2, Bxvals) trndat <- data.frame(rbind(Oclass, Gclass, Bclass)) names(trndat) <- c("g", "x1", "x2", "x3", "x4") Oxvals <- rmvnorm(1000, mean=c(0,0,0,0), sigma=Sig) Gxvals <- rmvnorm(1000, mean=c(0,0,0,0), sigma=Sig) Bxvals <- rmvnorm(1000, mean=c(0,0,0,0), sigma=Sig) Oxvals[,1] <- runif(1000,-4,0) Oxvals[,2] <- runif(1000,-4,4) Bxvals[,1] <- runif(1000,0,4) Bxvals[,2] <- runif(1000,-4,4) Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Bclass <- cbind(2, Bxvals) gendat <- data.frame(rbind(Oclass, Gclass, Bclass)) names(gendat) <- c("g", "x1", "x2", "x3", "x4") ##### # Before trying the DANN method of HTF2 # I'll try LDA using all of the variables, # and then stepwise LDA. lda.all <- lda(g ~ ., trndat) pred.lda.all <- predict(lda.all, newdata=gendat) lda.all.rate <- mean( pred.lda.all$class != gendat[,1] ) lda.all.rate # LDA rate (all predictors used). ldafit <- stepclass(g ~ ., data=trndat, method="lda") ldafit # Stepwise LDA selects only x1. lda.step <- lda(g ~ x1, trndat) pred.lda.step <- predict(lda.step, newdata=gendat) lda.step.rate <- mean( pred.lda.step$class != gendat[,1] ) lda.step.rate # A tad better than LDA using all of the variables. ##### # Now I'll try QDA (using all variables) and stepwise LDA. qda.all <- qda(g ~ ., trndat) pred.qda.all <- predict(qda.all, newdata=gendat) qda.all.rate <- mean( pred.qda.all$class != gendat[,1] ) qda.all.rate # QDA rate (all predictors used). qdafit <- stepclass(g ~ ., data=trndat, method="qda") qdafit # Stepwise QDA selects only x1 and x2 (correctly not using # the noise variables x3 and x4). qda.step <- qda(g ~ x1 + x2, trndat) pred.qda.step <- predict(qda.step, newdata=gendat) qda.step.rate <- mean( pred.qda.step$class != gendat[,1] ) qda.step.rate # Just a wee bit better than QDA using all of the variables. # Both QDA results are appreciably better than the two LDA # results. ##### # Now I'll try ordinary nearest neighbors, using # train.kknn to select a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=100, kernel="rectangular", distance=2) plot(c(1:100),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.1,0.65),col="maroon") which.min(cvknn$MISCLASS) pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=5,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1]) gen.err.knn # Error rate for 5 nearest neighbors. # Better than the LDA results, but not the QDA results. ##### # Now I'll try the method from subsection 13.4.2 of HTF2. between.cov <- function(data) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. { first.class <- min(data[,1]) last.class <- max(data[,1]) n.class <- last.class - first.class + 1 n.var <- length(data[1,]) - 1 var.means <- matrix(0, nrow=n.var, ncol=n.class) var.sums <- matrix(0, nrow=n.var, ncol=n.class) class.n <- numeric(n.class) for (g in first.class:last.class) { cases <- data[,1] == g class.n[g-first.class+1] <- sum(cases) classdata <- data[cases,] var.means[,g-first.class+1] <- apply(classdata[,-1],2,mean) var.sums[,g-first.class+1] <- apply(classdata[,-1],2,sum) } # var.means now contains the mean vectors for the # classes in its columns ... now I'll subtract the # grand mean from each column and put the result in # mean.deviations grand.mean <- apply(var.sums,1,sum)/sum(class.n) mean.deviations <- var.means - grand.mean # now I'll put the matrix given by (13.10) on p. 479 # of HTF2, which is also the matrix B on p. 477 of # HTF2, in B B <- matrix(0, nrow=n.var, ncol=n.var) for (g in first.class:last.class) { col.ind <- g - first.class + 1 B <- B + class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind] } B <- B/sum(class.n) B } GlobDimRed2 <- function(data) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. # (Note: This function, GlobDimRed2, requires my between.cov # function. I have another function, GlobDimRed, that has # the between.cov part already incorporated in it.) { B <- between.cov(data) # I'll put the eigenvalues and eigned vectors # of B in ev, and return this object ev <- eigen(B) ev } dim.red <- GlobDimRed2(trndat) dim.red # Compared to the first two eignevalues, the others # are extremely small, and the 2nd is much smaller # than the first This suggests that a good approximating # subspace may be the one-dimensional subspace spanned by # the 1st eignevector, or the two-dimensional subspace # spanned by the first two eigenvectors. ##### # I'll create new data sets having predictors created # from the eignevectors. If just the first predictor # is used, it'll be as though we're using a predictor # obtained from projecting the original predictors into # the one-dimensional subspace spanned by the first # eigenvector (which is almost the same as doing # classification just using x1). new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%dim.red$vector)) names(new.trndat) <- c("g","ev1","ev2","ev3","ev4") new.gendat <- data.frame(cbind(gendat[,1], as.matrix(gendat[,-1])%*%dim.red$vector)) names(new.gendat) <- c("g","ev1","ev2","ev3","ev4") # I'll first try the best one-dimensional approximating # subspace, using train.kknn to identify a good choice # of k to use for classifying the test set cases using # the training data. new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], kmax=100, kernel="rectangular", distance=2) plot(c(1:100),new.cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.1,0.6),col="maroon") which.min(new.cvknn$MISCLASS) # Let's see what the error rate is if we use # 95 nearest neighbors in the one-dimensional # subspace. pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2],test=new.gendat[,1:2], k=95,kernel="rectangular") gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1]) gen.err.new # Error rate for 95 nearest neighbors (in 1-d subspace). # Not so good. The symmetry of the situation makes only # x1 look important using this method, but obviously x2 # could be useful in distinguishing the Greens from the # Oranges and Blues. ##### # So let's try the best two-dimensional approximating # subspace, using train.kknn to identify a good choice # of k to use for classifying the test set cases using # the training data. new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:3], kmax=100, kernel="rectangular", distance=2) plot(c(1:100),new.cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.1,0.6),col="maroon") which.min(new.cvknn$MISCLASS) # Let's see what the error rate is if we use # 59 nearest neighbors in the two-dimensional # subspace. pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:3],test=new.gendat[,1:3], k=59,kernel="rectangular") gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1]) gen.err.new # Error rate for 59 nearest neighbors (in 2-d subspace). # Just a tiny wee bit better, but still not good. ##### # Now I will try some support vector machines. # The default kernel is the radial kernel. # Using the svm function, one has to supply a # value for the gamma parameter, or else rely # on the default, which is the inverse of the # number of predictors, or 1/4 in this case. # The help page stresses that one should tune, # and so a variety of gamma values should be # considered. The svm function has a cross- # validation feature which can be used to obtain # an estimate of generalization error rate from # the training sample for the particular value of # gamma which is currently being used. However, # even if the same random number seed is used, # the c-v estimates for a particular value of # gamma can differ from trial to trial. This # prevents me from doing a "trial-and-error" search # for a good gamma and making comments here, # because when this script is run I can't know # in advance what the outcomes will be. So I'll # just try a bunch of different gamma values # and obtain c-v estimates of the error rates, as # well as estimates of the error rates using the # fitted models to make predictions for the large # generalization sample. num.trials <- 8 gamma.val <- c(1/16, 1/8, 3/16, 1/4, 3/8, 1/2, 3/4, 1) error.cv <- numeric(8) error.gen <- numeric(8) for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, gamma=gamma.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) ) } error.cv <- 1 - error.cv/100 plot(gamma.val,error.gen,type="l",xlab='gamma', ylab='proportion misclassified',main='Estimated Error Rates (SVM, radial kernel)', ylim=c(0.1,0.5),col="seagreen") points(gamma.val,error.cv,type="l",col="tomato") legend(0.125,0.5,legend=c("cross-val","generalization"), fill=c("tomato","seagreen")) # Pretty good ... c-v would lead us to an error rate less than 0.2. ##### # Obtaining c-v estimates of the error rates for the different values # of gamma four more times and averaging the results suggested that # setting gamma to 3/16 = 0.1875 is best, with the estimated error rate # being about 0.1813. Let's see how good of a classifier we get using # this choice. svm.rad <- svm( as.factor(g) ~ ., trndat, gamma=3/16) error.svm <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) ) error.svm # estimated error rate for svm (radial, gamma = 3/16) # Pretty, good, but not quite as good as the QDA results. ##### # Here is a plot of the SVM classifier (showing just the two # nonnoise variables), using radial basis kernel and gamma = # 3/16. plot(svm.rad, trndat, x2 ~ x1) ##### # Now I'll try using the polynomial kernel. To tune, # I'll start by considering various values for the # degree, keeping the other parameters at their default # settings. svm.poly2 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=2) summary(svm.poly2) svm.poly3 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=3) summary(svm.poly3) svm.poly4 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=4) summary(svm.poly4) svm.poly5 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=5) summary(svm.poly5) # C-v results indicate setting the degree to 3 # works best, but it doesn't look real promising. # (Note: The odd values for the degree (3 and 5) # work better than the even degree values (2 and 4).) # Nevertheless, I'll try some different values of # gamma to see if I can get improvement. for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=3, gamma=gamma.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) ) } error.cv <- 1 - error.cv/100 plot(gamma.val,error.gen,type="l",xlab='gamma', ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 3 kernel)', ylim=c(0.1,0.85),col="seagreen") points(gamma.val,error.cv,type="l",col="tomato") legend(0.65,0.85,legend=c("cross-val","generalization"), fill=c("tomato","seagreen")) # It could be that additional tuning results in more # improvement, but the radial basis kernel seems better. ##### # Now I'll try the linear kernel. svm.lin <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="linear") summary(svm.lin) # The c-v estimate isn't promising, but I'll check the # performance on the generalization set anyway. mean( predict(svm.lin, gendat) != as.factor(gendat[,1]) ) # Better than the c-v indicated it would be. plot(svm.lin, trndat, x2 ~ x1) ##### # Now for the DANN method (see p. 477 of HTF2). # Here is a function to find square root of # the inverse of a matrix. sqrt.inv.mat <- function( mat ) # won't work for all matrices { eigen.mat <- eigen( mat ) q <- eigen.mat$vectors inv.sqrt.lamb <- diag( sqrt( 1/eigen.mat$values ) ) sr <- ( q%*%inv.sqrt.lamb )%*%t(q) sr } # Here is a function to find the the pooled # within-class covariance matrix (see p. 477 # of HTF2). First column of data matrix # should have groups identified with consecutive # integers. pooled.within.cov <- function( data ) { first.class <- min( data[,1] ) last.class <- max( data[,1] ) var.cov <- 0 for ( g in first.class:last.class ) { cases <- data[,1] == g if (sum(cases) < 2) var.cov <- var.cov + 0 else var.cov <- var.cov + ( sum( cases )/length( data[,1] ) )*cov( data[cases,-1] ) } var.cov } # Here are a pair of functions that determine the # Sigma given by (13.9) on p. 477 of HTF2. (Based # on comment in HTF2, the default value of epsilon # is 1.) DANN.Sig <- function( data, epsilon=1 ) { w.neg.half <- sqrt.inv.mat( pooled.within.cov( data ) ) b <- between.cov( data ) b.star <- w.neg.half%*%b%*%w.neg.half n.var <- length( data[1,] ) - 1 sig <- w.neg.half%*%( b.star + diag(epsilon, n.var) )%*%w.neg.half } local.Sig <- function( target.point, data, num.neighbors=num.neighbors ) { # data should have class indicator in 1st column, # and predictor variables in remaining columns. # target.point should just have coordinates of # target point. Based on a comment in HFT2, the # default number of nearest neighbors used is 50. distances <- as.matrix(dist(rbind( target.point, data[,-1]))) closest.training.points <- rank( distances[-1,1] ) < num.neighbors + 1 loc.sig <- DANN.Sig( data[closest.training.points,] ) loc.sig } # This function determines the distances given by # (13.8) on p. 477 of HTF2. local.dist <- function( target.point, data, num.neighbors=num.neighbors ) { x.mat <- t( data[,-1] ) # class variable should # be in 1st col of data diffs <- x.mat - as.vector( t( target.point ) ) distances <- diag( ( t( diffs )%*%local.Sig( target.point, data, num.neighbors ) )%*%diffs ) distances } # This function determines predicted class of a target point. pred.class <- function( target.point, data, num.neighbors=num.neighbors, k=k ) { dists <- local.dist( target.point, data, num.neighbors ) near.neighbors <- rank( dists ) < k + 1 first.class <- min( data[,1] ) last.class <- max( data[,1] ) num.classes <- last.class - first.class + 1 classes <- numeric(num.classes) for (g in first.class:last.class ) classes[g - first.class + 1] <- sum( data[near.neighbors,1] == g ) pred <- which.max( classes ) - first.class - 1 pred } DANN <- function( train.data, test.data, num.neighbors=50, k=15 ) { num.cases <- length( test.data[,1] ) predictions <- numeric(num.cases) for (i in 1:num.cases ) predictions[i] <- pred.class( test.data[i,-1], train.data, num.neighbors=num.neighbors, k=k ) predictions } pred <- DANN( trndat, gendat ) DANN.err <- mean(pred != gendat[,1]) DANN.err # error rate for DANN method # (not as good as QDA rates) # I'll try it again using only 7 nearest neighbors. # (Previous run used default of 15.) pred <- DANN( trndat, gendat, k=7 ) DANN.7.err <- mean(pred != gendat[,1]) DANN.7.err # error rate for DANN method (w/ k = 7) # Worse. # Now I'll try it using 25 nearest neighbors. pred <- DANN( trndat, gendat, k=25 ) DANN.25.err <- mean(pred != gendat[,1]) DANN.25.err # error rate for DANN method (w/ k = 25) ###################################################### # SUMMARY # # error method # ----- ------ # # 0.272 LDA (all variables) # 0.268 stepwise LDA (x1 chosen) # # 0.176 QDA (all variables) # 0.172 stepwise QDA (x1 and x2 chosen) # # 0.208 5 NN (all variables, 5 chosen using c-v) # 0.278 95 NN (using 1-d subspace, 95 chosen by c-v) # 0.276 59 NN (using 2-d subspace, 59 chosen by c-v) # # 0.180 svm (radial kernel & gamma = 3/16 chosen by c-v) # 0.267 svm (linear) # # 0.196 DANN (k = 7) # 0.188 DANN (k = 15) # 0.189 DANN (k = 25) ##### END #####