# Example of using DANN for Classification # (comparison of DANN with KNN, LDA, SVMs, etc) # Developed using 2.9.0. library(MASS) library(class) library(kknn) # for kknn and train.kknn f'ns (for KNN) library(klaR) # for stepclass f'n (for stepwise LDA) library(e1071) # for svm f'n (for support vector machines) library(mvtnorm) # for rmvnorm and dmvnorm (for generation # of multivariate normal observations and # computation of multivariate normal pdf # values) set.seed(17) # I'll generate data from two classes. # There will be two predictor variables (both meaningful (nonnoise)). n1 <- 1000 # n1 is number of training sample cases per class n2 <- 50 # n2 is number of generalization sample cases per class ang <- runif(2*(n1+n2), 0, 1.5708) x <- cos(ang) y <- sin(ang) class1 <- cbind(1, x[1:(n1+n2)] - runif((n1+n2),0,0.1) + 0.01, y[1:(n1+n2)] - runif((n1+n2),0,0.1) - 0.01) class2 <- cbind(2, x[(n1+n2+1):(2*(n1+n2))] + runif((n1+n2),0,0.1) - 0.01, y[(n1+n2+1):(2*(n1+n2))] + runif((n1+n2),0,0.1) - 0.01) trndat <- data.frame(rbind(class1[1:n1,],class2[1:n1,])) gendat <- data.frame(rbind(class1[(n1+1):(n1+n2),],class2[(n1+1):(n1+n2),])) names(trndat) <- c("g","x1","x2") names(gendat) <- c("g","x1","x2") # I'll plot the training data. plot(trndat[,2],trndat[,3],col=(trndat[,1]+1),main='Training Data',xlab='x1',ylab='x2') ##### # Before trying the DANN method of HTF2 # I'll try LDA. lda.fit <- lda(g ~ ., trndat) pred.lda <- predict(lda.fit, newdata=gendat) lda.rate <- mean( pred.lda$class != gendat[,1] ) lda.rate # LDA rate ##### # Now I'll try QDA. qda.fit <- qda(g ~ ., trndat) pred.qda <- predict(qda.fit, newdata=gendat) qda.rate <- mean( pred.qda$class != gendat[,1] ) qda.rate # QDA rate # It seems odd that the QDA rate is worse. # Let's look at the QDA predictions. points(gendat[,2],gendat[,3],col=(as.numeric(pred.qda$class)+1),pch=15) # It doesn't seem right!!! ##### # Now I'll try ordinary nearest neighbors, using # train.kknn to select a good value of k. cvknn <- train.kknn(as.factor(g) ~ ., trndat, kmax=100, kernel="rectangular", distance=2) plot(c(1:100),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates',col="maroon") which.min(cvknn$MISCLASS) # I'll use k = 75. pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat, k=75,kernel="rectangular") gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1]) gen.err.knn # Error rate for 75 nearest neighbors. # Better than the LDA and QDA results. ##### # Now I will try some support vector machines. # The default kernel is the radial kernel. # Using the svm function, one has to supply a # value for the gamma parameter, or else rely # on the default, which is the inverse of the # number of predictors, or 1/2 in this case. # The help page stresses that one should tune, # and so a variety of gamma values should be # considered. The svm function has a cross- # validation feature which can be used to obtain # an estimate of generalization error rate from # the training sample for the particular value of # gamma which is currently being used. However, # even if the same random number seed is used, # the c-v estimates for a particular value of # gamma can differ from trial to trial. This # prevents me from doing a "trial-and-error" search # for a good gamma and making comments here, # because when this script is run I can't know # in advance what the outcomes will be. So I'll # just try a bunch of different gamma values # and obtain c-v estimates of the error rates, as # well as estimates of the error rates using the # fitted models to make predictions for the large # generalization sample. num.trials <- 8 gamma.val <- c(1/16, 1/8, 3/16, 1/4, 3/8, 1/2, 3/4, 1) error.cv <- numeric(8) error.gen <- numeric(8) for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, gamma=gamma.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) ) } error.cv <- 1 - error.cv/100 plot(gamma.val,error.gen,type="l",xlab='gamma', ylab='proportion misclassified',main='Estimated Error Rates (SVM, radial kernel)', ylim=c(0,0.05),col="seagreen") points(gamma.val,error.cv,type="l",col="tomato") legend(0.65,0.05,legend=c("cross-val","generalization"), fill=c("tomato","seagreen")) # It appears that the results are rather insensitive to the choice of gamma. # So I'll just use the default gamma. ##### svm.rad <- svm( as.factor(g) ~ ., trndat) error.svm <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) ) error.svm # estimated error rate for svm (radial, gamma = 1/2 (default)) # Pretty good --- same as KNN, and much better than LDA and QDA. ##### # Here is a plot of the SVM classifier (showing just the two # nonnoise variables), using radial basis kernel and gamma = # 1/2 plot(svm.rad, trndat, x2 ~ x1) ##### # Now for the DANN method (see p. 477 of HTF2). # Here is a function to find square root of # the inverse of a matrix. sqrt.inv.mat <- function( mat ) # won't work for all matrices { eigen.mat <- eigen( mat ) q <- eigen.mat$vectors inv.sqrt.lamb <- diag( sqrt( 1/eigen.mat$values ) ) sr <- ( q%*%inv.sqrt.lamb )%*%t(q) sr } # Here is a function to find the the pooled # within-class covariance matrix (see p. 477 # of HTF2). First column of data matrix # should have groups identified with consecutive # integers. pooled.within.cov <- function( data ) { first.class <- min( data[,1] ) last.class <- max( data[,1] ) var.cov <- 0 for ( g in first.class:last.class ) { cases <- data[,1] == g if (sum(cases) < 2) var.cov <- var.cov + 0 else var.cov <- var.cov + ( sum( cases )/length( data[,1] ) )*cov( data[cases,-1] ) } var.cov } # Here are a pair of functions that determine the # Sigma given by (13.9) on p. 477 of HTF2. (Based # on comment in HTF2, the default value of epsilon # is 1.) between.cov <- function(data) # 1st col. of data set should be class variable. # (Classes should be specified using consecutive integers.) # Remaining col's of each set should be predictor var's. { first.class <- min(data[,1]) last.class <- max(data[,1]) n.class <- last.class - first.class + 1 n.var <- length(data[1,]) - 1 var.means <- matrix(0, nrow=n.var, ncol=n.class) var.sums <- matrix(0, nrow=n.var, ncol=n.class) class.n <- numeric(n.class) for (g in first.class:last.class) { cases <- data[,1] == g class.n[g-first.class+1] <- sum(cases) classdata <- data[cases,] var.means[,g-first.class+1] <- apply(classdata[,-1],2,mean) var.sums[,g-first.class+1] <- apply(classdata[,-1],2,sum) } # var.means now contains the mean vectors for the # classes in its columns ... now I'll subtract the # grand mean from each column and put the result in # mean.deviations grand.mean <- apply(var.sums,1,sum)/sum(class.n) mean.deviations <- var.means - grand.mean # now I'll put the matrix given by (13.10) on p. 479 # of HTF2, which is also the matrix B on p. 477 of # HTF2, in B B <- matrix(0, nrow=n.var, ncol=n.var) for (g in first.class:last.class) { col.ind <- g - first.class + 1 B <- B + class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind] } B <- B/sum(class.n) B } DANN.Sig <- function( data, epsilon=1 ) { w.neg.half <- sqrt.inv.mat( pooled.within.cov( data ) ) b <- between.cov( data ) b.star <- w.neg.half%*%b%*%w.neg.half n.var <- length( data[1,] ) - 1 sig <- w.neg.half%*%( b.star + diag(epsilon, n.var) )%*%w.neg.half } local.Sig <- function( target.point, data, num.neighbors=num.neighbors ) { # data should have class indicator in 1st column, # and predictor variables in remaining columns. # target.point should just have coordinates of # target point. Based on a comment in HFT2, the # default number of nearest neighbors used is 50. distances <- as.matrix(dist(rbind( target.point, data[,-1]))) closest.training.points <- rank( distances[-1,1] ) < num.neighbors + 1 loc.sig <- DANN.Sig( data[closest.training.points,] ) loc.sig } # This function determines the distances given by # (13.8) on p. 477 of HTF2. local.dist <- function( target.point, data, num.neighbors=num.neighbors ) { x.mat <- t( data[,-1] ) # class variable should # be in 1st col of data diffs <- x.mat - as.vector( t( target.point ) ) distances <- diag( ( t( diffs )%*%local.Sig( target.point, data, num.neighbors ) )%*%diffs ) distances } # This function determines predicted class of a target point. pred.class <- function( target.point, data, num.neighbors=num.neighbors, k=k ) { dists <- local.dist( target.point, data, num.neighbors ) near.neighbors <- rank( dists ) < k + 1 first.class <- min( data[,1] ) last.class <- max( data[,1] ) num.classes <- last.class - first.class + 1 classes <- numeric(num.classes) for (g in first.class:last.class ) classes[g - first.class + 1] <- sum( data[near.neighbors,1] == g ) pred <- which.max( classes ) + first.class - 1 pred } DANN <- function( train.data, test.data, num.neighbors=50, k=15 ) { num.cases <- length( test.data[,1] ) predictions <- numeric(num.cases) for (i in 1:num.cases ) predictions[i] <- pred.class( test.data[i,-1], train.data, num.neighbors=num.neighbors, k=k ) predictions } pred <- DANN( trndat, gendat ) DANN.err <- mean(pred != gendat[,1]) DANN.err # error rate for DANN method # Best so far! # I'll see if increasing the number of cases used to determine # "local shape" and increasing the number of nearest neighbors # used for classification (since default is 15, and for ordinary # KNN, c-v indicated using k = 75 is good) improves the results # (which can only occur if there are no errors). pred <- DANN( trndat, gendat, 100, 45 ) DANN.err <- mean(pred != gendat[,1]) DANN.err # Worse, but still as good as SVM and KNN. ##### END #####