# Example of using DANN for Classification
# (comparison of DANN with KNN, LDA, SVMs, etc)

# Developed using 2.9.0.


library(MASS)     
library(class)  
library(kknn)     # for kknn and train.kknn f'ns (for KNN)
library(klaR)     # for stepclass f'n (for stepwise LDA)
library(e1071)    # for svm f'n (for support vector machines)
library(mvtnorm)  # for rmvnorm and dmvnorm (for generation 
                  # of multivariate normal observations and
                  # computation of multivariate normal pdf 
                  # values)
    
set.seed(7)     


# I'll generate data from three classes. 
# The means will differ in the first two 
# dimensions, but not the other two dimensions.  
# (So two of the four predictor variables are
# pure noise.)

n <- 200  # n is number of training sample cases per class

Sig <- diag(1,4) 
Sig
# Sig is a variance-covariance matrix.  

Oxvals <- rmvnorm(n, mean=c(0,0,0,0), sigma=Sig) # O is used for the Orange class.
Gxvals <- rmvnorm(n, mean=c(0,0,0,0), sigma=Sig) # G is used for the Green class.
Bxvals <- rmvnorm(n, mean=c(0,0,0,0), sigma=Sig) # B is used for the Blue class.

# Above, all three classes were generated the same way,
# but now I'll *alter the first two variables* for the
# Orange and Blue classes.

Oxvals[,1] <- runif(n,-4,0)
Oxvals[,2] <- runif(n,-4,4)
Bxvals[,1] <- runif(n,0,4)
Bxvals[,2] <- runif(n,-4,4)

# I'll plot the data using the first two predictor variables.

plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data (nonnoise variables)',
     xlab='x1',ylab='x2',xlim=c(-4,4),ylim=c(-4,4))
points(Bxvals[,1], Bxvals[,2], col="royalblue")
points(Gxvals[,1], Gxvals[,2], col="limegreen")

#####

# I'll give response value of 0 for Orange class observations, 
# 1 for Green class observations, 2 for the Blue class observations, 
# and then combine them together to form the training data.  Then 
# I'll generate 1000 observations for each class to serve as the 
# generalization sample.

Oclass <- cbind(0, Oxvals)
Gclass <- cbind(1, Gxvals)
Bclass <- cbind(2, Bxvals)
trndat <- data.frame(rbind(Oclass, Gclass, Bclass))
names(trndat) <- c("g", "x1", "x2", "x3", "x4")  

Oxvals <- rmvnorm(1000, mean=c(0,0,0,0), sigma=Sig) 
Gxvals <- rmvnorm(1000, mean=c(0,0,0,0), sigma=Sig) 
Bxvals <- rmvnorm(1000, mean=c(0,0,0,0), sigma=Sig)
Oxvals[,1] <- runif(1000,-4,0)
Oxvals[,2] <- runif(1000,-4,4)
Bxvals[,1] <- runif(1000,0,4)
Bxvals[,2] <- runif(1000,-4,4)
Oclass <- cbind(0, Oxvals)
Gclass <- cbind(1, Gxvals)
Bclass <- cbind(2, Bxvals)
gendat <- data.frame(rbind(Oclass, Gclass, Bclass))
names(gendat) <- c("g", "x1", "x2", "x3", "x4")  

#####

# Before trying the DANN method of HTF2
# I'll try LDA using all of the variables, 
# and then stepwise LDA.

lda.all <- lda(g ~ ., trndat)
pred.lda.all <- predict(lda.all, newdata=gendat)
lda.all.rate <- mean( pred.lda.all$class != gendat[,1] )

lda.all.rate
# LDA rate (all predictors used).

ldafit <- stepclass(g ~ ., data=trndat, method="lda")
ldafit

# Stepwise LDA selects only x1.

lda.step <- lda(g ~ x1, trndat)
pred.lda.step <- predict(lda.step, newdata=gendat)
lda.step.rate <- mean( pred.lda.step$class != gendat[,1] )

lda.step.rate
# A tad better than LDA using all of the variables.

#####

# Now I'll try QDA (using all variables) and stepwise LDA.

qda.all <- qda(g ~ ., trndat)
pred.qda.all <- predict(qda.all, newdata=gendat)
qda.all.rate <- mean( pred.qda.all$class != gendat[,1] )

qda.all.rate
# QDA rate (all predictors used).

qdafit <- stepclass(g ~ ., data=trndat, method="qda")
qdafit

# Stepwise QDA selects only x1 and x2 (correctly not using
# the noise variables x3 and x4).

qda.step <- qda(g ~ x1 + x2, trndat)
pred.qda.step <- predict(qda.step, newdata=gendat)
qda.step.rate <- mean( pred.qda.step$class != gendat[,1] )

qda.step.rate
# Just a wee bit better than QDA using all of the variables.
# Both QDA results are appreciably better than the two LDA 
# results.

#####

# Now I'll try ordinary nearest neighbors, using 
# train.kknn to select a good value of k.

cvknn <- train.kknn(as.factor(g) ~ ., trndat, 
          kmax=100, kernel="rectangular", distance=2) 

plot(c(1:100),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.1,0.65),col="maroon")

which.min(cvknn$MISCLASS)

pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=5,kernel="rectangular")
gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1])

gen.err.knn
# Error rate for 5 nearest neighbors.
# Better than the LDA results, but not the QDA results.

#####

# Now I'll try the method from subsection 13.4.2 of HTF2.

between.cov <- function(data)
# 1st col. of data set should be class variable.
# (Classes should be specified using consecutive integers.)
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  n.class <- last.class - first.class + 1
  n.var <- length(data[1,]) - 1
  var.means <- matrix(0, nrow=n.var, ncol=n.class)
  var.sums <- matrix(0, nrow=n.var, ncol=n.class)
  class.n <- numeric(n.class)
  for (g in first.class:last.class)
  {
    cases <- data[,1] == g
    class.n[g-first.class+1] <- sum(cases)
    classdata <- data[cases,]
    var.means[,g-first.class+1] <- apply(classdata[,-1],2,mean)
    var.sums[,g-first.class+1] <- apply(classdata[,-1],2,sum)
  }
  # var.means now contains the mean vectors for the
  # classes in its columns ... now I'll subtract the
  # grand mean from each column and put the result in
  # mean.deviations
  grand.mean <- apply(var.sums,1,sum)/sum(class.n)
  mean.deviations <- var.means - grand.mean
  # now I'll put the matrix given by (13.10) on p. 479
  # of HTF2, which is also the matrix B on p. 477 of
  # HTF2, in B
  B <- matrix(0, nrow=n.var, ncol=n.var)
  for (g in first.class:last.class)
  {
    col.ind <- g - first.class + 1
    B <- B + 
         class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind]
  }
  B <- B/sum(class.n)
  B
}

GlobDimRed2 <- function(data)
# 1st col. of data set should be class variable.
# (Classes should be specified using consecutive integers.)
# Remaining col's of each set should be predictor var's.
# (Note: This function, GlobDimRed2, requires my between.cov
# function.  I have another function, GlobDimRed, that has
# the between.cov part already incorporated in it.)
{
  B <- between.cov(data)
  # I'll put the eigenvalues and eigned vectors 
  # of B in ev, and return this object
  ev <- eigen(B)
  ev
}

dim.red <- GlobDimRed2(trndat)
dim.red

# Compared to the first two eignevalues, the others
# are extremely small, and the 2nd is much smaller 
# than the first  This suggests that a good approximating 
# subspace may be the one-dimensional subspace spanned by 
# the 1st eignevector, or the two-dimensional subspace
# spanned by the first two eigenvectors.

#####

# I'll create new data sets having predictors created 
# from the eignevectors.  If just the first predictor
# is used, it'll be as though we're using a predictor
# obtained from projecting the original predictors into
# the one-dimensional subspace spanned by the first
# eigenvector (which is almost the same as doing 
# classification just using x1).

new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%dim.red$vector))
names(new.trndat) <- c("g","ev1","ev2","ev3","ev4")

new.gendat <- data.frame(cbind(gendat[,1], as.matrix(gendat[,-1])%*%dim.red$vector))
names(new.gendat) <- c("g","ev1","ev2","ev3","ev4")

# I'll first try the best one-dimensional approximating
# subspace, using train.kknn to identify a good choice 
# of k to use for classifying the test set cases using 
# the training data.

new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], 
          kmax=100, kernel="rectangular", distance=2)

plot(c(1:100),new.cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.1,0.6),col="maroon")

which.min(new.cvknn$MISCLASS)

# Let's see what the error rate is if we use 
# 95 nearest neighbors in the one-dimensional
# subspace.

  pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2],test=new.gendat[,1:2],
                       k=95,kernel="rectangular")
  gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1])

gen.err.new
# Error rate for 95 nearest neighbors (in 1-d subspace).
# Not so good.  The symmetry of the situation makes only
# x1 look important using this method, but obviously x2
# could be useful in distinguishing the Greens from the 
# Oranges and Blues.

##### 

# So let's try the best two-dimensional approximating
# subspace, using train.kknn to identify a good choice 
# of k to use for classifying the test set cases using 
# the training data.

new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:3], 
          kmax=100, kernel="rectangular", distance=2)

plot(c(1:100),new.cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.1,0.6),col="maroon")

which.min(new.cvknn$MISCLASS)

# Let's see what the error rate is if we use 
# 59 nearest neighbors in the two-dimensional
# subspace.

  pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:3],test=new.gendat[,1:3],
                       k=59,kernel="rectangular")
  gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1])

gen.err.new
# Error rate for 59 nearest neighbors (in 2-d subspace).
# Just a tiny wee bit better, but still not good.

#####

# Now I will try some support vector machines.  
# The default kernel is the radial kernel.  
# Using the svm function, one has to supply a 
# value for the gamma parameter, or else rely
# on the default, which is the inverse of the 
# number of predictors, or 1/4 in this case.
# The help page stresses that one should tune,
# and so a variety of gamma values should be 
# considered.  The svm function has a cross-
# validation feature which can be used to obtain 
# an estimate of generalization error rate from
# the training sample for the particular value of 
# gamma which is currently being used.  However, 
# even if the same random number seed is used,
# the c-v estimates for a particular value of
# gamma can differ from trial to trial.  This 
# prevents me from doing a "trial-and-error" search 
# for a good gamma and making comments here, 
# because when this script is run I can't know 
# in advance what the outcomes will be.  So I'll 
# just try a bunch of different gamma values 
# and obtain c-v estimates of the error rates, as 
# well as estimates of the error rates using the 
# fitted models to make predictions for the large
# generalization sample.

num.trials <- 8
gamma.val <- c(1/16, 1/8, 3/16, 1/4, 3/8, 1/2, 3/4, 1)
error.cv <- numeric(8)
error.gen <- numeric(8)
for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, gamma=gamma.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) )
}
error.cv <- 1 - error.cv/100

plot(gamma.val,error.gen,type="l",xlab='gamma',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, radial kernel)',
      ylim=c(0.1,0.5),col="seagreen")
points(gamma.val,error.cv,type="l",col="tomato")
legend(0.125,0.5,legend=c("cross-val","generalization"),
       fill=c("tomato","seagreen"))

# Pretty good ... c-v would lead us to an error rate less than 0.2.

#####

# Obtaining c-v estimates of the error rates for the different values
# of gamma four more times and averaging the results suggested that 
# setting gamma to 3/16 = 0.1875 is best, with the estimated error rate
# being about 0.1813.  Let's see how good of a classifier we get using
# this choice.

svm.rad <- svm( as.factor(g) ~ ., trndat, gamma=3/16)
error.svm <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) )
error.svm
# estimated error rate for svm (radial, gamma = 3/16)
# Pretty, good, but not quite as good as the QDA results.

#####

# Here is a plot of the SVM classifier (showing just the two
# nonnoise variables), using radial basis kernel and gamma =
# 3/16.

plot(svm.rad, trndat, x2 ~ x1)

#####

# Now I'll try using the polynomial kernel.  To tune,
# I'll start by considering various values for the 
# degree, keeping the other parameters at their default
# settings.

svm.poly2 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=2)
summary(svm.poly2)

svm.poly3 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=3)
summary(svm.poly3)

svm.poly4 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=4)
summary(svm.poly4)

svm.poly5 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=5)
summary(svm.poly5)

# C-v results indicate setting the degree to 3 
# works best, but it doesn't look real promising.
# (Note: The odd values for the degree (3 and 5) 
# work better than the even degree values (2 and 4).) 
# Nevertheless, I'll try some different values of
# gamma to see if I can get improvement.

for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial",
                  degree=3, gamma=gamma.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) )
}
error.cv <- 1 - error.cv/100

plot(gamma.val,error.gen,type="l",xlab='gamma',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 3 kernel)',
      ylim=c(0.1,0.85),col="seagreen")
points(gamma.val,error.cv,type="l",col="tomato")
legend(0.65,0.85,legend=c("cross-val","generalization"),
       fill=c("tomato","seagreen"))

# It could be that additional tuning results in more
# improvement, but the radial basis kernel seems better.

#####

# Now I'll try the linear kernel.

svm.lin <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="linear")
summary(svm.lin)

# The c-v estimate isn't promising, but I'll check the
# performance on the generalization set anyway.

mean( predict(svm.lin, gendat) != as.factor(gendat[,1]) )

# Better than the c-v indicated it would be.

plot(svm.lin, trndat, x2 ~ x1)

#####

# Now for the DANN method (see p. 477 of HTF2).

# Here is a function to find square root of 
# the inverse of a matrix.

sqrt.inv.mat <- function( mat )  # won't work for all matrices
{
  eigen.mat <- eigen( mat )
  q <- eigen.mat$vectors
  inv.sqrt.lamb <- diag( sqrt( 1/eigen.mat$values ) )
  sr <- ( q%*%inv.sqrt.lamb )%*%t(q)
  sr
}

# Here is a function to find the the pooled
# within-class covariance matrix (see p. 477
# of HTF2).  First column of data matrix 
# should have groups identified with consecutive
# integers.

pooled.within.cov <- function( data )
{
  first.class <- min( data[,1] )
  last.class <- max( data[,1] )
  var.cov <- 0
  for ( g in first.class:last.class )
  {
    cases <- data[,1] == g
    if (sum(cases) < 2) var.cov <- var.cov + 0  else
      var.cov <- var.cov + ( sum( cases )/length( data[,1] ) )*cov( data[cases,-1] )
  }
  var.cov
}

# Here are a pair of functions that determine the 
# Sigma given by (13.9) on p. 477 of HTF2.  (Based 
# on comment in HTF2, the default value of epsilon 
# is 1.)  

DANN.Sig <- function( data, epsilon=1 )
{
  w.neg.half <- sqrt.inv.mat( pooled.within.cov( data ) )
  b <- between.cov( data )
  b.star <- w.neg.half%*%b%*%w.neg.half
  n.var <- length( data[1,] ) - 1
  sig <- w.neg.half%*%( b.star + diag(epsilon, n.var) )%*%w.neg.half
}

local.Sig <- function( target.point, data, num.neighbors=num.neighbors )
{
  # data should have class indicator in 1st column, 
  # and predictor variables in remaining columns.
  # target.point should just have coordinates of
  # target point.  Based on a comment in HFT2, the
  # default number of nearest neighbors used is 50.
  distances <- as.matrix(dist(rbind( target.point, data[,-1])))
  closest.training.points <- rank( distances[-1,1] ) < num.neighbors + 1
  loc.sig <- DANN.Sig( data[closest.training.points,] )
  loc.sig
}

# This function determines the distances given by 
# (13.8) on p. 477 of HTF2.

local.dist <- function( target.point, data, num.neighbors=num.neighbors )
{
  x.mat <- t( data[,-1] )  # class variable should 
                           # be in 1st col of data
  diffs <- x.mat - as.vector( t( target.point ) )
  distances <- diag( ( t( diffs )%*%local.Sig( target.point, data, num.neighbors ) )%*%diffs )
  distances
}

# This function determines predicted class of a target point.

pred.class <- function( target.point, data, num.neighbors=num.neighbors, k=k )
{
  dists <- local.dist( target.point, data, num.neighbors )
  near.neighbors <- rank( dists ) < k + 1
  first.class <- min( data[,1] )
  last.class <- max( data[,1] )
  num.classes <- last.class - first.class + 1
  classes <- numeric(num.classes)
  for (g in first.class:last.class )
    classes[g - first.class + 1] <- sum( data[near.neighbors,1] == g  )
  pred <- which.max( classes ) - first.class - 1
  pred
}

DANN <- function( train.data, test.data, num.neighbors=50, k=15 )
{
  num.cases <- length( test.data[,1] )
  predictions <- numeric(num.cases)
  for (i in 1:num.cases )
    predictions[i] <- pred.class( test.data[i,-1], train.data, num.neighbors=num.neighbors, k=k )
  predictions
}

pred <- DANN( trndat, gendat )
DANN.err <- mean(pred != gendat[,1])

DANN.err
# error rate for DANN method
# (not as good as QDA rates)

# I'll try it again using only 7 nearest neighbors.
# (Previous run used default of 15.)

pred <- DANN( trndat, gendat, k=7 )
DANN.7.err <- mean(pred != gendat[,1])

DANN.7.err
# error rate for DANN method (w/ k = 7)
# Worse.

# Now I'll try it using 25 nearest neighbors.

pred <- DANN( trndat, gendat, k=25 )
DANN.25.err <- mean(pred != gendat[,1])

DANN.25.err
# error rate for DANN method (w/ k = 25)

######################################################
#                       SUMMARY                           
#
#  error         method
#  -----         ------
#       
#  0.272  LDA (all variables)
#  0.268  stepwise LDA (x1 chosen)
# 
#  0.176  QDA (all variables)
#  0.172  stepwise QDA (x1 and x2 chosen)
#
#  0.208  5 NN (all variables, 5 chosen using c-v)
#  0.278  95 NN (using 1-d subspace, 95 chosen by c-v)
#  0.276  59 NN (using 2-d subspace, 59 chosen by c-v)
#
#  0.180  svm (radial kernel & gamma = 3/16 chosen by c-v)
#  0.267  svm (linear)
#
#  0.196  DANN (k = 7)
#  0.188  DANN (k = 15)
#  0.189  DANN (k = 25)

##### END #####