# Example of using DANN for Classification
# (comparison of DANN with KNN, LDA, SVMs, etc)

# Developed using 2.9.0.


library(MASS)     
library(class)  
library(kknn)     # for kknn and train.kknn f'ns (for KNN)
library(klaR)     # for stepclass f'n (for stepwise LDA)
library(e1071)    # for svm f'n (for support vector machines)
library(mvtnorm)  # for rmvnorm and dmvnorm (for generation 
                  # of multivariate normal observations and
                  # computation of multivariate normal pdf 
                  # values)
    
set.seed(17)     


# I'll generate data from two classes. 
# There will be two predictor variables (both meaningful (nonnoise)).

n1 <- 1000  # n1 is number of training sample cases per class
n2 <- 50  # n2 is number of generalization sample cases per class

ang <- runif(2*(n1+n2), 0, 1.5708)
x <- cos(ang)
y <- sin(ang)
class1 <- cbind(1, x[1:(n1+n2)] - runif((n1+n2),0,0.1) + 0.01, 
                   y[1:(n1+n2)] - runif((n1+n2),0,0.1) - 0.01)
class2 <- cbind(2, x[(n1+n2+1):(2*(n1+n2))] + runif((n1+n2),0,0.1) - 0.01, 
                   y[(n1+n2+1):(2*(n1+n2))] + runif((n1+n2),0,0.1) - 0.01)
trndat <- data.frame(rbind(class1[1:n1,],class2[1:n1,]))
gendat <- data.frame(rbind(class1[(n1+1):(n1+n2),],class2[(n1+1):(n1+n2),]))
names(trndat) <- c("g","x1","x2")
names(gendat) <- c("g","x1","x2")

# I'll plot the training data.

plot(trndat[,2],trndat[,3],col=(trndat[,1]+1),main='Training Data',xlab='x1',ylab='x2')

#####

# Before trying the DANN method of HTF2
# I'll try LDA.

lda.fit <- lda(g ~ ., trndat)
pred.lda <- predict(lda.fit, newdata=gendat)
lda.rate <- mean( pred.lda$class != gendat[,1] )

lda.rate
# LDA rate

#####

# Now I'll try QDA.

qda.fit <- qda(g ~ ., trndat)
pred.qda <- predict(qda.fit, newdata=gendat)
qda.rate <- mean( pred.qda$class != gendat[,1] )

qda.rate
# QDA rate 

# It seems odd that the QDA rate is worse.
# Let's look at the QDA predictions.

points(gendat[,2],gendat[,3],col=(as.numeric(pred.qda$class)+1),pch=15)

# It doesn't seem right!!!

#####

# Now I'll try ordinary nearest neighbors, using 
# train.kknn to select a good value of k.

cvknn <- train.kknn(as.factor(g) ~ ., trndat, 
          kmax=100, kernel="rectangular", distance=2) 

plot(c(1:100),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',col="maroon")

which.min(cvknn$MISCLASS)

# I'll use k = 75.

pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=75,kernel="rectangular")
gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1])

gen.err.knn
# Error rate for 75 nearest neighbors.
# Better than the LDA and QDA results.

#####

# Now I will try some support vector machines.  
# The default kernel is the radial kernel.  
# Using the svm function, one has to supply a 
# value for the gamma parameter, or else rely
# on the default, which is the inverse of the 
# number of predictors, or 1/2 in this case.
# The help page stresses that one should tune,
# and so a variety of gamma values should be 
# considered.  The svm function has a cross-
# validation feature which can be used to obtain 
# an estimate of generalization error rate from
# the training sample for the particular value of 
# gamma which is currently being used.  However, 
# even if the same random number seed is used,
# the c-v estimates for a particular value of
# gamma can differ from trial to trial.  This 
# prevents me from doing a "trial-and-error" search 
# for a good gamma and making comments here, 
# because when this script is run I can't know 
# in advance what the outcomes will be.  So I'll 
# just try a bunch of different gamma values 
# and obtain c-v estimates of the error rates, as 
# well as estimates of the error rates using the 
# fitted models to make predictions for the large
# generalization sample.

num.trials <- 8
gamma.val <- c(1/16, 1/8, 3/16, 1/4, 3/8, 1/2, 3/4, 1)
error.cv <- numeric(8)
error.gen <- numeric(8)
for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, gamma=gamma.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) )
}
error.cv <- 1 - error.cv/100

plot(gamma.val,error.gen,type="l",xlab='gamma',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, radial kernel)',
      ylim=c(0,0.05),col="seagreen")
points(gamma.val,error.cv,type="l",col="tomato")
legend(0.65,0.05,legend=c("cross-val","generalization"),
       fill=c("tomato","seagreen"))

# It appears that the results are rather insensitive to the choice of gamma.
# So I'll just use the default gamma.

#####

svm.rad <- svm( as.factor(g) ~ ., trndat)
error.svm <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) )
error.svm
# estimated error rate for svm (radial, gamma = 1/2 (default))
# Pretty good --- same as KNN, and much better than LDA and QDA.

#####

# Here is a plot of the SVM classifier (showing just the two
# nonnoise variables), using radial basis kernel and gamma =
# 1/2

plot(svm.rad, trndat, x2 ~ x1)

#####

# Now for the DANN method (see p. 477 of HTF2).

# Here is a function to find square root of 
# the inverse of a matrix.

sqrt.inv.mat <- function( mat )  # won't work for all matrices
{
  eigen.mat <- eigen( mat )
  q <- eigen.mat$vectors
  inv.sqrt.lamb <- diag( sqrt( 1/eigen.mat$values ) )
  sr <- ( q%*%inv.sqrt.lamb )%*%t(q)
  sr
}

# Here is a function to find the the pooled
# within-class covariance matrix (see p. 477
# of HTF2).  First column of data matrix 
# should have groups identified with consecutive
# integers.

pooled.within.cov <- function( data )
{
  first.class <- min( data[,1] )
  last.class <- max( data[,1] )
  var.cov <- 0
  for ( g in first.class:last.class )
  {
    cases <- data[,1] == g
    if (sum(cases) < 2) var.cov <- var.cov + 0  else
      var.cov <- var.cov + ( sum( cases )/length( data[,1] ) )*cov( data[cases,-1] )
  }
  var.cov
}

# Here are a pair of functions that determine the 
# Sigma given by (13.9) on p. 477 of HTF2.  (Based 
# on comment in HTF2, the default value of epsilon 
# is 1.)  

between.cov <- function(data)
# 1st col. of data set should be class variable.
# (Classes should be specified using consecutive integers.)
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  n.class <- last.class - first.class + 1
  n.var <- length(data[1,]) - 1
  var.means <- matrix(0, nrow=n.var, ncol=n.class)
  var.sums <- matrix(0, nrow=n.var, ncol=n.class)
  class.n <- numeric(n.class)
  for (g in first.class:last.class)
  {
    cases <- data[,1] == g
    class.n[g-first.class+1] <- sum(cases)
    classdata <- data[cases,]
    var.means[,g-first.class+1] <- apply(classdata[,-1],2,mean)
    var.sums[,g-first.class+1] <- apply(classdata[,-1],2,sum)
  }
  # var.means now contains the mean vectors for the
  # classes in its columns ... now I'll subtract the
  # grand mean from each column and put the result in
  # mean.deviations
  grand.mean <- apply(var.sums,1,sum)/sum(class.n)
  mean.deviations <- var.means - grand.mean
  # now I'll put the matrix given by (13.10) on p. 479
  # of HTF2, which is also the matrix B on p. 477 of
  # HTF2, in B
  B <- matrix(0, nrow=n.var, ncol=n.var)
  for (g in first.class:last.class)
  {
    col.ind <- g - first.class + 1
    B <- B + 
         class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind]
  }
  B <- B/sum(class.n)
  B
}

DANN.Sig <- function( data, epsilon=1 )
{
  w.neg.half <- sqrt.inv.mat( pooled.within.cov( data ) )
  b <- between.cov( data )
  b.star <- w.neg.half%*%b%*%w.neg.half
  n.var <- length( data[1,] ) - 1
  sig <- w.neg.half%*%( b.star + diag(epsilon, n.var) )%*%w.neg.half
}

local.Sig <- function( target.point, data, num.neighbors=num.neighbors )
{
  # data should have class indicator in 1st column, 
  # and predictor variables in remaining columns.
  # target.point should just have coordinates of
  # target point.  Based on a comment in HFT2, the
  # default number of nearest neighbors used is 50.
  distances <- as.matrix(dist(rbind( target.point, data[,-1])))
  closest.training.points <- rank( distances[-1,1] ) < num.neighbors + 1
  loc.sig <- DANN.Sig( data[closest.training.points,] )
  loc.sig
}

# This function determines the distances given by 
# (13.8) on p. 477 of HTF2.

local.dist <- function( target.point, data, num.neighbors=num.neighbors )
{
  x.mat <- t( data[,-1] )  # class variable should 
                           # be in 1st col of data
  diffs <- x.mat - as.vector( t( target.point ) )
  distances <- diag( ( t( diffs )%*%local.Sig( target.point, data, num.neighbors ) )%*%diffs )
  distances
}

# This function determines predicted class of a target point.

pred.class <- function( target.point, data, num.neighbors=num.neighbors, k=k )
{
  dists <- local.dist( target.point, data, num.neighbors )
  near.neighbors <- rank( dists ) < k + 1
  first.class <- min( data[,1] )
  last.class <- max( data[,1] )
  num.classes <- last.class - first.class + 1
  classes <- numeric(num.classes)
  for (g in first.class:last.class )
    classes[g - first.class + 1] <- sum( data[near.neighbors,1] == g  )
  pred <- which.max( classes ) + first.class - 1
  pred
}

DANN <- function( train.data, test.data, num.neighbors=50, k=15 )
{
  num.cases <- length( test.data[,1] )
  predictions <- numeric(num.cases)
  for (i in 1:num.cases )
    predictions[i] <- pred.class( test.data[i,-1], train.data, num.neighbors=num.neighbors, k=k )
  predictions
}

pred <- DANN( trndat, gendat )
DANN.err <- mean(pred != gendat[,1])

DANN.err
# error rate for DANN method
# Best so far!

# I'll see if increasing the number of cases used to determine 
# "local shape" and increasing the number of nearest neighbors
# used for classification (since default is 15, and for ordinary
# KNN, c-v indicated using k = 75 is good) improves the results
# (which can only occur if there are no errors).

pred <- DANN( trndat, gendat, 100, 45 )
DANN.err <- mean(pred != gendat[,1])

DANN.err
# Worse, but still as good as SVM and KNN.

##### END #####