# Example of Global Dimension Reduction Method for
# Nearest Neighbors from Subsection 13.4.2 of HFT2,
# and Support Vector Machines from Ch. 12.

# Developed using 2.9.0.


library(MASS)     
library(class)  
library(kknn)     # for kknn and train.kknn f'ns (for KNN)
library(klaR)     # for stepclass f'n (for stepwise LDA)
library(e1071)    # for svm f'n (for support vector machines)
library(mvtnorm)  # for rmvnorm and dmvnorm (for generation 
                  # of multivariate normal observations and
                  # computation of multivariate normal pdf 
                  # values)
    
set.seed(7)     


# I'll generate data from four classes, using a multivariate
# normal distribution for each class.  The covariance matrix
# will be the same for all of the classes.  The means will 
# differ in the first two dimensions, but not the other six
# dimensions.  (So six of the eight predictor variables are
# pure noise.)

Sig <- diag(1,8)  
Sig[1,2] <- 0.5
Sig[2,1] <- 0.5
Sig
# Sig is a variance-covariance matrix.  

Oxvals <- rmvnorm(100, mean=c(-1.5,-1,0,0,0,0,0,0), sigma=Sig) # O is used for the Orange class.
Gxvals <- rmvnorm(100, mean=c(-0.75,-0.5,0,0,0,0,0,0), sigma=Sig) # G is used for the Green class.
Mxvals <- rmvnorm(100, mean=c(0.75,1,0,0,0,0,0,0), sigma=Sig) # M is used for the Maroon class.
Bxvals <- rmvnorm(100, mean=c(1.5,1.5,0,0,0,0,0,0), sigma=Sig) # B is used for the Blue class.

plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data (nonnoise variables)',
     xlab='x1',ylab='x2',xlim=c(-4.5,4.5),ylim=c(-4.5,4.5))
points(Gxvals[,1], Gxvals[,2], col="limegreen")
points(Mxvals[,1], Mxvals[,2], col="maroon")
points(Bxvals[,1], Bxvals[,2], col="royalblue")

# There's quite a bit of overlap.

#####

# I'll give response value of 0 for Orange class observations, 
# 1 for Green class observations, 2 for the Maroon class
# observations, and 3 for the Blue class observations, and then 
# combine them together to form the training data.  Then I'll 
# generate 1000 observations for each class to serve as the 
# generalization sample.

Oclass <- cbind(0, Oxvals)
Gclass <- cbind(1, Gxvals)
Mclass <- cbind(2, Mxvals)
Bclass <- cbind(3, Bxvals)
trndat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass))
names(trndat) <- c("g", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8")  

Oxvals <- rmvnorm(1000, mean=c(-1.5,-1,0,0,0,0,0,0), sigma=Sig) # O is used for the Orange class.
Gxvals <- rmvnorm(1000, mean=c(-0.75,-0.5,0,0,0,0,0,0), sigma=Sig) # G is used for the Green class.
Mxvals <- rmvnorm(1000, mean=c(0.75,1,0,0,0,0,0,0), sigma=Sig) # M is used for the Maroon class.
Bxvals <- rmvnorm(1000, mean=c(1.5,1.5,0,0,0,0,0,0), sigma=Sig) # B is used for the Blue class.

Oclass <- cbind(0, Oxvals)
Gclass <- cbind(1, Gxvals)
Mclass <- cbind(2, Mxvals)
Bclass <- cbind(3, Bxvals)
gendat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass))
names(gendat) <- c("g", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8")  

#####

# I'll obtain an estimate of the Bayes error 
# rate using the large generalization sample.

Opdf <- dmvnorm(gendat[,2:9], c(-1.5,-1,0,0,0,0,0,0), Sig) 
Gpdf <- dmvnorm(gendat[,2:9], c(-0.75,-0.5,0,0,0,0,0,0), Sig)
Mpdf <- dmvnorm(gendat[,2:9], c(0.75,1,0,0,0,0,0,0), Sig)
Bpdf <- dmvnorm(gendat[,2:9], c(1.5,1.5,0,0,0,0,0,0), Sig)

pred.bayes <- ifelse(Gpdf > Mpdf, 1, 2)
pred.bayes <- ifelse(Opdf > Gpdf & Opdf > Bpdf & Opdf > Mpdf, 0, pred.bayes)
pred.bayes <- ifelse(Bpdf > Opdf & Bpdf > Gpdf & Bpdf > Mpdf, 3, pred.bayes)
est.bayes.rate <- mean(pred.bayes != gendat[,1])

est.bayes.rate
# This is the estimated error rate of the Bayes classifier.

# Below is a confusion matrix, with true class for the 
# rows and predicted class for the columns.  The Green
# and Maroon classes are the most difficult one to 
# correctly predict.

table(gendat[,1], pred.bayes)

#####

# Before trying the method from subsection 13.4.2 of HTF2
# I'll try LDA using all of the variables, and then step-
# wise LDA.

lda.all <- lda(g ~ ., trndat)
pred.lda.all <- predict(lda.all, newdata=gendat)
lda.all.rate <- mean( pred.lda.all$class != gendat[,1] )

lda.all.rate
# LDA rate (all predictors used).

ldafit <- stepclass(g ~ ., data=trndat, method="lda")
ldafit

# Stepwise LDA selects only x2.

lda.step <- lda(g ~ x2, trndat)
pred.lda.step <- predict(lda.step, newdata=gendat)
lda.step.rate <- mean( pred.lda.step$class != gendat[,1] )

lda.step.rate
# LDA rate (using predictor chosen by stepwise procedure).
# Not good!  Stepwise LDA failed us.

#####

# Now I'll try ordinary nearest neighbors, using 
# train.kknn to selct a good value of k.

cvknn <- train.kknn(as.factor(g) ~ ., trndat, 
          kmax=150, kernel="rectangular", distance=2) 

plot(c(1:150),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.4,0.65),col="maroon")
abline(est.bayes.rate, 0, col="purple")
legend(100,0.65,legend=c("cross-val","Bayes"),
       fill=c("maroon","purple"))

which.min(cvknn$MISCLASS)

# Not looking good.  I'll go with k = 11.

pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=11,kernel="rectangular")
gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1])

gen.err.knn
# Error rate for 11 nearest neighbors.
# (Not good.)

#####

# Now I'll try the method from subsection 13.4.2 of HTF2.

GlobDimRed <- function(traindata)
# 1st col. of data set should be class variable.
# (Classes should be specified using consecutive integers.)
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  n.class <- last.class - first.class + 1
  n.var <- length(traindata[1,]) - 1
  var.means <- matrix(0, nrow=n.var, ncol=n.class)
  var.sums <- matrix(0, nrow=n.var, ncol=n.class)
  class.n <- numeric(n.class)
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    class.n[g - first.class + 1] <- sum(cases)
    classdata <- traindata[cases,]
    var.means[,g - first.class + 1] <- apply(classdata[,-1],2,mean)
    var.sums[,g - first.class + 1] <- apply(classdata[,-1],2,sum)
  }
  # var.means now contains the mean vectors for the
  # classes in its columns ... now I'll subtract the
  # grand mean from each column and put the result in
  # mean.deviations
  grand.mean <- apply(var.sums,1,sum)/sum(class.n)
  mean.deviations <- var.means - grand.mean
  # now I'll put the matrix given by (13.10) on p. 479
  # of HTF2, which is also the matrix B on p. 477 of
  # HTF2, in B
  B <- matrix(0, nrow=n.var, ncol=n.var)
  for (g in first.class:last.class)
  {
    col.ind <- g - first.class + 1
    B <- B + 
         class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind]
  }
  B <- B/sum(class.n)
  # I'll put the eigenvalues and eigned vectors 
  # of B in ev, and return this object
  ev <- eigen(B)
}

dim.red <- GlobDimRed(trndat)
dim.red

# Compared to the first three eignevalues, the others
# are extremely small, and the 2nd and 3rd are much smaller 
# than the first.  This suggests that a good approximating 
# subspace may be the one-dimensional subspace spanned by 
# the 1st eignevector, or the three-dimensional subspace
# spanned by the first three eigenvectors.

#####

# I'll create new data sets having predictors created 
# from the eignevectors.  If just the first predictor
# is used, it'll be as though we're using a predictor
# obtained from projecting the original predictors into
# the one-dimensional subspace spanned by the first
# eigenvector.

new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%dim.red$vector))
names(new.trndat) <- c("g","ev1","ev2","ev3","ev4","ev5","ev6","ev7","ev8")

new.gendat <- data.frame(cbind(gendat[,1], as.matrix(gendat[,-1])%*%dim.red$vector))
names(new.gendat) <- c("g","ev1","ev2","ev3","ev4","ev5","ev6","ev7","ev8")

# I'll first try the best one-dimensional approximating
# subspace, using train.kknn to identify a good choice 
# of k to use for classifying the test set cases using 
# the training data.
# (NOTE: If we wanted to use a subspace of dimension
# greater than 1, train.kknn may not be good to use
# since it automatically scales the predictors, which 
# would give too much influence to the less important
# predictors.  I.e., I think it'll work better if we
# can give the first eigenvector the most influence in 
# determining the nearest neighbors, but if the predictors 
# are scaled this won't be the case.) 

new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], 
          kmax=150, kernel="rectangular", distance=2)

plot(c(1:150),new.cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.4,0.7),col="maroon")
abline(est.bayes.rate, 0, col="purple")
legend(108,0.7,legend=c("cross-val","Bayes"),
       fill=c("maroon","purple"))

which.min(new.cvknn$MISCLASS)

# Let's see what the error rate is if we use 
# 40 nearest neighbors in the one-dimensional
# subspace.

  pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2],test=new.gendat[,1:2],
                       k=43,kernel="rectangular")
  gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1])

gen.err.new
# Error rate for 40 nearest neighbors (in 1-d subspace).
# A decent improvement over ordinary nearest neighbors 
# using all of the variables, but not as good as LDA 
# (using all of the variables).  (Of course the data was
# generated in such a way as to make LDA well-suited for it.)

#####

# Now I'll try using the best approximating three-dimensional
# subspace.  From below, it can be noted that the sample 
# variances of the new variables aren't all the same (even 
# though each eigenvector is of length 1).

apply( new.trndat[,-1], 2, var )  

# I won't use train.kknn to select a value of k
# because scaling the three predictors would give the one based
# on the second and third eigenvalues too much influence.  Instead I'll
# just keep k at 40 (thinking that the 2nd and 3rd predictors won't
# change things that much (but hoping that it changes things 
# a bit for the better)).  I'll use the knn function instead 
# of the kknn function because knn doesn't scale the predictors.

pred.gen.newest <- knn(new.trndat[,2:4],test=new.gendat[,2:4],
                       cl=new.trndat[,1],k=40)
gen.err.newest <- mean(pred.gen.newest != new.gendat[,1])

gen.err.newest
# Error rate for 40 nearest neighbors (in 3-d subspace).
# Just a tad better.

##### 

# Now I will try some support vector machines.  
# The default kernel is the radial kernel.  
# Using the svm function, one has to supply a 
# value for the gamma parameter, or else rely
# on the default, which is the inverse of the 
# number of predictors, or 1/8 in this case.
# The help page stresses that one should tune,
# and so a variety of gamma values should be 
# considered.  The svm function has a cross-
# validation feature which can be used to obtain 
# an estimate of generalization error rate from
# the training sample for the particular value of 
# gamma which is currently being used.  However, 
# even if the same random number seed is used,
# the c-v estimates for a particular value of
# gamma can differ from trial to trial.  This 
# prevents from doing a "trial-and-error" search 
# for a good gamma and making comments here, 
# because when this script is run I can't know 
# in advance what the outcoems will be.  So I'll 
# just try a bunch of different gamma values 
# and obtain c-v estimates of the error rates, as 
# well as estimates of the error rates using the 
# fitted models to make predictions for the large
# generalization sample.

num.trials <- 8
gamma.val <- c(1/16, 1/8, 3/16, 1/4, 5/16, 3/8, 7/16, 1/2)
error.cv <- numeric(8)
error.gen <- numeric(8)
for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, gamma=gamma.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) )
}
error.cv <- 1 - error.cv/100

plot(gamma.val,error.gen,type="l",xlab='gamma',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, radial kernel)',
      ylim=c(0.4,0.70),col="seagreen")
points(gamma.val,error.cv,type="l",col="tomato")
abline(est.bayes.rate, 0, col="purple")
legend(0.15,0.7,legend=c("cross-val","generalization","Bayes"),
       fill=c("tomato","seagreen","purple"))

#####

# Now I'll try using the polynomial kernel.  To tune,
# I'll start by considering various values for the 
# degree, keeping the other parameters at their default
# settings.

svm.poly2 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=2)
summary(svm.poly2)

svm.poly3 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=3)
summary(svm.poly3)

svm.poly4 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=4)
summary(svm.poly4)

svm.poly5 <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial", degree=5)
summary(svm.poly4)

# C-v results indicate setting the degree to 3 
# works best, but it doesn't look real promising.
# Nevertheless, I'll try some different values of
# gamma to see if I can get improvement.

for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="polynomial",
                  degree=3, gamma=gamma.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendat) != as.factor(gendat[,1]) )
}
error.cv <- 1 - error.cv/100

plot(gamma.val,error.gen,type="l",xlab='gamma',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 3 kernel)',
      ylim=c(0.4,0.85),col="seagreen")
points(gamma.val,error.cv,type="l",col="tomato")
abline(est.bayes.rate, 0, col="purple")
legend(0.15,0.85,legend=c("cross-val","generalization","Bayes"),
       fill=c("tomato","seagreen","purple"))

#####

# Before quitting, I'll try the linear kernel.

svm.lin <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="linear")
summary(svm.lin)

# The c-v estimate isn't promising, but I'll check the
# performance on the generalization set anyway.

mean( predict(svm.lin, gendat) != as.factor(gendat[,1]) )

# Better than the c-v indicated it would be.
# In fact, except for LDA, this is our best
# result.  Let's look at a plot to see if we 
# can figure out why. 

plot(svm.lin, trndat, x2 ~ x1)

# Could it be that the linear kernel leads to a fit
# similar to the LDA fit having linear boundaries?

##### End #####