# Example of Global Dimension Reduction Method for
# Nearest Neighbors from Subsection 13.4.2 of HFT2.

# Developed using 2.9.0.


library(MASS)     
library(class)  
library(kknn)     # for kknn and train.kknn f'ns (for KNN)
library(klaR)     # for stepclass f'n (for stepwise LDA)
library(mvtnorm)  # for rmvnorm and dmvnorm (for generation 
                  # of multivariate normal observations and
                  # computation of multivariate normal pdf 
                  # values)
    
set.seed(7)     


# I'll generate data from three classes, using a multivariate
# normal distribution for each class.  The covariance matrix
# will be the same for all of the classes.  The means will 
# differ in the first two dimensions, but not the other two
# dimensions.  (So two of the four predictor variables are
# pure noise.)

Sig <- diag(1,4)  # Sig is a variance-covariance matrix.  
Oxvals <- rmvnorm(100, mean=c(0,1,0,0), sigma=Sig) # O is used for the Orange class.
Gxvals <- rmvnorm(100, mean=c(0,0,0,0), sigma=Sig) # G is used for the Green class.
Bxvals <- rmvnorm(100, mean=c(1,0,0,0), sigma=Sig) # B is used for the Blue class.

plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data (nonnoise variables)',
     xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4))
points(Gxvals[,1], Gxvals[,2], col="limegreen")
points(Bxvals[,1], Bxvals[,2], col="royalblue")

# There's quite a bit of overlap.

#####

# I'll give response value of 0 for Orange class observations, 
# 1 for Green class observations, and 2 for the Blue class
# observations, and then combine them together to form the 
# training data.  Then I'll generate 1000 observations for each 
# class to serve as the generalization sample.

Oclass <- cbind(0, Oxvals)
Gclass <- cbind(1, Gxvals)
Bclass <- cbind(2, Bxvals)
trndat <- data.frame(rbind(Oclass, Gclass, Bclass))
names(trndat) <- c("g", "x1", "x2", "x3", "x4")  

Oxvals <- rmvnorm(1000, mean=c(0,1,0,0), sigma=Sig)
Gxvals <- rmvnorm(1000, mean=c(0,0,0,0), sigma=Sig) 
Bxvals <- rmvnorm(1000, mean=c(1,0,0,0), sigma=Sig) 

Oclass <- cbind(0, Oxvals)
Gclass <- cbind(1, Gxvals)
Bclass <- cbind(2, Bxvals)
gendat <- data.frame(rbind(Oclass, Gclass, Bclass))
names(gendat) <- c("g", "x1", "x2", "x3", "x4")  

#####

# I'll obtain an estimate of the Bayes error 
# rate using the large generalization sample.

Opdf <-rep(0,3000)
Gpdf <-rep(0,3000)
Bpdf <-rep(0,3000)
Opdf <- Opdf + dmvnorm(gendat[,2:5], c(0,1,0,0), Sig) 
Gpdf <- Gpdf + dmvnorm(gendat[,2:5], c(0,0,0,0), Sig)
Bpdf <- Bpdf + dmvnorm(gendat[,2:5], c(1,0,0,0), Sig)
pred.bayes <- ifelse(Gpdf > Bpdf, 1, 2)
pred.bayes <- ifelse(Opdf > Gpdf & Opdf > Bpdf, 0, pred.bayes)
est.bayes.rate <- mean(pred.bayes != gendat[,1])

est.bayes.rate
# This is the estimated error rate of the Bayes classifier.

# Below is a confusion matrix, with true class for the 
# rows and predicted class for the columns.  The Green
# class is the most difficult one to correctly predict.

table(gendat[,1], pred.bayes)

#####

# Before trying the method from subsection 13.4.2 of HTF2
# I'll try LDA using all of the variables, and then step-
# wise LDA.

lda.all <- lda(g ~ ., trndat)
pred.lda.all <- predict(lda.all, newdata=gendat)
lda.all.rate <- mean( pred.lda.all$class != gendat[,1] )

lda.all.rate
# LDA rate (all predictors used).

ldafit <- stepclass(g ~ ., data=trndat, method="lda")
ldafit

# Stepwise LDA selects the two nonnoise variables.

lda.step <- lda(g ~ x1 + x2, trndat)
pred.lda.step <- predict(lda.step, newdata=gendat)
lda.step.rate <- mean( pred.lda.step$class != gendat[,1] )

lda.step.rate
# LDA rate (using predictors chosen by stepwise procedure).

# Eliminating the noise variables had little effect.  
# Whether or not they are used, LDA gets close to the
# optimal Bayes error rate.

#####

# Now I'll try ordinary nearest neighbors, using 
# train.kknn to select a good value of k.

cvknn <- train.kknn(as.factor(g) ~ ., trndat, 
          kmax=150, kernel="rectangular", distance=2) 

plot(c(1:150),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.4,0.65),col="maroon")
abline(est.bayes.rate, 0, col="purple")
legend(100,0.65,legend=c("cross-val","Bayes"),
       fill=c("maroon","purple"))

which.min(cvknn$MISCLASS)

# We could go with the minimizing value of k, 87, 
# or we could mentally smooth the curve and select
# a value like 105.

#####

# Because the indicated value of k is curiously large,
# and because the c-v results suggest getting close to 
# the Bayes rate is possible, I obtained estimates of  
# the generalization errors to see how KNN performs for 
# all values of k up to 150.  I won't include that part
# here because it takes a relatively long time to run,
# but I can report that for many large values of k the
# estimated error rate was pretty close to the Bayes rate.

pred.gen.105nn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=105,kernel="rectangular")
gen.err.105nn <- mean(pred.gen.105nn$fit != gendat[,1])

pred.gen.87nn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=187,kernel="rectangular")
gen.err.87nn <- mean(pred.gen.87nn$fit != gendat[,1])

gen.err.105nn
# Error rate for 105 nearest neighbors.

gen.err.87nn
# Error rate for 87 nearest neighbors.
# (Close to LDA rate and the Bayes rate.)

# Note: The estimated error rate for k = 65 
# is 0.431 (same as estimated Bayes rate).

#####

# Now I'll try the method from subsection 13.4.2 of HTF2.
# Given that both LDA and ordinary nearest neighbors using 
# all of the predictors work so well, it'll be disappointing 
# if this method doesn't also work well.

GlobDimRed <- function(traindata)
# 1st col. of data set should be class variable.
# (Classes should be specified using consecutive integers.)
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  n.class <- last.class - first.class + 1
  n.var <- length(traindata[1,]) - 1
  var.means <- matrix(0, nrow=n.var, ncol=n.class)
  var.sums <- matrix(0, nrow=n.var, ncol=n.class)
  class.n <- numeric(n.class)
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    class.n[g - first.class + 1] <- sum(cases)
    classdata <- traindata[cases,]
    var.means[,g - first.class + 1] <- apply(classdata[,-1],2,mean)
    var.sums[,g - first.class + 1] <- apply(classdata[,-1],2,sum)
  }
  # var.means now contains the mean vectors for the
  # classes in its columns ... now I'll subtract the
  # grand mean from each column and put the result in
  # mean.deviations
  grand.mean <- apply(var.sums,1,sum)/sum(class.n)
  mean.deviations <- var.means - grand.mean
  # now I'll put the matrix given by (13.10) on p. 479
  # of HTF2, which is also the matrix B on p. 477 of
  # HTF2, in B
  B <- matrix(0, nrow=n.var, ncol=n.var)
  for (g in first.class:last.class)
  {
    col.ind <- g - first.class + 1
    B <- B + 
         class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind]
  }
  B <- B/sum(class.n)
  # I'll put the eigenvalues and eigned vectors 
  # of B in ev, and return this object
  ev <- eigen(B)
}

dim.red <- GlobDimRed(trndat)
dim.red

# Compared to the 1st two eignevalues, the other two
# eigenvalues are extremely small.  This suggests 
# that a good approximating subspace is the one-
# dimensional subspace spanned by the 1st eignevector
# (which is given by the first column in the matrix
# shown just above), or the two-dimensional subspace
# spanned by the first two eigenvectors.

#####

# I'll create new data sets having predictors created 
# from the eignevectors.  If just the first predictor
# is used, it'll be as though we're using a predictor
# obtained from projecting the original predictors into
# the one-dimensional subspace spanned by the first
# eigenvector.

new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%ndat[,-1])%*%dim.red$vector))
names(new.gendat) <- c("g", "ev1", "ev2", "ev3", "ev4")

# I'll use train.kknn to identify a good choice of k
# to use for classifying the test set cases using the 
# training data.
# (NOTE: If we wanted to use a subspace of dimension
# greater than 1, train.kknn would not be good to use
# since it automatically scales the predictors, which 
# would give too much influence to the less important
# predictors.  I.e., without scaling, the first eigen-
# vector has the most influence in determining the 
# nearest neighbors, but if the predictors are scaled
# this won't be the case.) 

new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], 
          kmax=150, kernel="rectangular", distance=2)

plot(c(1:150),new.cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.4,0.7),col="maroon")
abline(est.bayes.rate, 0, col="purple")
legend(108,0.52,legend=c("cross-val","Bayes"),
       fill=c("maroon","purple"))

which.min(new.cvknn$MISCLASS)

# Let's see what the error rate is if we use 
# 19 nearest neighbors in the one-dimensional
# subspace.

  pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2],
                                      test=new.gendat[,1:2],
                                      k=19,kernel="rectangular")
  gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1])

gen.err.new
# Error rate for 19 nearest neighbors (in subspace).

# Quite horrible!

##### END #####