# Methods for Variable Selection / Weighting
# Part II


# Developed using 2.9.0.


library(MASS)
library(class)  
library(kknn)     # for kknn and train.kknn f'ns (for KNN)
library(klaR)     # for stepclass f'n (for stepwise LDA)
library(e1071)    # for svm f'n (for support vector machines)
library(mvtnorm)  # for rmvnorm and dmvnorm (for generation 
                  # of multivariate normal observations and
                  # computation of multivariate normal pdf 
                  # values)
    
set.seed(7)     


# I'll generate data for four classes.  The means will 
# differ in the first two dimensions, but not the other three
# dimensions.  (So three of the five predictor variables are
# pure noise.)

Sig <- diag(1,5)  
Sig[1,2] <- 0.6
Sig[2,1] <- 0.6
Sig[3,4] <- 0.4
Sig[4,3] <- 0.4

Oxvals <- rmvnorm(100, mean=c(3.5,3,5,5,5), sigma=Sig) # O is used for the Orange class.
Gxvals <- (0.8*rmvnorm(100, mean=c(3.25,3.75,6.25,6.25,6.25), sigma=Sig)) # G is used for the Green class.
Mxvals <- (0.8*rmvnorm(100, mean=c(4.75,4,6.25,6.25,6.25), sigma=Sig)) # M is used for the Maroon class.
Bxvals <- rmvnorm(100, mean=c(5.5,5.5,5,5,5), sigma=Sig) # B is used for the Blue class.
Gxvals[,2] <- Gxvals[,2]^0.75
Mxvals[,2] <- Mxvals[,2]^1.5

plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data (nonnoise variables)',
     xlab='x1',ylab='x2',xlim=c(-1,9),ylim=c(-1,12.5))
points(Gxvals[,1], Gxvals[,2], col="limegreen")
points(Mxvals[,1], Mxvals[,2], col="maroon")
points(Bxvals[,1], Bxvals[,2], col="royalblue")

#####

# I'll give response value of 1 for Orange class observations, 
# 2 for Green class observations, 3 for the Maroon class
# observations, and 4 for the Blue class observations, and then 
# combine them together to form the training data.  Then I'll 
# generate 250 observations for each class to serve as the 
# generalization sample.

Oclass <- cbind(1, Oxvals)
Gclass <- cbind(2, Gxvals)
Mclass <- cbind(3, Mxvals)
Bclass <- cbind(4, Bxvals)
trndat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass))
names(trndat) <- c("g", "x1", "x2", "x3", "x4", "x5")  
trndat[,5] <- trndat[,5]^0.75
trndat[,6] <- trndat[,6]^1.5

Oxvals <- rmvnorm(250, mean=c(3.5,3,5,5,5), sigma=Sig) # O is used for the Orange class.
Gxvals <- (0.8*rmvnorm(250, mean=c(3.25,3.75,6.25,6.25,6.25), sigma=Sig)) # G is used for the Green class.
Mxvals <- (0.8*rmvnorm(250, mean=c(4.75,4,6.25,6.25,6.25), sigma=Sig)) # M is used for the Maroon class.
Bxvals <- rmvnorm(250, mean=c(5.5,5.5,5,5,5), sigma=Sig) # B is used for the Blue class.

Oclass <- cbind(1, Oxvals)
Gxvals[,2] <- Gxvals[,2]^0.75
Gclass <- cbind(2, Gxvals)
Mxvals[,2] <- Mxvals[,2]^1.5
Mclass <- cbind(3, Mxvals)
Bclass <- cbind(4, Bxvals)
gendat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass))
names(gendat) <- c("g", "x1", "x2", "x3", "x4", "x5")  
gendat[,5] <- gendat[,5]^0.75
gendat[,6] <- gendat[,6]^1.5

#####

# Before trying the method from subsection 13.4.2 of HTF2
# I'll try LDA using all of the variables, and then step-
# wise LDA.

lda.all <- lda(g ~ ., trndat)
pred.lda.all <- predict(lda.all, newdata=gendat)
lda.all.rate <- mean( pred.lda.all$class != gendat[,1] )

lda.all.rate
# LDA rate (all predictors used).

ldafit <- stepclass(g ~ ., data=trndat, method="lda")
ldafit

# Stepwise LDA (correctly) selects the two meaningful variables.

lda.step <- lda(g ~ x1 + x2, trndat)
pred.lda.step <- predict(lda.step, newdata=gendat)
lda.step.rate <- mean( pred.lda.step$class != gendat[,1] )

lda.step.rate
# LDA rate (using predictor chosen by stepwise procedure).
# A tad worse!

##### 

# I'll now try SVM using the linear kernel.

svm.lin <- svm( as.factor(g) ~ ., trndat, cross=10, kernel="linear")
summary(svm.lin)

mean( predict(svm.lin, gendat) != as.factor(gendat[,1]) )

plot(svm.lin, trndat, x2 ~ x1)

#####

# Now I'll try QDA.

qda.all <- qda(g ~ ., trndat)
pred.qda.all <- predict(qda.all, newdata=gendat)
qda.all.rate <- mean( pred.qda.all$class != gendat[,1] )

qda.all.rate
# QDA rate (all predictors used).

qdafit <- stepclass(g ~ ., data=trndat, method="qda")
qdafit

# Stepwise QDA (correctly) selects the 2 useful variables.

qda.step <- qda(g ~ x1 + x2, trndat)
pred.qda.step <- predict(qda.step, newdata=gendat)
qda.step.rate <- mean( pred.qda.step$class != gendat[,1] )

qda.step.rate
# QDA rate (using predictor chosen by stepwise procedure).
# Just a wee tad better (2 more correct out of 1000).

#####

# Now I'll try ordinary nearest neighbors, using 
# train.kknn to selct a good value of k.

cvknn <- train.kknn(as.factor(g) ~ ., trndat, 
          kmax=100, kernel="rectangular", distance=2) 

plot(c(1:100),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.25,0.45),col="tomato")

which.min(cvknn$MISCLASS)

# I'll go with k = 19.

pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=19,kernel="rectangular")
gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1])

gen.err.knn
# Error rate for 19 nearest neighbors.
# Worst so far.

# I'll now try using k = 40.

pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=40,kernel="rectangular")
gen.err.knn <- mean(pred.gen.knn$fit != gendat[,1])

gen.err.knn
# Error rate for 40 nearest neighbors.
# Even worse.

#####

# Now I'll try the method from subsection 13.4.2 of HTF2.

GlobDimRed <- function(traindata)
# 1st col. of data set should be class variable.
# (Classes should be specified using consecutive integers.)
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  n.class <- last.class - first.class + 1
  n.var <- length(traindata[1,]) - 1
  var.means <- matrix(0, nrow=n.var, ncol=n.class)
  var.sums <- matrix(0, nrow=n.var, ncol=n.class)
  class.n <- numeric(n.class)
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    class.n[g - first.class + 1] <- sum(cases)
    classdata <- traindata[cases,]
    var.means[,g - first.class + 1] <- apply(classdata[,-1],2,mean)
    var.sums[,g - first.class + 1] <- apply(classdata[,-1],2,sum)
  }
  # var.means now contains the mean vectors for the
  # classes in its columns ... now I'll subtract the
  # grand mean from each column and put the result in
  # mean.deviations
  grand.mean <- apply(var.sums,1,sum)/sum(class.n)
  mean.deviations <- var.means - grand.mean
  # now I'll put the matrix given by (13.10) on p. 479
  # of HTF2, which is also the matrix B on p. 477 of
  # HTF2, in B
  B <- matrix(0, nrow=n.var, ncol=n.var)
  for (g in first.class:last.class)
  {
    col.ind <- g - first.class + 1
    B <- B + 
         class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind]
  }
  B <- B/sum(class.n)
  # I'll put the eigenvalues and eigned vectors 
  # of B in ev, and return this object
  ev <- eigen(B)
}

dim.red <- GlobDimRed(trndat)
dim.red

# Compared to the first two eignevalues, the others
# are extremely small, and the 2nd is much smaller 
# than the first.  This suggests that a good approximating 
# subspace may be the one-dimensional subspace spanned by 
# the 1st eignevector, or possibly a two-dimensional subspace
# spanned by the first two eigenvectors.

#####

# I'll create new data sets having predictors created 
# from the eignevectors.  If just the first predictor
# is used, it'll be as though we're using a predictor
# obtained from projecting the original predictors into
# the one-dimensional subspace spanned by the first
# eigenvector.

new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%dim.red$vector))
names(new.trndat) <- c("g","ev1","ev2","ev3","ev4","ev5")

new.gendat <- data.frame(cbind(gendat[,1], as.matrix(gendat[,-1])%*%dim.red$vector))
names(new.gendat) <- c("g","ev1","ev2","ev3","ev4","ev5")

# I'll first try the best one-dimensional approximating
# subspace, using train.kknn to identify a good choice 
# of k to use for classifying the test set cases using 
# the training data.
#
# (NOTE: If we wanted to use a subspace of dimension
# greater than 1, train.kknn may not be good to use
# since it automatically scales the predictors, which 
# would give too much influence to the less important
# predictors.  I.e., I think it'll work better if we
# can give the first eigenvector the most influence in 
# determining the nearest neighbors, but if the predictors 
# are scaled this won't be the case.) 

new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[,1:2], 
          kmax=150, kernel="rectangular", distance=2)

plot(c(1:150),new.cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.35,0.55),col="maroon")

which.min(new.cvknn$MISCLASS)

# Let's see what the error rate is if we use 
# 25 nearest neighbors in the one-dimensional
# subspace.

  pred.gen.new <- kknn(as.factor(g)~.,train=new.trndat[,1:2],test=new.gendat[,1:2],
                       k=25,kernel="rectangular")
  gen.err.new <- mean(pred.gen.new$fit != new.gendat[,1])

gen.err.new
# Error rate for 25 nearest neighbors (in 1-d subspace).
# New worst so far.

#####

# Now I'll try using the best approximating two-dimensional subspace.  

# I won't use train.kknn to select a value of k
# because scaling the two predictors would give the one based
# on the second eigenvalue too much influence.  Instead I'll
# just keep k at 25 (thinking that the 2nd predictor won't
# change things that much (but hoping that it changes things 
# a bit for the better)).  I'll use the knn function instead 
# of the kknn function because knn doesn't scale the predictors.

pred.gen.newest <- knn(new.trndat[,2:3],test=new.gendat[,2:3],
                       cl=new.trndat[,1],k=25)
gen.err.newest <- mean(pred.gen.newest != new.gendat[,1])

gen.err.newest
# Error rate for 25 nearest neighbors (in 2-d subspace).
# Slightly better than KNN based on all variables.

#####

# Now for the DANN method (see p. 477 of HTF2).

# Here is a function to find square root of 
# the inverse of a matrix.

sqrt.inv.mat <- function( mat )  # won't work for all matrices
{
  eigen.mat <- eigen( mat )
  q <- eigen.mat$vectors
  inv.sqrt.lamb <- diag( sqrt( 1/eigen.mat$values ) )
  sr <- ( q%*%inv.sqrt.lamb )%*%t(q)
  sr
}

# Here is a function to find the the pooled
# within-class covariance matrix (see p. 477
# of HTF2).  First column of data matrix 
# should have groups identified with consecutive
# integers.

pooled.within.cov <- function( data )
{
  first.class <- min( data[,1] )
  last.class <- max( data[,1] )
  var.cov <- 0
  for ( g in first.class:last.class )
  {
    cases <- data[,1] == g
    if (sum(cases) < 2) var.cov <- var.cov + 0  else
      var.cov <- var.cov + ( sum( cases )/length( data[,1] ) )*cov( data[cases,-1] )
  }
  var.cov
}

# Here are a pair of functions that determine the 
# Sigma given by (13.9) on p. 477 of HTF2.  (Based 
# on comment in HTF2, the default value of epsilon 
# is 1.)  

between.cov <- function(data)
# 1st col. of data set should be class variable.
# (Classes should be specified using consecutive integers.)
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  n.class <- last.class - first.class + 1
  n.var <- length(data[1,]) - 1
  var.means <- matrix(0, nrow=n.var, ncol=n.class)
  var.sums <- matrix(0, nrow=n.var, ncol=n.class)
  class.n <- numeric(n.class)
  for (g in first.class:last.class)
  {
    cases <- data[,1] == g
    class.n[g-first.class+1] <- sum(cases)
    classdata <- data[cases,]
    var.means[,g-first.class+1] <- apply(classdata[,-1],2,mean)
    var.sums[,g-first.class+1] <- apply(classdata[,-1],2,sum)
  }
  # var.means now contains the mean vectors for the
  # classes in its columns ... now I'll subtract the
  # grand mean from each column and put the result in
  # mean.deviations
  grand.mean <- apply(var.sums,1,sum)/sum(class.n)
  mean.deviations <- var.means - grand.mean
  # now I'll put the matrix given by (13.10) on p. 479
  # of HTF2, which is also the matrix B on p. 477 of
  # HTF2, in B
  B <- matrix(0, nrow=n.var, ncol=n.var)
  for (g in first.class:last.class)
  {
    col.ind <- g - first.class + 1
    B <- B + 
         class.n[col.ind]*cbind(mean.deviations[,col.ind])%*%mean.deviations[,col.ind]
  }
  B <- B/sum(class.n)
  B
}

DANN.Sig <- function( data, epsilon=1 )
{
  w.neg.half <- sqrt.inv.mat( pooled.within.cov( data ) )
  b <- between.cov( data )
  b.star <- w.neg.half%*%b%*%w.neg.half
  n.var <- length( data[1,] ) - 1
  sig <- w.neg.half%*%( b.star + diag(epsilon, n.var) )%*%w.neg.half
}

local.Sig <- function( target.point, data, num.neighbors=num.neighbors )
{
  # data should have class indicator in 1st column, 
  # and predictor variables in remaining columns.
  # target.point should just have coordinates of
  # target point.  Based on a comment in HFT2, the
  # default number of nearest neighbors used is 50.
  distances <- as.matrix(dist(rbind( target.point, data[,-1])))
  closest.training.points <- rank( distances[-1,1] ) < num.neighbors + 1
  loc.sig <- DANN.Sig( data[closest.training.points,] )
  loc.sig
}

# This function determines the distances given by 
# (13.8) on p. 477 of HTF2.

local.dist <- function( target.point, data, num.neighbors=num.neighbors )
{
  x.mat <- t( data[,-1] )  # class variable should 
                           # be in 1st col of data
  diffs <- x.mat - as.vector( t( target.point ) )
  distances <- diag( ( t( diffs )%*%local.Sig( target.point, data, num.neighbors ) )%*%diffs )
  distances
}

# This function determines predicted class of a target point.

pred.class <- function( target.point, data, num.neighbors=num.neighbors, k=k )
{
  dists <- local.dist( target.point, data, num.neighbors )
  near.neighbors <- rank( dists ) < k + 1
  first.class <- min( data[,1] )
  last.class <- max( data[,1] )
  num.classes <- last.class - first.class + 1
  classes <- numeric(num.classes)
  for (g in first.class:last.class )
    classes[g - first.class + 1] <- sum( data[near.neighbors,1] == g  )
  pred <- which.max( classes ) + first.class - 1
  pred
}

DANN <- function( train.data, test.data, num.neighbors=50, k=15 )
{
  num.cases <- length( test.data[,1] )
  predictions <- numeric(num.cases)
  for (i in 1:num.cases )
    predictions[i] <- pred.class( test.data[i,-1], train.data, num.neighbors=num.neighbors, k=k )
  predictions
}

pred <- DANN( trndat, gendat )
DANN.err <- mean(pred != gendat[,1])

DANN.err
# Horrible!

# I'll try adjusting things with DANN, since sometimes default settings can
# do poorly.

# Altogether I tried 47 additional settings with DANN.  The results are shown
# in the 2nd of the two tables below.  With some settings the results are
# better than ordinary nearest neighbors.  The problem is: how do we find 
# good values for the parameters?  (Cross-validation I suppose, but that'd
# be a lot of work.)

# It can be noted that the best DANN results (given below) occurred when
# the initial neighborhood was rather large.  So in this case DANN isn't 
# leading to improvement so much due to it's local nature. but rather, it
# seems more due to the variable weighting.  The best results are near 
# those I obtain using the variable weighting/selection scheme described
# and used below.


#################################################################################
#######################          Summary so far          ########################


#  LDA (all var's)                 0.310
#  LDA (stepwise var selection)    0.312   Var selection not effective
  
#  SVM (all var's (linear kernel)  0.299

#  QDA (all var's)                 0.268
#  QDA (stepwise var selection)    0.266   Var selection mildly effective (maybe)

#  KNN (all var's, best c-v k)     0.316
#  KNN (all var's, 2nd best c-v k) 0.322
#  KNN (global dim red (1-d))      0.413
#  KNN (global dim red (2-d))      0.309   Global dim red possibly effective
#  DANN (default settings)         0.512   
#  DANN (tuned settings)           0.281   DANN possibly effective

#################################################################################


#####################################################################
##################    Summary of DANN results    ####################

     
                                         num neighbors        
                                       for classification   
#                                      ------------------  

#                            3    7   15   21   23   25   30   60  
#
# num neighbors      25    479  478  
# used initially     50         502  512
#                    100        411  404                 397
#                    125                       345
#                    150        335  318       314       320  329
#                    175             307  299  292  295  301
#                    200             283  282  287  281  284
#                    225             288  283  286  281  289
#                    250             284  279  288  285  296
#                    275             284  290  290  290  292
#                    300             287            285
#                    325             284            287
#                    350        303  286            284
#                    400        296  297            287

#####################################################################


# Now I'll try a variation of Daniel Saxton's variable weighting scheme for
# nearest neighbors.

# I'll reset the random number seed so results can be easily reproduced if 
# one skips some of the previous computations.

set.seed(17)

# Letting MSB be the "between" MS term, and MSE be the "error" MS term, it 
# can be noted that if there are no differences between class means for a 
# predictor, MSB and MSE have the same expectation (equal to the error term
# variance).  If there are differences between the class means
#                             
#                            MSB - MSE
#
# is a measure if the amount of differences due to differences in means.
# Since this difference depends on the within-class differences in addition
# to the between class differences, we can divide by MSE to make it scale
# invariant.  So we can use 
#
#                      (MSB - MSE)/MSE  =  F - 1
#
# as weights.  (If F - 1 is negative, we can set weight to 0.)  Noise 
# variables should get no or little weight with this scheme.

# So that we can use the canned NN functions, I propose first scaling the
# data, and then multiplying each variable by 
#                              sqrt( F - 1 ).
# Then we can use R's knn function normally.  (Note: Cannot use R's kknn
# function since it automatically scales the predictors, and thus the effect
# of the weights would be washed out.)


# Step 1:  Get the weights from training data 
# (based on one-way ANOVA F statistics).

weights <- numeric(5)
for (i in 2:6) weights[i-1] <- max(0, summary( lm( trndat[,i] ~ trndat[,1] ) )$fstatistic[1] - 1)
weights <- weights/sum(weights)
weights
# Weights have been rescaled so that they sum to 1.
 

# Step 2: Apply the weights to training data.

scal.trn.vars <- scale(trndat[,-1], center=TRUE, scale=TRUE)
scal.trndat <- data.frame( cbind( trndat[,1], scal.trn.vars ) )
names(scal.trndat) <- names(trndat)

sqrt.wt <- sqrt(weights)
for (i in 2:6) scal.trndat[,i] <- scal.trndat[,i]*sqrt.wt[i-1]       
 

# Step 3: Apply the weights to generalization data.

scal.gen.vars <- scale(gendat[,-1], center=attr(scal.trn.vars,"scaled:center"),
                                    scale=attr(scal.trn.vars,"scaled:scale"))
scal.gendat <- data.frame( cbind(gendat[,1], scal.gen.vars ) )
names(scal.gendat) <- names(scal.trndat)

for (i in 2:6) scal.gendat[,i] <- scal.gendat[,i]*sqrt.wt[i-1]


# Step 4: Do the classification using knn function  
# (using k from original KNN trial (selected using 
# c-v with all of the variables)).

pred <- knn( scal.trndat[,-1], scal.gendat[,-1], cl=scal.trndat[,1], k=19 )
error.rate <-  mean( pred != scal.gendat[,1] )
error.rate
# Better than other KNN results, and LDA and SVM,
# but not QDA (but close to QDA).  IMPORTANT THING
# is that it's an improvement over ordinary KNN.