# Methods for Variable Selection / Weighting


# Developed using R 2.9.0.


library(MASS)
library(class)  
library(kknn)     # for kknn and train.kknn f'ns (for KNN)
library(klaR)     # for stepclass f'n (for stepwise LDA)
library(e1071)    # for svm f'n (for support vector machines)
library(mvtnorm)  # for rmvnorm and dmvnorm (for generation 
                  # of multivariate normal observations and
                  # computation of multivariate normal pdf 
                  # values)
    
set.seed(7)


# Training data: four classes, each drawn from a multivariate
# normal distribution over five predictors.  The covariance
# matrices are *similar* across classes.  Only the first two
# coordinates of the class means differ; the remaining three
# predictors are pure noise.

Sig <- diag(5)
Sig[cbind(c(1, 2), c(2, 1))] <- 0.6   # covariance of x1 and x2
Sig[cbind(c(3, 4), c(4, 3))] <- 0.4   # covariance of x3 and x4
Sig
# Sig is the covariance matrix for classes 1 (Orange) and 4 (Blue).
# The covariance matrix for classes 2 (Green) and 3 (Maroon) is similar:
# those draws are multiplied by 0.8, so the correlations are the same
# but everything is adjusted by a scale factor.

# Draw order (Orange, Green, Maroon, Blue) matters for reproducibility.
Oxvals <- rmvnorm(n = 100, mean = c(-1.5, -1.25, rep(0, 3)), sigma = Sig)        # Orange class
Gxvals <- rmvnorm(n = 100, mean = c(-0.75, -0.25, rep(0, 3)), sigma = Sig) * 0.8 # Green class
Mxvals <- rmvnorm(n = 100, mean = c(0.75, rep(0, 4)), sigma = Sig) * 0.8         # Maroon class
Bxvals <- rmvnorm(n = 100, mean = c(1.5, 1.75, rep(0, 3)), sigma = Sig)          # Blue class

# Scatterplot of the two informative predictors, one colour per class.
plot(x = Oxvals[, 1], y = Oxvals[, 2],
     col = "darkorange",
     main = "Training Data (nonnoise variables)",
     xlab = "x1", ylab = "x2",
     xlim = c(-4.5, 4.5), ylim = c(-4.5, 4.5))
points(x = Gxvals[, 1], y = Gxvals[, 2], col = "limegreen")
points(x = Mxvals[, 1], y = Mxvals[, 2], col = "maroon")
points(x = Bxvals[, 1], y = Bxvals[, 2], col = "royalblue")

# There's quite a bit of overlap.

#####

# I'll give response value of 1 for Orange class observations, 
# 2 for Green class observations, 3 for the Maroon class
# observations, and 4 for the Blue class observations, and then 
# combine them together to form the training data.  Then I'll 
# generate 250 observations for each class to serve as the 
# generalization sample.

# Training set: prepend the class code (1 = Orange, 2 = Green,
# 3 = Maroon, 4 = Blue) to each class's predictor matrix, then
# stack the four classes into one data frame.
Oclass <- cbind(rep(1, nrow(Oxvals)), Oxvals)
Gclass <- cbind(rep(2, nrow(Gxvals)), Gxvals)
Mclass <- cbind(rep(3, nrow(Mxvals)), Mxvals)
Bclass <- cbind(rep(4, nrow(Bxvals)), Bxvals)
trndat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass))
names(trndat) <- c("g", paste("x", 1:5, sep = ""))

# Generalization sample: 250 fresh draws per class from the same
# distributions (same draw order as for the training data).
Oxvals <- rmvnorm(n = 250, mean = c(-1.5, -1.25, rep(0, 3)), sigma = Sig)        # Orange class
Gxvals <- rmvnorm(n = 250, mean = c(-0.75, -0.25, rep(0, 3)), sigma = Sig) * 0.8 # Green class
Mxvals <- rmvnorm(n = 250, mean = c(0.75, rep(0, 4)), sigma = Sig) * 0.8         # Maroon class
Bxvals <- rmvnorm(n = 250, mean = c(1.5, 1.75, rep(0, 3)), sigma = Sig)          # Blue class

Oclass <- cbind(rep(1, nrow(Oxvals)), Oxvals)
Gclass <- cbind(rep(2, nrow(Gxvals)), Gxvals)
Mclass <- cbind(rep(3, nrow(Mxvals)), Mxvals)
Bclass <- cbind(rep(4, nrow(Bxvals)), Bxvals)
gendat <- data.frame(rbind(Oclass, Gclass, Mclass, Bclass))
names(gendat) <- c("g", paste("x", 1:5, sep = ""))

#####

# Before trying the method from subsection 13.4.2 of HTF2
# I'll try LDA using all of the variables, and then step-
# wise LDA.

# LDA fitted on all five predictors; the rate is the proportion of
# generalization cases whose predicted class differs from the true one.
lda.all <- lda(g ~ ., data = trndat)
pred.lda.all <- predict(lda.all, newdata = gendat)
lda.all.rate <- mean(pred.lda.all[["class"]] != gendat$g)

lda.all.rate
# LDA rate (all predictors used).

# Stepwise variable selection for LDA.
ldafit <- stepclass(g ~ ., data = trndat, method = "lda")
ldafit

# Stepwise LDA (correctly) selects only x1 and x2.

# Refit LDA on the two selected predictors and score it the same way.
lda.step <- lda(g ~ x1 + x2, data = trndat)
pred.lda.step <- predict(lda.step, newdata = gendat)
lda.step.rate <- mean(pred.lda.step[["class"]] != gendat$g)

lda.step.rate
# LDA rate (using the predictors chosen by the stepwise procedure).
# A slight improvement (9 more correct out of 1000).  The
# stepwise procedure was effective, but the noise variables
# don't create a big problem if stepwise isn't used.

##### 

# I'll now try SVM using the linear kernel.

# Linear-kernel SVM on all predictors, with a 10-fold
# cross-validation estimate reported by summary().
svm.lin <- svm(as.factor(g) ~ ., trndat, cross = 10, kernel = "linear")
summary(svm.lin)

# The c-v estimate isn't promising, but I'll check the
# performance on the generalization set anyway.

mean(predict(svm.lin, gendat) != as.factor(gendat$g))

# Better than the c-v indicated it would be.
# Not too much worse than LDA result.

# Fitted classification regions shown in the (x1, x2) plane.
plot(svm.lin, trndat, x2 ~ x1)

# Could it be that the linear kernel leads to a fit
# similar to the LDA fit having linear boundaries?

#####

# Now I'll try ordinary nearest neighbors, using 
# train.kknn to select a good value of k.

# Cross-validated misclassification rates for k = 1, ..., 150
# (unweighted neighbors, Euclidean distance).
cvknn <- train.kknn(as.factor(g) ~ ., trndat,
                    kmax = 150, kernel = "rectangular", distance = 2)

plot(1:150, cvknn[["MISCLASS"]],
     type = "l",
     xlab = "k",
     ylab = "proportion misclassified",
     main = "Estimated Error Rates",
     ylim = c(0.33, 0.55),
     col = "tomato")

which.min(cvknn[["MISCLASS"]])

# I'll go with k = 17.

# Classify the generalization cases with 17 nearest neighbors.
pred.gen.knn <- kknn(as.factor(g) ~ ., train = trndat, test = gendat,
                     k = 17, kernel = "rectangular")
gen.err.knn <- mean(pred.gen.knn[["fit"]] != gendat$g)

gen.err.knn
# Error rate for 17 nearest neighbors.
# Worse than LDA results.

#####

# Now I'll try the method from subsection 13.4.2 of HTF2.

GlobDimRed <- function(traindata)
# Global dimension reduction for nearest neighbors
# (subsection 13.4.2 of HTF2).
#
# traindata: data frame or matrix.
#   1st col. should be the class variable (classes specified
#   using integers; labels between the smallest and largest
#   that have no cases are skipped).
#   Remaining col's should be the predictor var's.
#
# Returns the eigen() decomposition (components $values and
# $vectors) of the between-class covariance matrix B.
{
  class.labels <- traindata[, 1]
  # drop = FALSE keeps the predictors two-dimensional even when
  # there is a single predictor or a class with a single case
  predictors <- traindata[, -1, drop = FALSE]
  first.class <- min(class.labels)
  last.class <- max(class.labels)
  n.class <- last.class - first.class + 1
  n.var <- ncol(predictors)
  var.means <- matrix(0, nrow = n.var, ncol = n.class)
  var.sums <- matrix(0, nrow = n.var, ncol = n.class)
  class.n <- numeric(n.class)
  for (g in first.class:last.class)
  {
    col.ind <- g - first.class + 1
    cases <- class.labels == g
    class.n[col.ind] <- sum(cases)
    # A label with no cases would give NaN means, and since
    # 0 * NaN is NaN that would turn all of B into NaN; leave
    # its (zero-weight) columns at 0 instead.
    if (class.n[col.ind] == 0) next
    classdata <- predictors[cases, , drop = FALSE]
    var.means[, col.ind] <- apply(classdata, 2, mean)
    var.sums[, col.ind] <- apply(classdata, 2, sum)
  }
  # var.means now contains the mean vectors for the
  # classes in its columns ... now I'll subtract the
  # grand mean from each column and put the result in
  # mean.deviations
  grand.mean <- apply(var.sums, 1, sum) / sum(class.n)
  mean.deviations <- var.means - grand.mean
  # now I'll put the matrix given by (13.10) on p. 479
  # of HTF2, which is also the matrix B on p. 477 of
  # HTF2, in B
  B <- matrix(0, nrow = n.var, ncol = n.var)
  for (col.ind in seq_len(n.class))
  {
    # class-size-weighted outer product of the mean deviation
    B <- B + class.n[col.ind] * tcrossprod(mean.deviations[, col.ind])
  }
  B <- B / sum(class.n)
  # return the eigenvalues and eigenvectors of B (visibly)
  eigen(B)
}

dim.red <- GlobDimRed(trndat)
dim.red

# Compared to the first three eigenvalues, the other
# two are extremely small, and the 2nd is much smaller 
# than the first, and the 3rd is much smaller than
# the 2nd.  This suggests that a good approximating 
# subspace may be the one-dimensional subspace spanned by 
# the 1st eigenvector, or possibly a two-dimensional subspace
# spanned by the first two eigenvectors.

#####

# I'll create new data sets having predictors created 
# from the eigenvectors.  If just the first predictor
# is used, it'll be as though we're using a predictor
# obtained from projecting the original predictors into
# the one-dimensional subspace spanned by the first
# eigenvector.

# (The eigenvector matrix is extracted with its full name, $vectors,
# rather than relying on partial matching of $vector.)
new.trndat <- data.frame(cbind(trndat[,1], as.matrix(trndat[,-1])%*%dim.red$vectors))
names(new.trndat) <- c("g","ev1","ev2","ev3","ev4","ev5")

new.gendat <- data.frame(cbind(gendat[,1], as.matrix(gendat[,-1])%*%dim.red$vectors))
names(new.gendat) <- c("g","ev1","ev2","ev3","ev4","ev5")

# I'll first try the best one-dimensional approximating
# subspace, using train.kknn to identify a good choice 
# of k to use for classifying the test set cases using 
# the training data.
#
# (NOTE: If we wanted to use a subspace of dimension
# greater than 1, train.kknn may not be good to use
# since it automatically scales the predictors, which 
# would give too much influence to the less important
# predictors.  I.e., I think it'll work better if we
# can give the first eigenvector the most influence in 
# determining the nearest neighbors, but if the predictors 
# are scaled this won't be the case.) 

# Cross-validated error rates for k = 1, ..., 150 using only the
# class variable and the first new predictor (ev1).
new.cvknn <- train.kknn(as.factor(g) ~ ., new.trndat[, c("g", "ev1")],
                        kmax = 150, kernel = "rectangular", distance = 2)

plot(1:150, new.cvknn[["MISCLASS"]],
     type = "l",
     xlab = "k",
     ylab = "proportion misclassified",
     main = "Estimated Error Rates",
     ylim = c(0.35, 0.55),
     col = "maroon")

which.min(new.cvknn[["MISCLASS"]])

# Let's see what the error rate is if we use 
# 16 nearest neighbors in the one-dimensional
# subspace.

pred.gen.new <- kknn(as.factor(g) ~ .,
                     train = new.trndat[, c("g", "ev1")],
                     test = new.gendat[, c("g", "ev1")],
                     k = 16, kernel = "rectangular")
gen.err.new <- mean(pred.gen.new[["fit"]] != new.gendat$g)

gen.err.new
# Error rate for 16 nearest neighbors (in 1-d subspace).
# Not very good.

#####

# Now I'll try using the best approximating two-dimensional
# subspace.  From below, it can be noted that the sample 
# variances of the new variables aren't all the same (even 
# though each eigenvector is of length 1).

# Sample variance of each of the new (eigenvector-based) predictors.
apply(new.trndat[, -1], MARGIN = 2, FUN = var)

# I won't use train.kknn to select a value of k
# because scaling the predictors would give the one based
# on the second eigenvector too much influence.  Instead I'll
# just keep k at 16 (thinking that the 2nd predictor won't
# change things that much (but hoping that it changes things 
# a bit for the better)).  I'll use the knn function instead 
# of the kknn function because knn doesn't scale the predictors.

pred.gen.newest <- knn(train = new.trndat[, c("ev1", "ev2")],
                       test = new.gendat[, c("ev1", "ev2")],
                       cl = new.trndat$g,
                       k = 16)
gen.err.newest <- mean(pred.gen.newest != new.gendat$g)

gen.err.newest
# Error rate for 16 nearest neighbors (in 2-d subspace).
# Better than 1-d result, and better than what was obtained
# using all five original predictors.  But not as good as 
# LDA results.

#####

# Now for the DANN method (see p. 477 of HTF2).

# Here is a function to find square root of 
# the inverse of a matrix.

sqrt.inv.mat <- function( mat )  # won't work for all matrices
# Returns the square root of the inverse of mat, built from the
# eigen decomposition of mat as Q diag(1/sqrt(lambda)) Q'.
# mat should be a symmetric matrix with positive eigenvalues
# (a zero or negative eigenvalue gives Inf or NaN entries).
{
  eigen.mat <- eigen( mat )
  q <- eigen.mat$vectors
  # nrow is given explicitly: diag() applied to a single number
  # builds an identity-type matrix of that size instead of a 1 x 1
  # matrix holding the number, which broke the 1 x 1 case.
  inv.sqrt.lamb <- diag( sqrt( 1/eigen.mat$values ),
                         nrow=length( eigen.mat$values ) )
  ( q%*%inv.sqrt.lamb )%*%t(q)
}

# Here is a function to find the pooled
# within-class covariance matrix (see p. 477
# of HTF2).  First column of data matrix 
# should have groups identified with consecutive
# integers.

pooled.within.cov <- function( data )
# Pooled within-class covariance matrix: the sum over classes of
# (proportion of cases in the class) * (sample covariance matrix
# of the class's predictors).
#
# data: data frame or matrix with the class variable (integers)
#   in the 1st column and the predictors in the remaining columns.
#
# Classes with fewer than 2 cases have no sample covariance and
# contribute nothing.  If no class has at least 2 cases the
# result is the scalar 0.
{
  class.labels <- data[,1]
  n.total <- length( class.labels )
  var.cov <- 0
  for ( g in min( class.labels ):max( class.labels ) )
  {
    cases <- class.labels == g
    n.g <- sum( cases )
    if ( n.g < 2 ) next
    # drop=FALSE keeps a single predictor as a one-column object,
    # which cov() needs in order to return a 1 x 1 matrix
    var.cov <- var.cov + ( n.g/n.total )*cov( data[cases,-1,drop=FALSE] )
  }
  var.cov
}

# Here are a pair of functions that determine the 
# Sigma given by (13.9) on p. 477 of HTF2.  (Based 
# on comment in HTF2, the default value of epsilon 
# is 1.)  

between.cov <- function(data)
# Between-class covariance matrix B (p. 477 of HTF2; same matrix
# as (13.10) on p. 479).
#
# data: data frame or matrix.
#   1st col. should be the class variable (classes specified
#   using integers; labels between the smallest and largest
#   that have no cases are skipped).
#   Remaining col's should be the predictor var's.
#
# Returns an (n.var x n.var) matrix.
{
  class.labels <- data[, 1]
  # drop = FALSE keeps the predictors two-dimensional even when
  # there is a single predictor or a class with a single case
  predictors <- data[, -1, drop = FALSE]
  first.class <- min(class.labels)
  last.class <- max(class.labels)
  n.class <- last.class - first.class + 1
  n.var <- ncol(predictors)
  var.means <- matrix(0, nrow = n.var, ncol = n.class)
  var.sums <- matrix(0, nrow = n.var, ncol = n.class)
  class.n <- numeric(n.class)
  for (g in first.class:last.class)
  {
    col.ind <- g - first.class + 1
    cases <- class.labels == g
    class.n[col.ind] <- sum(cases)
    # A label with no cases (which can easily happen in a small
    # neighborhood) would give NaN means, and since 0 * NaN is NaN
    # that would turn all of B into NaN; leave its (zero-weight)
    # columns at 0 instead.
    if (class.n[col.ind] == 0) next
    classdata <- predictors[cases, , drop = FALSE]
    var.means[, col.ind] <- apply(classdata, 2, mean)
    var.sums[, col.ind] <- apply(classdata, 2, sum)
  }
  # var.means now contains the mean vectors for the
  # classes in its columns ... now I'll subtract the
  # grand mean from each column and put the result in
  # mean.deviations
  grand.mean <- apply(var.sums, 1, sum) / sum(class.n)
  mean.deviations <- var.means - grand.mean
  # now I'll put the matrix given by (13.10) on p. 479
  # of HTF2, which is also the matrix B on p. 477 of
  # HTF2, in B
  B <- matrix(0, nrow = n.var, ncol = n.var)
  for (col.ind in seq_len(n.class))
  {
    # class-size-weighted outer product of the mean deviation
    B <- B + class.n[col.ind] * tcrossprod(mean.deviations[, col.ind])
  }
  B / sum(class.n)
}

DANN.Sig <- function( data, epsilon=1 )
# The matrix Sigma given by (13.9) on p. 477 of HTF2:
#   W^(-1/2) [ W^(-1/2) B W^(-1/2) + epsilon*I ] W^(-1/2)
# with W the pooled within-class covariance matrix of data and
# B its between-class covariance matrix.
#
# data: class variable in the 1st column, predictors in the rest.
# epsilon: multiplier of the identity matrix in (13.9).
#
# The result is returned visibly, like the other helper functions.
{
  w.neg.half <- sqrt.inv.mat( pooled.within.cov( data ) )
  b.star <- w.neg.half%*%between.cov( data )%*%w.neg.half
  n.var <- ncol( data ) - 1
  w.neg.half%*%( b.star + diag( epsilon, n.var ) )%*%w.neg.half
}

local.Sig <- function( target.point, data, num.neighbors=50 )
{
  # data should have class indicator in 1st column, 
  # and predictor variables in remaining columns.
  # target.point should just have coordinates of
  # target point.  Based on a comment in HTF2, the
  # default number of nearest neighbors used is 50.
  # (The default used to be written num.neighbors=num.neighbors,
  # which refers to itself and fails as soon as it is needed.)
  #
  # Returns DANN.Sig() computed from the num.neighbors training
  # points closest (in Euclidean distance) to target.point.
  x.mat <- as.matrix( data[,-1,drop=FALSE] )
  target <- as.numeric( unlist( target.point ) )
  # Only the distances from the target to each training point
  # are needed, so they are computed directly rather than via
  # the full pairwise distance matrix.  Squared distances are
  # enough because only their ranks are used.
  sq.distances <- colSums( ( t( x.mat ) - target )^2 )
  closest.training.points <- rank( sq.distances ) < num.neighbors + 1
  DANN.Sig( data[closest.training.points,] )
}

# This function determines the distances given by 
# (13.8) on p. 477 of HTF2.

local.dist <- function( target.point, data, num.neighbors=50 )
# Returns, for every row of data, the quadratic form
#   (x - x0)' Sigma (x - x0)
# where x0 is target.point and Sigma is local.Sig() evaluated at
# target.point.
#
# data: class variable should be in 1st col, predictors in the rest.
# num.neighbors: passed on to local.Sig(); defaults to 50.  (The
#   default used to be written num.neighbors=num.neighbors, which
#   refers to itself and fails as soon as it is needed.)
{
  x.mat <- t( data[,-1,drop=FALSE] )   # predictors in rows, cases in columns
  diffs <- x.mat - as.vector( t( target.point ) )
  sig <- local.Sig( target.point, data, num.neighbors )
  # Column i of diffs is (x_i - x0), so the i-th distance is the
  # sum over rows of diffs * (sig %*% diffs).  This avoids forming
  # the full cases-by-cases product just to read off its diagonal.
  colSums( diffs*( sig%*%diffs ) )
}

# This function determines predicted class of a target point.

pred.class <- function( target.point, data, num.neighbors=50, k=15 )
# Predicted class of a target point: majority vote among the k
# training points with the smallest local.dist() values.  If
# several classes tie for the most votes, the smallest class
# label among them is returned.
#
# data: class variable (integers) in 1st col, predictors in the rest.
# num.neighbors: passed on to local.dist(); defaults to 50.
# k: number of voting neighbors; defaults to 15.
# (The defaults match those of DANN().  They used to be written
# num.neighbors=num.neighbors and k=k, which refer to themselves
# and fail as soon as they are needed.)
{
  dists <- local.dist( target.point, data, num.neighbors )
  near.neighbors <- rank( dists ) < k + 1
  neighbor.classes <- data[near.neighbors,1]
  first.class <- min( data[,1] )
  last.class <- max( data[,1] )
  votes <- numeric( last.class - first.class + 1 )
  for ( g in first.class:last.class )
    votes[g - first.class + 1] <- sum( neighbor.classes == g )
  # convert the position of the winning count back to a class label
  which.max( votes ) + first.class - 1
}

DANN <- function( train.data, test.data, num.neighbors=50, k=15 )
# Classifies every row of test.data with pred.class(), using
# train.data as the training set.
#
# train.data, test.data: class variable in 1st col, predictors in
#   the rest (the class column of test.data is not used).
# num.neighbors, k: passed on to pred.class().
#
# Returns a numeric vector with one predicted class per test row
# (length 0 if test.data has no rows).
{
  num.cases <- nrow( test.data )
  predictions <- numeric( num.cases )
  # seq_len() rather than 1:num.cases, so that an empty test set
  # gives no iterations instead of the indices 1 and 0
  for ( i in seq_len( num.cases ) )
    predictions[i] <- pred.class( test.data[i,-1], train.data, num.neighbors=num.neighbors, k=k )
  predictions
}

# Run DANN with its default settings and display the proportion
# of generalization cases that are misclassified.
pred <- DANN(train.data = trndat, test.data = gendat)
(DANN.err <- mean(pred != gendat$g))

# error rate for DANN method
# Better than other nearest neighbors results,
# but not as good as LDA and SVM results.



##########################################################################
####################          Summary so far          ####################


#   LDA (all var's)                0.321
#   LDA (stepwise var selection)   0.312   Var selection mildly effective
  
#   SVM (all var's, linear kernel) 0.325

#   KNN (all var's)                0.388
#   KNN (global dim red (1-d))     0.424
#   KNN (global dim red (2-d))     0.356   Global dim red effective
#   DANN                           0.347   DANN effective

##########################################################################



# Now I'll try a variation of Daniel Saxton's variable weighting scheme for
# nearest neighbors.

# I'll reset the random number seed so results can be easily reproduced if 
# one skips some of the previous computations.

set.seed(17)

# Letting MSB be the "between" MS term, and MSE be the "error" MS term, it 
# can be noted that if there are no differences between class means for a 
# predictor, MSB and MSE have the same expectation (equal to the error term
# variance).  If there are differences between the class means
#                             
#                            MSB - MSE
#
# is a measure of the amount of differences due to differences in means.
# Since this difference depends on the within-class differences in addition
# to the between class differences, we can divide by MSE to make it scale
# invariant.  So we can use 
#
#                      (MSB - MSE)/MSE  =  F - 1
#
# as weights.  (If F - 1 is negative, we can set weight to 0.)  Noise 
# variables should get no or little weight with this scheme.

# So that we can use the canned NN functions, I propose first scaling the
# data, and then multiplying each variable by 
#                              sqrt( F - 1 ).
# Then we can use R's knn function normally.  (Note: Cannot use R's kknn
# function since it automatically scales the predictors, and thus the effect
# of the weights would be washed out.)


# Step 1:  Get the weights from training data 
# (based on one-way ANOVA F statistics).
#
# The class variable is converted to a factor so that lm() fits one
# mean per class and its F statistic is the one-way ANOVA F = MSB/MSE.
# (With the numeric class codes, lm() fits a straight line in the
# class code instead, and F only measures a linear trend.)
# NOTE: error rates quoted in comments elsewhere in this file were
# obtained with the numeric class codes; re-run to refresh them.

weights <- numeric(ncol(trndat) - 1)
for (i in seq_along(weights))
{
  weights[i] <- max(0,
                    summary( lm( trndat[,i+1] ~ as.factor(trndat[,1]) ) )$fstatistic[1] - 1)
}
weights <- weights/sum(weights)
weights
# Weights have been rescaled so that they sum to 1.
 

# Step 2: Apply the weights to training data.

# Standardize the training predictors (scal.trn.vars keeps the
# centering and scaling constants as attributes), then multiply
# each standardized column by the square root of its weight.
scal.trn.vars <- scale(trndat[, -1], center = TRUE, scale = TRUE)
sqrt.wt <- sqrt(weights)

scal.trndat <- data.frame(cbind(trndat[, 1],
                                sweep(scal.trn.vars, 2, sqrt.wt, "*")))
names(scal.trndat) <- names(trndat)
 

# Step 3: Apply the weights to generalization data.

# Standardize the generalization predictors with the centering and
# scaling constants stored on scal.trn.vars, then apply the same
# square-root weights column by column.
scal.gen.vars <- scale(gendat[, -1],
                       center = attr(scal.trn.vars, "scaled:center"),
                       scale = attr(scal.trn.vars, "scaled:scale"))

scal.gendat <- data.frame(cbind(gendat[, 1],
                                sweep(scal.gen.vars, 2, sqrt.wt, "*")))
names(scal.gendat) <- names(scal.trndat)


# Step 4: Do the classification using knn function  
# (using k from original KNN trial (selected using 
# c-v with all of the variables)).

# 17-nearest-neighbor classification on the weighted, standardized
# predictors; display the proportion misclassified.
pred <- knn(train = scal.trndat[, -1], test = scal.gendat[, -1],
            cl = scal.trndat$g, k = 17)
(error.rate <- mean(pred != scal.gendat$g))
# Better than other KNN results, but not LDA or SVM. 
# (Improvement over ordinary KNN: 0.336 vs. 0.388.)