# Classification with gene expression data (4 classes, 
# 2308 predictors) referred to in Sec. 18.2 of HTF2.
      
# Developed with vesion 2.9.0.

library(class)
library(MASS)
library(e1071) 
library(kknn)
library(klaR)
library(sma)
library(pamr)

set.seed(7)

traindat.b <- read.table("http://mason.gmu.edu/~csutton/getrainb.txt",sep='',header=F)
traindat.a <- read.table("http://mason.gmu.edu/~csutton/getraina.txt",sep='',header=F)
trndat <- data.frame( cbind( t(traindat.b), t(traindat.a) ) )
testdat.b <- read.table("http://mason.gmu.edu/~csutton/getestb.txt",sep='',header=F)
testdat.a <- read.table("http://mason.gmu.edu/~csutton/getesta.txt",sep='',header=F)
tstdat <- data.frame( cbind( t(testdat.b), t(testdat.a) ) )
tstdat <- tstdat[-c(3,5,9,11,13),]  # test data had missing response values for 5 cases

# I'll use the stat.diag.da function from the sma library.
# I think it'll do the diagonal LDA classification as des-
# cribed in Sec. 18.2 of HTF2 (which some refer to as DDA).

pooled.dda <- stat.diag.da( trndat[,-1], trndat[,1], tstdat[,-1], pool=1)
err.pooled.dda <- mean( pooled.dda$pred != tstdat[,1] )

err.pooled.dda
# error rate for DDA (using pooled estimates of standard deviations)
# (The 25% error rate matches what is given on p. 652 of HTF2.)
 
#####

# The stat.diag.da f'n also allows one to not use pooled
# estimates of the standard deviations and instead use
# sample standard deviation computed from each class separately.
# (To get this version, set the value of the pool argument to 0.) 
# Since the 63 training sample cases are split among 4 classes,
# it could be that the sample sizes are too small for this to work 
# well.

unpooled.dda <- stat.diag.da( trndat[,-1], trndat[,1], tstdat[,-1], pool=0)
err.unpooled.dda <- mean( unpooled.dda$pred != tstdat[,1] )

err.unpooled.dda
# error rate for DDA (not pooling to estimate standard deviations)
# (Not good!)
 
#####

# Next I'll try the nearest shrunken centroids (NSC) method of Sec. 18.2.
# The function pamr.train in the library pamr was written by Hastie,
# Tibshirani, and others to train a NSC classifier.  (Note: The required
# form for the data is not what I'm used to!  The cases go down columns,
# and the variables across rows.)

data4pamr <- list( x=t(trndat[,-1]), y=factor(trndat[,1]) )

results.pamr.train <- pamr.train( data4pamr, n.threshold=50 )

# Note: Not sure why the stuff above is there.

results.pamr.train

# Note from above that the largest threshold value that gave 
# zero training set errors is 4.495, which results in only 
# 37 predictors surviving the threshold.  The caption of Fig.
# 18.4 on p. 655 of HTF2 indicates that a threshold value of 
# 4.34 is selected by c-v (and this results in 43 selected
# predictors).

# Here is a plot of the estimated error rates for the different
# threshold values.  I guess these should match the green points
# in the top plot of Fig. 18.4 on p. 655 of HTF2 (and they seem to).

plot(results.pamr.train$threshold,results.pamr.train$error/63,
       xlab="threshold",ylab="error rate",col="limegreen")

#####

# Now I'll use pamr.cv to get c-v estimates of the error rate
# for each threshold value.

results.pamr.cv <- pamr.cv( results.pamr.train, data4pamr )

results.pamr.cv

points(results.pamr.cv$threshold,results.pamr.cv$error,
         col="darkorange")

# Plot slightly different from orange points in Fig. 18.4 of HTF2
# due to different random numbers (resulting in different folds).
# But as in HTF2, c-v indicates a threshold of 4.34 (which corre-
# sponds to selecting 43 predictors) is best (since among threshold
# values resulting in 0 errors, the largest such threshold eliminates
# more predictors).

#####

# Now I'll use pamr.predict to get predictions for the test set (using
# the threshold value indicated by c-v).

pred.pamr <- pamr.predict(results.pamr.train, t(tstdat[,-1]) , threshold=4.34)
err.nsc <- mean( pred.pamr != tstdat[,1] )

err.nsc
# error rate for NSC
# Great!!!  No prediction errors.

#####

# There are too many variables (or too few cases) to do LDA
# using all of the predictors, but we can try stepwise LDA.

# *** since stepclass took a long time to run I'll comment it out and
# just report that the 1646th and 108th predictors were selected, and
# the c-v estimate of the error rate was about 9 percent.
# ldafit <- stepclass(x=trndat[,-1], grouping=trndat[,1], method="lda")

lda.step <- lda(x=trndat[,c(1647,109)], grouping=trndat[,1])
pred.lda.step <- predict(lda.step, newdata=tstdat[,c(1647,109)])
lda.step.rate <- mean( pred.lda.step$class != tstdat[,1] )

lda.step.rate
# The c-v estimate of the error rate was optimistic.  
# 6 out of 20 test cases were misclassified.

# I'll try qda using the same two predictors.

qda.result <- qda(x=trndat[,c(1647,109)], grouping=trndat[,1])
pred.qda <- predict(qda.result, newdata=tstdat[,c(1647,109)])
qda.rate <- mean( pred.qda$class != tstdat[,1] )

qda.rate
# A bit better (only misses 5 out of 20).

# Here's a plot of the data (using the two variables identified 
# by stepwise LDA).

plot( trndat[,1647], trndat[,109], col=trndat[,1] )

# It may be that using more variables would be better.
# Simple boundaries won't separate the classes well.

#####

# 1nn is a possibility.

onenn.result <- knn(trndat[,c(1647,109)], tstdat[,c(1647,109)], cl=trndat[,1], k=1)
onenn.rate <- mean( onenn.result != tstdat[,1] )

onenn.rate 
# Not good.

#####  End  (for now)  #####