# Classification with gene expression data (4 classes, # 2308 predictors) referred to in Sec. 18.2 of HTF2. # Developed with vesion 2.9.0. library(class) library(MASS) library(e1071) library(kknn) library(klaR) library(sma) library(pamr) set.seed(7) traindat.b <- read.table("http://mason.gmu.edu/~csutton/getrainb.txt",sep='',header=F) traindat.a <- read.table("http://mason.gmu.edu/~csutton/getraina.txt",sep='',header=F) trndat <- data.frame( cbind( t(traindat.b), t(traindat.a) ) ) testdat.b <- read.table("http://mason.gmu.edu/~csutton/getestb.txt",sep='',header=F) testdat.a <- read.table("http://mason.gmu.edu/~csutton/getesta.txt",sep='',header=F) tstdat <- data.frame( cbind( t(testdat.b), t(testdat.a) ) ) tstdat <- tstdat[-c(3,5,9,11,13),] # test data had missing response values for 5 cases # I'll use the stat.diag.da function from the sma library. # I think it'll do the diagonal LDA classification as des- # cribed in Sec. 18.2 of HTF2 (which some refer to as DDA). pooled.dda <- stat.diag.da( trndat[,-1], trndat[,1], tstdat[,-1], pool=1) err.pooled.dda <- mean( pooled.dda$pred != tstdat[,1] ) err.pooled.dda # error rate for DDA (using pooled estimates of standard deviations) # (The 25% error rate matches what is given on p. 652 of HTF2.) ##### # The stat.diag.da f'n also allows one to not use pooled # estimates of the standard deviations and instead use # sample standard deviation computed from each class separately. # (To get this version, set the value of the pool argument to 0.) # Since the 63 training sample cases are split among 4 classes, # it could be that the sample sizes are too small for this to work # well. unpooled.dda <- stat.diag.da( trndat[,-1], trndat[,1], tstdat[,-1], pool=0) err.unpooled.dda <- mean( unpooled.dda$pred != tstdat[,1] ) err.unpooled.dda # error rate for DDA (not pooling to estimate standard deviations) # (Not good!) ##### # Next I'll try the nearest shrunken centroids (NSC) method of Sec. 18.2. # The function pamr.train in the library pamr was written by Hastie, # Tibshirani, and others to train a NSC classifier. (Note: The required # form for the data is not what I'm used to! The cases go down columns, # and the variables across rows.) data4pamr <- list( x=t(trndat[,-1]), y=factor(trndat[,1]) ) results.pamr.train <- pamr.train( data4pamr, n.threshold=50 ) # Note: Not sure why the stuff above is there. results.pamr.train # Note from above that the largest threshold value that gave # zero training set errors is 4.495, which results in only # 37 predictors surviving the threshold. The caption of Fig. # 18.4 on p. 655 of HTF2 indicates that a threshold value of # 4.34 is selected by c-v (and this results in 43 selected # predictors). # Here is a plot of the estimated error rates for the different # threshold values. I guess these should match the green points # in the top plot of Fig. 18.4 on p. 655 of HTF2 (and they seem to). plot(results.pamr.train$threshold,results.pamr.train$error/63, xlab="threshold",ylab="error rate",col="limegreen") ##### # Now I'll use pamr.cv to get c-v estimates of the error rate # for each threshold value. results.pamr.cv <- pamr.cv( results.pamr.train, data4pamr ) results.pamr.cv points(results.pamr.cv$threshold,results.pamr.cv$error, col="darkorange") # Plot slightly different from orange points in Fig. 18.4 of HTF2 # due to different random numbers (resulting in different folds). # But as in HTF2, c-v indicates a threshold of 4.34 (which corre- # sponds to selecting 43 predictors) is best (since among threshold # values resulting in 0 errors, the largest such threshold eliminates # more predictors). ##### # Now I'll use pamr.predict to get predictions for the test set (using # the threshold value indicated by c-v). pred.pamr <- pamr.predict(results.pamr.train, t(tstdat[,-1]) , threshold=4.34) err.nsc <- mean( pred.pamr != tstdat[,1] ) err.nsc # error rate for NSC # Great!!! No prediction errors. ##### # There are too many variables (or too few cases) to do LDA # using all of the predictors, but we can try stepwise LDA. # *** since stepclass took a long time to run I'll comment it out and # just report that the 1646th and 108th predictors were selected, and # the c-v estimate of the error rate was about 9 percent. # ldafit <- stepclass(x=trndat[,-1], grouping=trndat[,1], method="lda") lda.step <- lda(x=trndat[,c(1647,109)], grouping=trndat[,1]) pred.lda.step <- predict(lda.step, newdata=tstdat[,c(1647,109)]) lda.step.rate <- mean( pred.lda.step$class != tstdat[,1] ) lda.step.rate # The c-v estimate of the error rate was optimistic. # 6 out of 20 test cases were misclassified. # I'll try qda using the same two predictors. qda.result <- qda(x=trndat[,c(1647,109)], grouping=trndat[,1]) pred.qda <- predict(qda.result, newdata=tstdat[,c(1647,109)]) qda.rate <- mean( pred.qda$class != tstdat[,1] ) qda.rate # A bit better (only misses 5 out of 20). # Here's a plot of the data (using the two variables identified # by stepwise LDA). plot( trndat[,1647], trndat[,109], col=trndat[,1] ) # It may be that using more variables would be better. # Simple boundaries won't separate the classes well. ##### # 1nn is a possibility. onenn.result <- knn(trndat[,c(1647,109)], tstdat[,c(1647,109)], cl=trndat[,1], k=1) onenn.rate <- mean( onenn.result != tstdat[,1] ) onenn.rate # Not good. ##### End (for now) #####