# Part of Exercise 13.7 on p. 483 of HTF2 # ... Dealing with the "Easy" Problem # The first step of the exercise is to reproduce the # results in the left panel of Fig. 13.5 on p. 469. # Do do this, I first need to create the training and # test sets. I'll load some libraries that may be useful, # set the random number seed, and then create the data sets. library(MASS) library(class) library(kknn) set.seed(12) # creation of training and test data n.trn <- 100 # can easily change sample size # (should be divisible by 5 (due # to use of 5fold c-v later)) n.tst <- 1000 # can easily change sample size x1 <- runif(n.trn) x2 <- runif(n.trn) x3 <- runif(n.trn) x4 <- runif(n.trn) x5 <- runif(n.trn) x6 <- runif(n.trn) x7 <- runif(n.trn) x8 <- runif(n.trn) x9 <- runif(n.trn) x10 <- runif(n.trn) y <- numeric(n.trn) y <- ifelse( x1 > 0.5, 1, 0 ) trndat <- data.frame(cbind(y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10)) x1 <- runif(n.tst) x2 <- runif(n.tst) x3 <- runif(n.tst) x4 <- runif(n.tst) x5 <- runif(n.tst) x6 <- runif(n.tst) x7 <- runif(n.tst) x8 <- runif(n.tst) x9 <- runif(n.tst) x10 <- runif(n.tst) y <- numeric() y <- ifelse( x1 > 0.5, 1, 0 ) tstdat <- data.frame(cbind(y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10)) # I'll estimate misclassification rate of KNN using the # training data to classify the test data. I'll use both # the kknn function (which scales the data) and the knn # function (which does not scale the data). k.max <- 70 # largest num. of nearest neighbors considered # (can easily be changed) gen.err.s <- numeric(k.max) gen.err.u <- numeric(k.max) for (k.val in 1:k.max) { pred.s <- kknn(as.factor(y)~.,train=trndat,test=tstdat,k=k.val,kernel="rectangular") gen.err.s[k.val] <- mean(pred.s$fit != tstdat[,1]) pred.u <- knn(trndat[,-1], tstdat[,-1], cl=trndat[,1], k=k.val) gen.err.u[k.val] <- mean(pred.u != tstdat[,1]) } plot(c(1:k.max),gen.err.u,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.08,0.27),col="darkgreen") points(c(1:k.max),gen.err.s,type="l",col="green") legend(20,0.27,legend=c("unbiased from test sample (unscaled)", "unbiased from test sample (scaled)"), fill=c("darkgreen","green")) # The results from the scaled and the unscaled data are very similar. # This is due to the fact that all of the predictor variables came from # the same distribution, so that the sample standard deviations should # be very similar (making it so scaling has little effect). # (Note: When I first did this exercise, I forgot that the default for # kknn is to use the triangular kernel, which does weighted nearest # neighbors classification. When I did it incorrectly this way, the # results from kknn and knn were much more different (not so much due # to the scaling vs. not scaling issue, but rather due to the difference # between doing weighted nearest neighbors and ordinary nearest neighbors # classification).) ##### # Below I'll compute the sample standard deviation for x1 through x10. # We can see that they are all fairly close to one another. stdev <- numeric(10) for (i in 2:11) stdev[i-1] <- sd(trndat[,i]) feature <- c(1:10) results <- cbind(feature,stdev) results # Note: All of the underlying random variables, being # uniform (0,1), have standard deviation 1/sqrt(12). 1/sqrt(12) ##### # The next step of the exercise is to estimate the # misclassification error using fivefold cross-validation # and compare these c-v estimates with the estimates of the # true error rate obtained from the large test sample. # Since using train.kknn is an easy way to do leave-one-out # (nfold) c-v, I'll first use it, and then write and use a # routine to perform fivefold c-v. # Here's the "canned" nfold c-v using train.kknn. cvknn <- train.kknn(as.factor(y) ~ ., trndat, kmax=k.max, kernel="rectangular", distance=2) plot(c(1:k.max),gen.err.s,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates (Scaled Data)', ylim=c(0.08,0.33),col="green") points(c(1:k.max),cvknn$MISCLASS,type="l",col="cornflowerblue") legend(20,0.33,legend=c("unbiased from test sample", "nfold c-v","5fold c-v","AIC-like"), fill=c("green","cornflowerblue","blue","red")) # Because there is little difference between using # the scaled and the unscaled data, I'll just show # results based on the scaled data. ##### # Here's my fivefold c-v routine. cv5.err.s <- rep(0,k.max) fold.cases.block <- c(1:(n.trn/5)) for (fold in 1:5) { fold.cases <- (fold - 1)*(n.trn/5) + fold.cases.block for (k.val in 1:k.max) { pred.s <- kknn(as.factor(y)~., train=trndat[-fold.cases,], test=trndat[fold.cases,], k=k.val, kernel="rectangular") cv5.err.s[k.val] <- cv5.err.s[k.val] + sum(pred.s$fit != trndat[fold.cases,1]) } } cv5.err.s <- cv5.err.s/n.trn points(c(1:k.max), cv5.err.s,type="l",col="blue") # Overall, it seems for a training sample size of 100, # nfold c-v tracks the true error rates better than # fivefold c-v does. ##### # The last step of the exercise is to compute an # "AIC-like" estimate of the error rate by adding # a penalty to the resubstitution estimate of the # error rate. Specifically, the AIC-like measure # is # resub. est. + 2(approx. num. param.)/n. # Letting # approx. num. param. = n/k, # we get that the AIC-like measure is # resub. est. + 2/k. # (Note: This will produce an error rate exceeding # 1 (it'll be 2 if there are no equal x-vectors) for # k = 1, and an estimated error rate of at least 1 # if k = 2.) AIC.like.s <- numeric(k.max) for (k.val in 1:k.max) { pred.s <- kknn(as.factor(y)~.,train=trndat,test=trndat, k=k.val,kernel="rectangular") AIC.like.s[k.val] <- mean(pred.s$fit != trndat[,1]) + 2/k.val } points(c(1:70), AIC.like.s,type="l",col="red") # The AIC-like error rate estimates are poor for the # smaller values of k, but do a bit better then fivefold # c-v for large values of k. But overall, I think the # nfold c-v does the best job. ##### # I ran everything again with the sample sizes given below. # There was less difference between the nfold and 5fold c-v # estimates, and the AIC-like measures were horribly bad # compared to c-v estimates. n.trn <- 400 n.test <- 2000 ##### End #####