# Part of Exercise 13.7 on p. 483 of HTF2
# ... Dealing with the "Easy" Problem


# The first step of the exercise is to reproduce the 
# results in the left panel of Fig. 13.5 on p. 469.
# Do do this, I first need to create the training and
# test sets.  I'll load some libraries that may be useful,
# set the random number seed, and then create the data sets.

library(MASS)
library(class)
library(kknn)

set.seed(12)


# creation of training and test data

n.trn <- 100 # can easily change sample size
             # (should be divisible by 5 (due
             # to use of 5fold c-v later))

n.tst <- 1000 # can easily change sample size

x1 <- runif(n.trn)
x2 <- runif(n.trn)
x3 <- runif(n.trn)
x4 <- runif(n.trn)
x5 <- runif(n.trn)
x6 <- runif(n.trn)
x7 <- runif(n.trn)
x8 <- runif(n.trn)
x9 <- runif(n.trn)
x10 <- runif(n.trn)
y <- numeric(n.trn)
y <- ifelse( x1 > 0.5, 1, 0 )
trndat <- data.frame(cbind(y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10))

x1 <- runif(n.tst)
x2 <- runif(n.tst)
x3 <- runif(n.tst)
x4 <- runif(n.tst)
x5 <- runif(n.tst)
x6 <- runif(n.tst)
x7 <- runif(n.tst)
x8 <- runif(n.tst)
x9 <- runif(n.tst)
x10 <- runif(n.tst)
y <- numeric()
y <- ifelse( x1 > 0.5, 1, 0 )
tstdat <- data.frame(cbind(y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10))

# I'll estimate misclassification rate of KNN using the 
# training data to classify the test data.  I'll use both 
# the kknn function (which scales the data) and the knn
# function (which does not scale the data).

k.max <- 70 # largest num. of nearest neighbors considered
            # (can easily be changed)

gen.err.s <- numeric(k.max)
gen.err.u <- numeric(k.max)
for (k.val in 1:k.max)
{
  pred.s <- kknn(as.factor(y)~.,train=trndat,test=tstdat,k=k.val,kernel="rectangular")
  gen.err.s[k.val] <- mean(pred.s$fit != tstdat[,1])
  pred.u <- knn(trndat[,-1], tstdat[,-1], cl=trndat[,1], k=k.val)
  gen.err.u[k.val] <- mean(pred.u != tstdat[,1])
}

plot(c(1:k.max),gen.err.u,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.08,0.27),col="darkgreen") 
points(c(1:k.max),gen.err.s,type="l",col="green")
legend(20,0.27,legend=c("unbiased from test sample (unscaled)",
                        "unbiased from test sample (scaled)"),
       fill=c("darkgreen","green"))

# The results from the scaled and the unscaled data are very similar.
# This is due to the fact that all of the predictor variables came from 
# the same distribution, so that the sample standard deviations should 
# be very similar (making it so scaling has little effect).  

# (Note: When I first did this exercise, I forgot that the default for
# kknn is to use the triangular kernel, which does weighted nearest
# neighbors classification.  When I did it incorrectly this way, the
# results from kknn and knn were much more different (not so much due 
# to the scaling vs. not scaling issue, but rather due to the difference
# between doing weighted nearest neighbors and ordinary nearest neighbors
# classification).)

#####

# Below I'll compute the sample standard deviation for x1 through x10.
# We can see that they are all fairly close to one another.

stdev <- numeric(10)
for (i in 2:11) stdev[i-1] <- sd(trndat[,i]) 
feature <- c(1:10)
results <- cbind(feature,stdev)
results
# Note: All of the underlying random variables, being 
# uniform (0,1), have standard deviation 1/sqrt(12).
1/sqrt(12)

#####

# The next step of the exercise is to estimate the
# misclassification error using fivefold cross-validation
# and compare these c-v estimates with the estimates of the
# true error rate obtained from the large test sample.

# Since using train.kknn is an easy way to do leave-one-out 
# (nfold) c-v, I'll first use it, and then write and use a 
# routine to perform fivefold c-v.

# Here's the "canned" nfold c-v using train.kknn.

cvknn <- train.kknn(as.factor(y) ~ ., trndat, 
          kmax=k.max, kernel="rectangular", distance=2) 

plot(c(1:k.max),gen.err.s,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates (Scaled Data)',
      ylim=c(0.08,0.33),col="green")
points(c(1:k.max),cvknn$MISCLASS,type="l",col="cornflowerblue")
legend(20,0.33,legend=c("unbiased from test sample",
       "nfold c-v","5fold c-v","AIC-like"),
       fill=c("green","cornflowerblue","blue","red"))

# Because there is little difference between using 
# the scaled and the unscaled data, I'll just show 
# results based on the scaled data.

#####

# Here's my fivefold c-v routine.

cv5.err.s <- rep(0,k.max)
fold.cases.block <- c(1:(n.trn/5))
for (fold in 1:5)
{
  fold.cases <- (fold - 1)*(n.trn/5) + fold.cases.block
  for (k.val in 1:k.max)
  {
    pred.s <- kknn(as.factor(y)~., train=trndat[-fold.cases,],
                   test=trndat[fold.cases,], k=k.val, kernel="rectangular")
    cv5.err.s[k.val] <- cv5.err.s[k.val] +
                        sum(pred.s$fit != trndat[fold.cases,1])
  }
}
cv5.err.s <- cv5.err.s/n.trn
points(c(1:k.max), cv5.err.s,type="l",col="blue")

# Overall, it seems for a training sample size of 100,
# nfold c-v tracks the true error rates better than 
# fivefold c-v does.

#####

# The last step of the exercise is to compute an
# "AIC-like" estimate of the error rate by adding
# a penalty to the resubstitution estimate of the
# error rate.  Specifically, the AIC-like measure 
# is
#      resub. est. + 2(approx. num. param.)/n.
# Letting
#      approx. num. param. = n/k,
# we get that the AIC-like measure is
#      resub. est. + 2/k.
# (Note: This will produce an error rate exceeding
# 1 (it'll be 2 if there are no equal x-vectors) for
# k = 1, and an estimated error rate of at least 1
# if k = 2.)

AIC.like.s <- numeric(k.max)
for (k.val in 1:k.max)
{
  pred.s <- kknn(as.factor(y)~.,train=trndat,test=trndat,
                 k=k.val,kernel="rectangular")
  AIC.like.s[k.val] <- mean(pred.s$fit != trndat[,1]) + 2/k.val
}
points(c(1:70), AIC.like.s,type="l",col="red")

# The AIC-like error rate estimates are poor for the 
# smaller values of k, but do a bit better then fivefold 
# c-v for large values of k.  But overall, I think the 
# nfold c-v does the best job.

#####

# I ran everything again with the sample sizes given below.
# There was less difference between the nfold and 5fold c-v
# estimates, and the AIC-like measures were horribly bad 
# compared to c-v estimates.
 
n.trn <- 400
n.test <- 2000

##### End #####