# Investigation of the Functions
#    knn, kknn, & train.kknn     
# ------------------------------


library(MASS)
library(class)
library(kknn)

# Fix the RNG state so that every run draws the same samples.
set.seed(13)


# Training sample: 50 cases with two uniform predictors.
#   x1 = runif(50)      -> decides the class (1 when x1 > 0.5, else 0)
#   x2 = 2 * runif(50)  -> never used when assigning the class

n.trn <- 50
x1 <- runif(n.trn)
x2 <- runif(n.trn) * 2
y <- as.numeric(x1 > 0.5)
trndat <- data.frame(y = y, x1 = x1, x2 = x2)

# Only x1 carries information about the class.  The noise variable,
# x2, is spread over an interval twice as wide, so on the raw scale
# it will have more say than x1 in deciding which cases are the
# nearest neighbors.

# Sample standard deviations of the two training predictors:

sd(trndat$x1)

sd(trndat$x2)

#####

# creation of test data

# Test sample: 500 fresh cases generated by the same recipe as the
# training sample (x1 sets the class, x2 is noise).  Note that this
# overwrites the x1, x2 and y vectors used to build the training set.
n.tst <- 500
x1 <- runif(n.tst)
x2 <- runif(n.tst) * 2
y <- as.numeric(x1 > 0.5)
tstdat <- data.frame(y = y, x1 = x1, x2 = x2)


# I'll estimate misclassification rate of KNN using 
# the training data to classify the test data.  I'll 
# use both the kknn function (which scales the data) 
# and the knn function (which does not scale the data).  

# Due to the way the data was generated, whether or not 
# the predictors are scaled should make an appreciable 
# difference, with the classification errors being reduced 
# if the scaled data is used (since scaling will reduce the 
# weight of the noise variable, x2, in the determination of 
# the nearest neighbors).

# Proportion of test cases misclassified for k = 1, ..., 50 when the
# training sample is used to classify the test sample, once with kknn
# and once with knn.  Both are handed the raw (unscaled) predictors.
k.grid <- 1:50
gen.err.kknn <- numeric(length(k.grid))
gen.err.knn <- numeric(length(k.grid))
tst.class <- tstdat[["y"]]

for (k.val in k.grid) {
  # kknn first, with the rectangular kernel requested explicitly.
  fit.kknn <- kknn(as.factor(y) ~ ., train = trndat, test = tstdat,
                   k = k.val, kernel = "rectangular")
  gen.err.kknn[k.val] <- mean(fit.kknn$fit != tst.class)

  # Then knn on the same predictors, with the class labels passed
  # separately through cl.
  fit.knn <- knn(trndat[c("x1", "x2")], tstdat[c("x1", "x2")],
                 cl = trndat[["y"]], k = k.val)
  gen.err.knn[k.val] <- mean(fit.knn != tst.class)
}

# Error-rate curves for the raw data: knn in green, kknn in red.
# The legend already lists the two scaled-data curves that are added
# to this figure further down.
k.axis <- 1:50
plot(k.axis, gen.err.knn, type = "l", col = "limegreen",
     ylim = c(0, 0.60), xlab = 'k', ylab = 'proportion misclassified',
     main = 'Estimated Error Rates')
lines(k.axis, gen.err.kknn, col = "red")
legend(x = 0, y = 0.6,
       legend = c("knn (raw data)", "knn (scaled data)",
                  "kknn (raw data)", "kknn (scaled data)"),
       fill = c("limegreen", "purple", "red", "goldenrod"))

# As anticipated, kknn resulted in fewer misclassifications
# because it scales the data.  Even with k = 1 (for which they
# both do very well), kknn is just a tad better (0.044 vs. 0.046).

#####

# To better understand the proportion of errors for the larger
# values of k, let's consider the distribution of the two classes
# in the training and test samples.

# Class counts in the training sample ...
table(trndat$y)

# ... and in the test sample.
table(tstdat$y)

# For k >= 43, more than half of the nearest neighbors of a test
# case in the training set will be of class 1 (since there are only 
# 21 class 0 cases in the training set).  Since there are 252 class 
# 0 cases in the test set, for k >= 43 we should get exactly 252
# errors (since all of the class 0 cases will be classified incorrectly, 
# and all of the class 1 cases will be classified correctly), and so the 
# proportion misclassified will be 252/500 = 0.504.  Let's take a look at 
# the proportions of test set cases misclassified for k >= 43.

# Misclassification proportions for the largest values of k (43 to 50).
big.k <- 43:50

# knn:
gen.err.knn[big.k]

# kknn:
gen.err.kknn[big.k]

# Both functions produced the expected results.  As Dan Carr would say,
# "The world is safe."  (Note: When I first looked into this, it seemed
# to me as though the world was inside out and upside down ... then I 
# realized that I had forgotten that the default with kknn is to use the
# triangular kernel (doing weighted KNN) instead of the rectangular kernel
# (which does ordinary vanilla KNN).)

#####

# Now I'll scale the data and use knn.

# Standardize the two training predictors and put the class label back
# in front, so scaled.trn has the same column layout as trndat.
sc.trn <- scale(trndat[c("x1", "x2")])
scaled.trn <- data.frame(y = trndat[["y"]], sc.trn)

# Sample standard deviation of the scaled x1:
sd(scaled.trn$x1)

# Sample standard deviation of the scaled x2:
sd(scaled.trn$x2)

# The test predictors must be transformed with the centers and scales
# computed from the TRAINING sample, not with their own means and
# standard deviations; scale() stored those values as attributes.
trn.center <- attr(sc.trn, "scaled:center")
trn.scale <- attr(sc.trn, "scaled:scale")
sc.tst <- scale(tstdat[c("x1", "x2")], center = trn.center, scale = trn.scale)
scaled.tst <- data.frame(y = tstdat[["y"]], sc.tst)

# Same experiment as with the raw data, but both classifiers are now
# handed the standardized training and test samples.
k.grid.scale <- 1:50
gen.err.knn.scale <- numeric(length(k.grid.scale))
gen.err.kknn.scale <- numeric(length(k.grid.scale))
tst.class.scale <- scaled.tst[["y"]]

for (k.val in k.grid.scale) {
  # kknn first, rectangular kernel.
  fit.kknn.scale <- kknn(as.factor(y) ~ ., train = scaled.trn,
                         test = scaled.tst, k = k.val,
                         kernel = "rectangular")
  gen.err.kknn.scale[k.val] <- mean(fit.kknn.scale$fit != tst.class.scale)

  # Then knn on the standardized predictors.
  fit.knn.scale <- knn(scaled.trn[c("x1", "x2")], scaled.tst[c("x1", "x2")],
                       cl = scaled.trn[["y"]], k = k.val)
  gen.err.knn.scale[k.val] <- mean(fit.knn.scale != tst.class.scale)
}

# Overlay the scaled-data curves on the open figure:
# kknn dashed in goldenrod, knn dotted in purple.
lines(1:50, gen.err.kknn.scale, lty = "dashed", col = "goldenrod")
lines(1:50, gen.err.knn.scale, lty = "dotted", col = "purple")

# It looks like kknn produces exactly the same results whether it is 
# given the raw data or the scaled data, but let's confirm this.

# Elementwise gap between the kknn error rates on raw and scaled data.
kknn.gap <- gen.err.kknn - gen.err.kknn.scale
kknn.gap

# Yep, exactly the same.

# The all.equal function gives the same answer in a single call.

all.equal(target = gen.err.kknn, current = gen.err.kknn.scale)

#####

#                     **** CONCLUSIONS ****

# (1) kknn produces the same results with the scaled and raw data.

# (2) Using scaled data with knn results in nearly the same thing
#     as using kknn. 


# To explore how using scaled data with knn differs from using kknn
# let's see if they are exactly the same for odd values of k, but 
# differ in some cases with even values of k.

# Index vectors picking out the odd (1, 3, ..., 49) and even
# (2, 4, ..., 50) values of k.
odd <- seq(1, 49, by = 2)
even <- odd + 1

# Gap between knn on scaled data and kknn, split by the parity of k.
knn.vs.kknn <- gen.err.knn.scale - gen.err.kknn

# Odd k:
knn.vs.kknn[odd]

# Even k:
knn.vs.kknn[even]

# Yep, they only differ for some even values 
# of k (due to ties being broken randomly).

#####

# Now I just want to confirm that train.kknn does what I think it
# does ... that is, do a leave-one out (aka nfold) c-v scaling the
# data first (so that it won't matter whether or not it is given 
# scaled data).

# First I'll use train.kknn with both the raw and the scaled data.

# Run train.kknn (rectangular kernel, k up to 49) on the raw and on the
# standardized training sample, then start a new figure showing the
# misclassification rates it reports for each k.
cvkknn <- train.kknn(as.factor(y) ~ ., data = trndat,
                     kmax = 49, kernel = "rectangular")
cvkknn.scale <- train.kknn(as.factor(y) ~ ., data = scaled.trn,
                           kmax = 49, kernel = "rectangular")

plot(1:49, cvkknn$MISCLASS, type = "l", col = "red",
     ylim = c(0, 0.6), xlab = 'k', ylab = 'proportion misclassified',
     main = 'Estimated Error Rates')
lines(1:49, cvkknn.scale$MISCLASS, lty = "dashed", col = "goldenrod")

# The last two legend entries refer to curves added to this figure by
# the hand-written cross-validation further down.
legend(x = 0, y = 0.6,
       legend = c("train.kknn (raw data)", "train.kknn (scaled data)",
                  "my c-v w/ kknn(raw data)", "my c-v w/ knn (scaled data)"),
       fill = c("red", "goldenrod", "purple", "limegreen"))

# The results are identical.
 
###################################################################
# Note: For some reason the above code may produce an error       #
# the first time you run it. But I've found that if I run it      #
# again starting with the line                                    #
#     cvkknn.scale <- train.kknn( ....                            #
# it works fine.  (Sometimes when I run it I don't get an error.) #
###################################################################

#####

# Now I'll use my own nfold c-v routine.

# Hand-written leave-one-out cross-validation over the 50 training
# cases for k = 1, ..., 49.  Each case in turn is held out, classified
# from the remaining 49, and its miss/hit recorded per value of k:
#   * kknn gets the raw data (trndat);
#   * knn gets the data standardized beforehand (scaled.trn).
# The case-major, k-minor visiting order is kept so that knn makes its
# calls (and breaks any ties) in the same sequence as before.
n.cases <- 50
k.cv <- 1:49
miss.kknn <- matrix(FALSE, nrow = n.cases, ncol = length(k.cv))
miss.knn <- matrix(FALSE, nrow = n.cases, ncol = length(k.cv))

for (held.out in seq_len(n.cases)) {
  for (k.val in k.cv) {
    loo.kknn <- kknn(as.factor(y) ~ ., train = trndat[-held.out, ],
                     test = trndat[held.out, ], k = k.val,
                     kernel = "rectangular")
    miss.kknn[held.out, k.val] <- loo.kknn$fit != trndat[held.out, "y"]

    loo.knn <- knn(scaled.trn[-held.out, c("x1", "x2")],
                   scaled.trn[held.out, c("x1", "x2")],
                   cl = scaled.trn[-held.out, "y"], k = k.val)
    miss.knn[held.out, k.val] <- loo.knn != scaled.trn[held.out, "y"]
  }
}

# Turn the miss counts per k into proportions of the 50 cases.
mycv.err.knn <- colSums(miss.knn) / n.cases
mycv.err.kknn <- colSums(miss.kknn) / n.cases

# Add both curves to the open figure (dotted; purple = kknn,
# limegreen = knn).
lines(k.cv, mycv.err.kknn, lty = "dotted", col = "purple")
lines(k.cv, mycv.err.knn, lty = "dotted", col = "limegreen")

# Let's check to see if all four ways produce agreement for odd values
# of k.  If so, then the differences we see are due to even values of 
# k, and we can assume they are due to the random breaking of ties.

# Reference curve: train.kknn on the raw data, odd k only.
# Each of the other three curves is compared against it in turn.
all.equal(target = cvkknn.scale$MISCLASS[odd], current = cvkknn$MISCLASS[odd])
all.equal(target = mycv.err.kknn[odd], current = cvkknn$MISCLASS[odd])
all.equal(target = mycv.err.knn[odd], current = cvkknn$MISCLASS[odd])

# Everything checks out!  I think the biggest worries in using knn and kknn
# are remembering that kknn scales the data automatically, while knn doesn't,
# and remembering that the default with kknn is the triangular kernel instead
# of the more commonly used rectangular kernel. 

##### FINISH #####