# Investigation of the Functions
# knn, kknn, & train.kknn
# ------------------------------

library(MASS)
library(class)
library(kknn)

# Fixed seed so that the simulated data (and the specific error rates
# and class counts quoted in later comments) are reproducible.
set.seed(13)

# Sample sizes for the simulated training and test sets.
n.trn <- 50
n.tst <- 500

# creation of training data
# (x1 is drawn before x2; keep this order so the random number stream,
#  and therefore the data, does not change.)
x1 <- runif(n.trn)
x2 <- 2 * runif(n.trn)
y <- ifelse(x1 > 0.5, 1, 0)
trndat <- data.frame(y = y, x1 = x1, x2 = x2)

# x1 determines the class, but the noise variable, x2,
# has greater variation, and will play a larger role in
# determining the nearest neighbors than x1 will if the
# two predictor variables aren't scaled.

# Let's compare the sample standard deviations of x1 & x2.
sd(x1)
sd(x2)

#####

# creation of test data
# (x1, x2, and y are deliberately reused, as in the training step;
#  from here on they hold the test-set values.)
x1 <- runif(n.tst)
x2 <- 2 * runif(n.tst)
y <- ifelse(x1 > 0.5, 1, 0)
tstdat <- data.frame(y = y, x1 = x1, x2 = x2)

# I'll estimate misclassification rate of KNN using
# the training data to classify the test data. I'll
# use both the kknn function (which scales the data)
# and the knn function (which does not scale the data).
# Due to the way the data was generated, whether or not
# the predictors are scaled should make an appreciable
# difference, with the classification errors being reduced
# if the scaled data is used (since scaling will reduce the
# weight of the noise variable, x2, in the determination of
# the nearest neighbors).
# Test-set misclassification rates for k = 1, ..., 50, using the raw
# (unscaled) training data with both kknn and knn. Element k of each
# vector holds the proportion of test cases misclassified with k
# neighbors.
gen.err.kknn <- numeric(50)
gen.err.knn <- numeric(50)

for (k.val in seq_along(gen.err.kknn)) {
  # kernel = "rectangular" requests ordinary (unweighted) KNN from kknn.
  pred.kknn <- kknn(as.factor(y) ~ ., train = trndat, test = tstdat,
                    k = k.val, kernel = "rectangular")
  gen.err.kknn[k.val] <- mean(pred.kknn$fit != tstdat[, 1])

  # Column 1 is the class label; the remaining columns are predictors.
  pred.knn <- knn(trndat[, -1], tstdat[, -1], cl = trndat[, 1], k = k.val)
  gen.err.knn[k.val] <- mean(pred.knn != tstdat[, 1])
}

# This plot is left open: curves for the scaled data are added to it
# later, which is why the legend already lists four entries.
plot(seq_along(gen.err.knn), gen.err.knn, type = "l", xlab = "k",
     ylab = "proportion misclassified", main = "Estimated Error Rates",
     ylim = c(0, 0.60), col = "limegreen")
points(seq_along(gen.err.kknn), gen.err.kknn, type = "l", col = "red")
legend(0, 0.6,
       legend = c("knn (raw data)", "knn (scaled data)",
                  "kknn (raw data)", "kknn (scaled data)"),
       fill = c("limegreen", "purple", "red", "goldenrod"))

# As anticipated, kknn resulted in fewer misclassifications
# because it scales the data. Even with k = 1 (for which they
# both do very well), kknn is just a tad better (0.044 vs. 0.046).

#####

# To better understand the proportion of errors for the larger
# values of k, let's consider the distribution of the two classes
# in the training and test samples.
table(trndat[, 1])
table(tstdat[, 1])

# For k >= 43, more than half of the nearest neighbors of a test
# case in the training set will be of class 1 (since there are only
# 21 class 0 cases in the training set). Since there are 252 class
# 0 cases in the test set, for k >= 43 we should get exactly 252
# errors (since all of the class 0 cases will be classified incorrectly,
# and all of the class 1 cases will be classified correctly), and so the
# proportion misclassified will be 252/500 = 0.504. Let's take a look at
# the proportions of test set cases misclassified for k >= 43.

# For knn:
gen.err.knn[43:50]

# For kknn:
gen.err.kknn[43:50]

# Both functions produced the expected results. As Dan Carr would say
# "The world is safe." (Note: When I first looked into this, it seemed
# to me as though the world was inside out and upside down ...
# ... then I realized that I had forgotten that the default with kknn
# is to use the triangular kernel (doing weighted KNN) instead of the
# rectangular kernel (which does ordinary vanilla KNN).)
# NOTE(review): the default kernel may depend on the kknn version
# (recent documentation appears to list "optimal") -- confirm against
# the installed package. Either way, kernel = "rectangular" must be
# requested explicitly to get unweighted KNN.

#####

# Now I'll scale the data and use knn.

# Standardize the training predictors (every column except the class
# label in column 1); scale() records the centers and scales it used as
# attributes of its result.
sc.trn <- scale(trndat[, -1])
scaled.trn <- data.frame(y = trndat[, 1], sc.trn)
names(scaled.trn) <- names(trndat)

# sample standard deviation of scaled x1:
sd(scaled.trn[, 2])

# sample standard deviation of scaled x2:
sd(scaled.trn[, 3])

# Apply the training-set centers and scales to the test predictors.
sc.tst <- scale(tstdat[, -1],
                center = attr(sc.trn, "scaled:center"),
                scale = attr(sc.trn, "scaled:scale"))
scaled.tst <- data.frame(y = tstdat[, 1], sc.tst)
names(scaled.tst) <- names(tstdat)

# (Note: Don't forget that the test set data should be scaled
# using the means and standard deviations from the training set.)

# Test-set misclassification rates for k = 1, ..., 50 using the
# manually scaled data with both knn and kknn.
gen.err.knn.scale <- numeric(50)
gen.err.kknn.scale <- numeric(50)

for (k.val in seq_along(gen.err.kknn.scale)) {
  pred.kknn.scale <- kknn(as.factor(y) ~ ., train = scaled.trn,
                          test = scaled.tst, k = k.val,
                          kernel = "rectangular")
  gen.err.kknn.scale[k.val] <- mean(pred.kknn.scale$fit != scaled.tst[, 1])

  pred.knn.scale <- knn(scaled.trn[, -1], scaled.tst[, -1],
                        cl = scaled.trn[, 1], k = k.val)
  gen.err.knn.scale[k.val] <- mean(pred.knn.scale != scaled.tst[, 1])
}

# Add the scaled-data curves to the error-rate plot that is already
# open (the one showing the raw-data results).
points(seq_along(gen.err.kknn.scale), gen.err.kknn.scale, type = "l",
       lty = "dashed", col = "goldenrod")
points(seq_along(gen.err.knn.scale), gen.err.knn.scale, type = "l",
       lty = "dotted", col = "purple")

# It looks like kknn produces exactly the same results whether it is
# given the raw data or the scaled data, but let's confirm this.
gen.err.kknn - gen.err.kknn.scale
# Yep, exactly the same.

# Note: Can also check this using the all.equal function.
all.equal(gen.err.kknn, gen.err.kknn.scale)

#####

# **** CONCLUSIONS ****
# (1) kknn produces the same results with the scaled and raw data.
# (2) Using scaled data with knn results in nearly the same thing
#     as using kknn.
# To explore how using scaled data with knn differs from using kknn
# let's see if they are exactly the same for odd values of k, but
# differ in some cases with even values of k.

# Even and odd values of k in 1, ..., 50 (used as index vectors).
even <- seq(2, 50, by = 2)
odd <- even - 1

# k odd:
gen.err.knn.scale[odd] - gen.err.kknn[odd]

# k even:
gen.err.knn.scale[even] - gen.err.kknn[even]

# Yep, they only differ for some even values
# of k (due to ties being broken randomly).

#####

# Now I just want to confirm that train.kknn does what I think it
# does ... that is, do a leave-one out (aka nfold) c-v scaling the
# data first (so that it won't matter whether or not it is given
# scaled data).

# First I'll use train.kknn with both the raw and the scaled data.
# kmax is the largest k tried, so MISCLASS has one entry per k in
# 1, ..., kmax.
kmax <- 49
cvkknn <- train.kknn(as.factor(y) ~ ., trndat, kmax = kmax,
                     kernel = "rectangular")
cvkknn.scale <- train.kknn(as.factor(y) ~ ., scaled.trn, kmax = kmax,
                           kernel = "rectangular")

# Start a new plot for the cross-validated error rates; the legend
# already lists the two hand-rolled c-v curves that are added later.
plot(seq_len(kmax), cvkknn$MISCLASS, type = "l", xlab = "k",
     ylab = "proportion misclassified", main = "Estimated Error Rates",
     ylim = c(0, 0.6), col = "red")
points(seq_len(kmax), cvkknn.scale$MISCLASS, type = "l",
       lty = "dashed", col = "goldenrod")
legend(0, 0.6,
       legend = c("train.kknn (raw data)", "train.kknn (scaled data)",
                  "my c-v w/ kknn(raw data)",
                  "my c-v w/ knn (scaled data)"),
       fill = c("red", "goldenrod", "purple", "limegreen"))

# The results are identical.

###################################################################
# Note: For some reason the above code may produce an error       #
# the first time you run it. But I've found that if I run it      #
# again starting with the line                                    #
#   cvkknn.scale <- train.kknn( ....                              #
# it works fine. (Sometimes when I run it I don't get an error.)  #
###################################################################

#####

# Now I'll use my own nfold c-v routine.
# Hand-rolled leave-one-out cross-validation on the training set.
# Each training case is held out in turn and classified from the
# remaining cases for every k; the error counts accumulated per k are
# converted to proportions after the loop.
n.trn <- nrow(trndat)
k.max <- n.trn - 1  # only n.trn - 1 cases remain once one is held out

mycv.err.knn <- numeric(k.max)
mycv.err.kknn <- numeric(k.max)

for (i in seq_len(n.trn)) {
  for (k.val in seq_len(k.max)) {
    # kknn is given the raw data for the retained cases.
    pred.kknn <- kknn(as.factor(y) ~ ., train = trndat[-i, ],
                      test = trndat[i, ], k = k.val,
                      kernel = "rectangular")
    mycv.err.kknn[k.val] <- mycv.err.kknn[k.val] +
      (pred.kknn$fit != trndat[i, 1])

    # knn is given scaled.trn, whose scaling was computed once from all
    # training cases (including the one held out here).
    pred.knn <- knn(scaled.trn[-i, -1], scaled.trn[i, -1],
                    cl = scaled.trn[-i, 1], k = k.val)
    mycv.err.knn[k.val] <- mycv.err.knn[k.val] +
      (pred.knn != scaled.trn[i, 1])
  }
}

# Convert error counts to proportions of held-out cases misclassified.
mycv.err.knn <- mycv.err.knn / n.trn
mycv.err.kknn <- mycv.err.kknn / n.trn

# Add both curves to the cross-validation plot that is already open.
points(seq_len(k.max), mycv.err.kknn, type = "l", lty = "dotted",
       col = "purple")
points(seq_len(k.max), mycv.err.knn, type = "l", lty = "dotted",
       col = "limegreen")

# Let's check to see if all four ways produce agreement for odd values
# of k. If so, then the differences we see are due to even values of
# k, and we can assume they are due to the random breaking of ties.
all.equal(cvkknn.scale$MISCLASS[odd], cvkknn$MISCLASS[odd])
all.equal(mycv.err.kknn[odd], cvkknn$MISCLASS[odd])
all.equal(mycv.err.knn[odd], cvkknn$MISCLASS[odd])

# Everything checks out! I think the biggest worries in using knn and kknn
# are to remember that kknn scales the data automatically, while knn doesn't,
# and to remember that the default with kknn is the triangular kernel instead
# of the more commonly used rectangular kernel.
# NOTE(review): the default kernel may differ between kknn versions --
# confirm against the installed package's documentation.

##### FINISH #####