# This example deals with the classification setting # dealt with in Sec. 2.3 of HTF2. I'll use SVMs in # in addition to some methods from Ch. 2, Ch. 4, and # Sec. 13.2. # Developed using 2.9.0. # I'll load libraries and set the random number seed. library(MASS) library(mvtnorm) library(multinomRob) library(class) library(kknn) library(e1071) # for svm f'n (for support vector machines) set.seed(18) # First I'll generate and plot the component means. Sig1 <- diag(1,2) # Sig1 is a variance-covariance matrix. Omeans <- rmvnorm(10, mean=c(0,1), sigma=Sig1) # O is used for the Orange class. Bmeans <- rmvnorm(10, mean=c(1,0), sigma=Sig1) # B is used for the Blue class. plot(Omeans[,1],Omeans[,2],col="darkorange",main='Component Means',xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4)) points(Bmeans[,1], Bmeans[,2], col="royalblue") ##### # For each class I'll generate 10 counts which sum to 100 # using a multinomial (100,0.1,0.1,...,0.1) distribution. Ocounts <- rmultinomial(100, rep(0.1,10)) Bcounts <- rmultinomial(100, rep(0.1,10)) # Using the generated component means and # the generated counts as sample sizes (for # the various components of the mixture # distributions), I'll generate 100 training # sample obervations for each class. Sig2 <- diag(0.2, 2) Oxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } Bxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } # Let's look at the training data. plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data',xlab='x1',ylab='x2', xlim=c(-3.5,4.5),ylim=c(-3.5,4.5)) points(Bxvals[,1], Bxvals[,2], col="royalblue") ##### # I'll give response value of 0 for Orange class observations, # and 1 for the Blue class observations, and then combine them # together to form the training data. Oclass <- cbind(0, Oxvals) Bclass <- cbind(1, Bxvals) trdata <- data.frame(rbind(Oclass, Bclass)) names(trdata) <- c("g", "x1", "x2") # This completes the generation of the training data. # I'll generate a test set of 1000 additional # observations from each of the two classes. # (Note: The component means are the ones used # to create the training data.) I like to call # this data the generalization sample, because I # will only use it to get good estimates of the # error rate of various fitted classifiers for # future observations. Some call it a test sample, # but I like to think of a test sample as data I have # during the training/fitting process that I have # set aside so that I can avoid using things like # c-v and AIC for model/parameter selection. Ocounts <- rmultinomial(1000, rep(0.1,10)) Bcounts <- rmultinomial(1000, rep(0.1,10)) Oxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } Bxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oclass <- cbind(0, Oxvals) Bclass <- cbind(1, Bxvals) gendata <- data.frame(rbind(Oclass, Bclass)) names(gendata) <- c("g", "x1", "x2") # I'll obtain an estimate of the Bayes error # rate using the large generalization sample. Opdf <-rep(0,2000) Bpdf <-rep(0,2000) for (i in 1:10) { Opdf <- Opdf + dmvnorm(gendata[,2:3], Omeans[i,], Sig2) Bpdf <- Bpdf + dmvnorm(gendata[,2:3], Bmeans[i,], Sig2) } pred.bayes <- ifelse(Opdf > Bpdf, 0, 1) est.bayes.rate <- mean(pred.bayes != gendata[,1]) est.bayes.rate # This is the estimated error rate of the Bayes classifier. # Below is a confusion matrix, with true class for the # rows and predicted class for the columns. Below that # is a plot showing the decision boundaries of the # Bayes classifier. table(gendata[,1], pred.bayes) plot(gendata[,2],gendata[,3],col=pred.bayes+2,main='Bayes Classifier',xlab='x1',ylab='x2', xlim=c(-3.5,4.5),ylim=c(-3.5,4.5)) ##### # I'll use c-v to select a good number of nearest neighbors # to use for "vanilla" nearest neighbors classification. cvknn <- train.kknn(as.factor(g) ~ ., trdata, kmax=50, kernel="rectangular", distance=2) plot(c(1:50),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.2,0.5),col="cornflowerblue") which.min(cvknn$MISCLASS) cvknn$MISCLASS[9:35] # Several values of k ranging from 9 to 31 tie for producing # the fewest misclassifications. Since the "true curve" # (as opposed to the estimated curve we get from c-v) # may be smoother and concave upward with a single # minimum, I'd choose to use k = 21 (or 19) to classify # future observations. (I think an odd value for k is # better since it should lead to fewer ties being randomly # broken.) # I'll use the large generalization sample to estimate # the error rate for the 21 nearest neighbors classifier. pred.21nn <- kknn(as.factor(g)~.,train=trdata,test=gendata,k=21) gen.err.21nn <- mean(pred.21nn$fit != gendata[,1]) gen.err.21nn # est. gen. err. rate for 21 nearest neighbors ##### # Now let's check out classification using LDA and QDA. trlda <- lda(g ~ . , trdata) pred.trlda <- predict(trlda, newdata=gendata) lda.rate <- mean( pred.trlda$class != gendata[,1] ) lda.rate # LDA rate. trqda <- qda(g ~ . , trdata) pred.trqda <- predict(trqda, newdata=gendata) qda.rate <- mean( pred.trqda$class != gendata[,1] ) qda.rate # QDA rate. # (Better than LDA, but not as good as 21 nearest neighbors.) ##### # Now I'll try 1st-, 2nd-, and 3rd-order logisitic # regression models. logreg.fit1 <- glm(as.factor(g) ~ ., data=trdata, family=binomial) summary(logreg.fit1) # Let's see how well the fitted model predicts the # classes for the generalization set. pred.logreg1 <- predict(logreg.fit1, newdata=gendata, type="response") logreg1.rate <- mean( round(pred.logreg1) != gendata[,1] ) logreg1.rate # error rate for 1st-order logistic regression model # (very close to LDA error rate) # I'll try a full 2nd-order model. logreg.fit2 <- glm(as.factor(g) ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), data=trdata, family=binomial) summary(logreg.fit2) pred.logreg2 <- predict(logreg.fit2, newdata=gendata, type="response") logreg2.rate <- mean( round(pred.logreg2) != gendata[,1] ) logreg2.rate # error rate for 2nd-order logistic regression model # I'll try a 3rd-order model. logreg.fit3 <- glm(as.factor(g) ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) + I(x1^3) + I(x2^3) +I(x1*x2^2) + I(x2*x1^2), data=trdata, family=binomial) summary(logreg.fit3) pred.logreg3 <- predict(logreg.fit3, newdata=gendata, type="response") logreg3.rate <- mean( round(pred.logreg3) != gendata[,1] ) logreg3.rate # error rate for 3rd-order logistic regression model ##### # Now I'll define a function that uses c-v with the # K-means clustering method from subsection 13.2.1 # of HTF2. The user supplies a vector of proportions # that determines the numbers of clusters per class # that will be compared using 10-fold c-v estimates # of error rates. The second of the two functions # defined below is the one that does the c-v. The # first of the two functions is used by the second # of the two functions. KMeansClassMod <- function(traindata, testdata, n.comp) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). Remaining col's of each set # should be predictor var's. n.comp should be a vector # containing the numbers of clusters to use for each of # the classes. To avoid an error with the kmeans function, # if the number of clusters specified is less than 2, the # number will be increased to 2. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] num.clusters <- max(n.comp[g-first.class+1], 2) clusters <- kmeans(classdata[,-1],num.clusters,iter.max=15,nstart=5) class.prototypes <- cbind(g, clusters$centers) prototypes <- rbind(prototypes, class.prototypes) } prototypes <- data.frame(prototypes) names(prototypes) <- names(traindata) predict.class <- kknn(as.factor(g)~.,train=prototypes,test=testdata,k=1) test.error <- mean( predict.class$fit != testdata[,1] ) test.error } KMeansClassCV <- function(data, proportions) # The proportions vector determines the numbers of clusters # per class that will be compared using 10-fold c-v estimates # of error rates. E.g., if the proportions vector is c(0.05, # 0.1, 0.2), for each class about 0.05*n_j clusters will be used # for K-means classification the first trial, 0.1*n_j clusters # will be used the second trial, and 0.2*n_j clusters will be # used the third trial, where n_j is the number of training # set cases for the class. (An exception is that if the number # of clusters would be less than 2, it is made to be 2. This # is done to avoid an error with the kmeans function.) The 1st # column of the data set should be class variable (and classes # should be labeled with consectutive nonnegative integers). # Remaining columns should be predictor var's. { n.total <- length(data[,1]) first.class <- min(data[,1]) last.class <- max(data[,1]) num.classes <- last.class - first.class + 1 n.class <- numeric(num.classes) for (j in first.class:last.class) { class.cases <- (data[,1] == j) n.class[j-first.class+1] <- length(data[class.cases,1]) } fold.size <- ceiling(n.total/10) fold.id <- rep(c(1:10), fold.size) fold.id <- sample(fold.id, n.total) num.trials <- length(proportions) errors <- rep(0, num.trials) for (i in 1:num.trials) { num.clusters <- round(n.class*proportions[i]) for (fold in 1:10) { fold.cases <- (fold.id == fold) errors[i] <- errors[i] + length(data[fold.cases,1])*KMeansClassMod(data[!fold.cases,],data[fold.cases,],num.clusters) } } err.rate <- errors/n.total results <- cbind(proportions, err.rate) results } KMeansClassCV(trdata, c(0.05, 0.1, 0.15, 0.2, 0.25)) # Note: Originally I used kkn in my functions, and I # didn't get any warnings. But then I decided to use # kknn (so that the data would be scaled automatically), # which required that I make the set of prototypes into # a data frame, and then I got warnings. I don't think # the warnings indicate a serious problem. # I'll try some more proportions. KMeansClassCV(trdata, c(0.08,0.1,0.12,0.14)) # And some more. KMeansClassCV(trdata, c(0.06,0.07,0.08,0.09,0.1,0.11,0.12)) # It looks like 8 prototypes per class may be good. # Let's see what the estimated generalization error # is for that choice. KMeansClassMod(trdata,gendata,c(8,8)) # Not bad, but not as good as using 21 nearest neighbors. ##### # I next used c-v with the LVQ method # from subsection 13.2.2 of HTF2, but # since it took a long time to run # several times as I searched for good # parameter values to use, I'll skip # that part here and below just get # the estimated generalization error # for the choices that I ultimately made. LVQCV <- function(data, proportions, num.it, init.ep) # The proportions vector determines the numbers of clusters # per class that will be compared using 10-fold c-v estimates # of error rates. E.g., if the proportions vector is c(0.05, # 0.1, 0.2), for each class about 0.05*n_j clusters will be used # for K-means classification the first trial, 0.1*n_j clusters # will be used the second trial, and 0.2*n_j clusters will be # used the third trial, where n_j is the number of training # set cases for the class. The first column of the data set # should be class variable (and classes should be labeled with # consectutive nonnegative integers). Remaining columns should # be predictor var's. { n.total <- length(data[,1]) first.class <- min(data[,1]) last.class <- max(data[,1]) num.classes <- last.class - first.class + 1 n.class <- numeric(num.classes) for (j in first.class:last.class) { class.cases <- (data[,1] == j) n.class[j-first.class+1] <- length(data[class.cases,1]) } fold.size <- ceiling(n.total/10) fold.id <- rep(c(1:10), fold.size) fold.id <- sample(fold.id, n.total) num.trials <- length(proportions) errors <- rep(0, num.trials) for (i in 1:num.trials) { num.pro <- round(n.class*proportions[i]) for (fold in 1:10) { fold.cases <- (fold.id == fold) errors[i] <- errors[i] + length(data[fold.cases,1])*LVQMod(data[!fold.cases,],data[fold.cases,],num.pro,num.it,init.ep) } } err.rate <- errors/n.total results <- cbind(proportions, err.rate) results } LVQMod <- function(traindata,testdata,n.pro,iterations,initial.epsilon) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). # Remaining col's of each set should be predictor var's. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] n.cases <- length(classdata[,1]) selected <- sample.int(n.cases, n.pro[g-first.class+1]) class.prototypes <- classdata[selected,] prototypes <- rbind(prototypes, class.prototypes) } train.size <- length(traindata[,1]) for (i in 1:iterations) { epsilon <- initial.epsilon/i case <- sample.int(train.size, 1) distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1]))) closest <- as.integer(which.min(distances[-1,1])) ifelse(prototypes[closest,1]==traindata[case,1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1]) } predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1) errors <- mean( predict.class != testdata[,1] ) errors } # I tried several combinations of proportions and parameter # settings, and a proportion of 0.15 worked best. LVQMod(trdata, gendata, c(15,15,15), 500, 0.67) ##### # Now I'll combine the K-means starting points # with the movement of the prototypes used in # the LVQ method. KMeansPlusLVQCV <- function(data,proportions,num.it,init.ep) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). # Remaining col's of each set should be predictor var's. { n.total <- length(data[,1]) first.class <- min(data[,1]) last.class <- max(data[,1]) num.classes <- last.class - first.class + 1 n.class <- numeric(num.classes) for (j in first.class:last.class) { class.cases <- (data[,1] == j) n.class[j-first.class+1] <- length(data[class.cases,1]) } fold.size <- ceiling(n.total/10) fold.id <- rep(c(1:10), fold.size) fold.id <- sample(fold.id, n.total) num.trials <- length(proportions) errors <- rep(0, num.trials) for (i in 1:num.trials) { num.pro <- round(n.class*proportions[i]) for (fold in 1:10) { fold.cases <- (fold.id == fold) errors[i] <- errors[i] + length(data[fold.cases,1])*KMeansPlusLVQMod(data[!fold.cases,],data[fold.cases,],num.pro,num.it,init.ep) } } err.rate <- errors/n.total results <- cbind(proportions, err.rate) results } KMeansPlusLVQMod <- function(traindata,testdata,n.pro,iterations,initial.epsilon) { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] clusters <- kmeans(classdata[,-1],n.pro[g-first.class+1],iter.max=15,nstart=5) class.prototypes <- cbind(g, clusters$centers) prototypes <- rbind(prototypes, class.prototypes) } prototypes <- data.frame(prototypes) train.size <- length(traindata[,1]) for (i in 1:iterations) { epsilon <- initial.epsilon/i case <- sample.int(train.size, 1) distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1]))) closest <- as.integer(which.min(distances[-1,1])) ifelse(prototypes[closest,1]==traindata[case,1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1]) } predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1) test.error <- mean( predict.class != testdata[,1] ) test.error } # Without showing details of my search, I found that the # parameter settings used below looked promising. KMeansPlusLVQMod(trdata,gendata,c(6,6),500,0.5) # 2nd best result so far. # (21 nearest neighbors was better.) ##### # Now I will try some support vector machines. # The default kernel is the radial kernel. # Using the svm function, one has to supply a # value for the gamma parameter, or else rely # on the default, which is the inverse of the # number of predictors, or 1/2 in this case. # The help page stresses that one should tune, # and so a variety of gamma values should be # considered. The svm function has a cross- # validation feature which can be used to obtain # an estimate of generalization error rate from # the training sample for the particular value of # gamma which is currently being used. However, # even if the same random number seed is used, # the c-v estimates for a particular value of # gamma can differ from trial to trial. This # prevents me from doing a "trial-and-error" search # for a good gamma and making comments here, # because when this script is run I can't know # in advance what the outcomes will be. So I'll # just try a bunch of different gamma values # and obtain c-v estimates of the error rates, as # well as estimates of the error rates using the # fitted models to make predictions for the large # generalization sample. num.trials <- 15 gamma.val <- (1:15)/10 error.cv <- numeric(15) error.gen <- numeric(15) for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trdata, cross=10, gamma=gamma.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendata) != as.factor(gendata[,1]) ) } error.cv <- 1 - error.cv/100 plot(gamma.val,error.gen,type="l",xlab='gamma', ylab='proportion misclassified',main='Estimated Error Rates (SVM, radial kernel)', ylim=c(0.2,0.5),col="seagreen") points(gamma.val,error.cv,type="l",col="tomato") abline(est.bayes.rate, 0, col="purple") legend(0.15,0.5,legend=c("cross-val","generalization","Bayes"), fill=c("tomato","seagreen","purple")) # Let's look at the misclassification proportions # for the large generalization sample. error.gen # For one time I did it, cross-validation indicated # setting gamma to 0.9 would be good. Using 0.9, # only about 24% percent of the generalization sample # was misclassified. (Another time I did it the c-v # indicated gamma equal to 1.0 would be good, and yet # another time the indication was that gamma should be # 1.5. All of these choices result in good error rates. ##### # Below I'll demonstate the plot command applied to an svm object. # (It corresponds to the last svm fit above, for gamma = 1.5.) plot(svm.rad, trdata) summary(svm.rad) ##### # Here's the plot showing the Bayes decision boundaries again. plot(gendata[,2],gendata[,3],col=pred.bayes+2,main='Bayes Classifier',xlab='x1',ylab='x2', xlim=c(-3.5,4.5),ylim=c(-3.5,4.5)) ##### # Now I'll try using the polynomial kernel. To tune, # I'll start by considering various values for the # degree, keeping the other parameters at their default # settings. svm.poly2 <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=2) summary(svm.poly2) svm.poly3 <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=3) summary(svm.poly3) svm.poly4 <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=4) summary(svm.poly4) # C-v results indicate setting the degree to 3 # works worst, with a degree of 2 and 4 both # both working better. But even the best degree, # 2, doesn't look very promising. Nevertheless, # I'll try some different values of gamma to see # if I can get improvement with the degree equal 2. for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=2, gamma=gamma.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendata) != as.factor(gendata[,1]) ) } error.cv <- 1 - error.cv/100 plot(gamma.val,error.gen,type="l",xlab='gamma', ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 2 kernel)', ylim=c(0.2,0.5),col="seagreen") points(gamma.val,error.cv,type="l",col="tomato") abline(est.bayes.rate, 0, col="purple") legend(0.15,0.5,legend=c("cross-val","generalization","Bayes"), fill=c("tomato","seagreen","purple")) ##### # To be thorough, I'll now try altering the coef0 # parameter. (The default value is 0.) I'll set # gamma to 1.5. num.trials <- 15 coef0.val <- (-7:7)/5 error.cv <- numeric(15) error.gen <- numeric(15) for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=2, gamma=1.5, coef0=coef0.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendata) != as.factor(gendata[,1]) ) } error.cv <- 1 - error.cv/100 plot(coef0.val,error.gen,type="l",xlab='coef0', ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 2 kernel)', ylim=c(0.2,0.70),col="seagreen") points(coef0.val,error.cv,type="l",col="tomato") abline(est.bayes.rate, 0, col="purple") legend(0,0.7,legend=c("cross-val","generalization","Bayes"), fill=c("tomato","seagreen","purple")) ##### # Let's try some larger values for coef0. num.trials <- 15 coef0.val <- 2 + (-7:7)/5 error.cv <- numeric(15) error.gen <- numeric(15) for (i in 1:num.trials) { svm.rad <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=2, gamma=1.0, coef0=coef0.val[i]) error.cv[i] <- svm.rad$tot.accuracy error.gen[i] <- mean( predict(svm.rad, gendata) != as.factor(gendata[,1]) ) } error.cv <- 1 - error.cv/100 plot(coef0.val,error.gen,type="l",xlab='coef0', ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 2 kernel)', ylim=c(0.2,0.70),col="seagreen") points(coef0.val,error.cv,type="l",col="tomato") abline(est.bayes.rate, 0, col="purple") legend(1.0,0.7,legend=c("cross-val","generalization","Bayes"), fill=c("tomato","seagreen","purple")) ##### # Before quitting, I'll try the linear kernel. svm.lin <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="linear") summary(svm.lin) # The c-v estimate isn't promising, but I'll check the # performance on the generalization set anyway. mean( predict(svm.lin, gendata) != as.factor(gendata[,1]) ) # Not good. plot(svm.lin, trdata) ##### # Here is a summary of classification # performance on the generalization set. # Method est. error rate # ------------------------ --------------- # LDA 0.385 # SVM (linear) 0.3845 # logistic reg (1st order) 0.3845 # LVQ 0.3035 # K Means 0.2755 # QDA 0.2755 # logistic reg (3rd order) 0.275 # logitsic reg (2nd order) 0.272 # SVM (polynomial, 2) 0.27 # K Means + LVQ 0.265 # 21 nearest neighbors 0.252 # SVM (radial) 0.24 (about, it depends on what c-v leads you to select) # Bayes 0.2295 ########## End ##########