# This example deals with an extension of the # classification setting dealt with in Sec. 2.3 # of HTF2. I'll apply methods from Sec. 13.2 to # it in addition to some methods from Ch. 2 and # Ch. 4. # Developed using 2.9.0. # I'll load libraries and set the random number seed. library(MASS) library(mvtnorm) library(multinomRob) library(class) library(kknn) library(nnet) set.seed(7) # First I'll generate and plot the component means. Sig1 <- diag(1,2) # Sig1 is a variance-covariance matrix. Omeans <- rmvnorm(10, mean=c(0,1), sigma=Sig1) # O is used for the Orange class. Gmeans <- rmvnorm(10, mean=c(0,0), sigma=Sig1) # G is used for the Green class. Bmeans <- rmvnorm(10, mean=c(1,0), sigma=Sig1) # B is used for the Blue class. plot(Omeans[,1],Omeans[,2],col="darkorange",main='Component Means',xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4)) points(Gmeans[,1], Gmeans[,2], col="limegreen") points(Bmeans[,1], Bmeans[,2], col="royalblue") ##### # For each class I'll generate 10 counts which sum to 100 # using a multinomial (100,0.1,0.1,...,0.1) distribution. Ocounts <- rmultinomial(100, rep(0.1,10)) Gcounts <- rmultinomial(100, rep(0.1,10)) Bcounts <- rmultinomial(100, rep(0.1,10)) # Using the generated component means and # the generated counts as sample sizes (for # the various components of the mixture # distributions), I'll generate 100 training # sample obervations for each class. Sig2 <- diag(0.2, 2) Oxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } Gxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Gcounts[i] Gxvals[start:finish,] <- rmvnorm(Gcounts[i], mean=Gmeans[i,], sigma=Sig2) } Bxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } # Let's look at the training data. plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data',xlab='x1',ylab='x2', xlim=c(-3.5,4.5),ylim=c(-3.5,4.5)) points(Gxvals[,1], Gxvals[,2], col="limegreen") points(Bxvals[,1], Bxvals[,2], col="royalblue") ##### # I'll give response value of 0 for Orange class observations, # 1 for Green class observations, and 2 for the Blue class # observations, and then combine them together to form the # training data. Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Bclass <- cbind(2, Bxvals) trdata <- data.frame(rbind(Oclass, Gclass, Bclass)) names(trdata) <- c("g", "x1", "x2") # This completes the generation of the training data. # I'll generate a test set of 1000 additional # observations from each of the three classes. # (Note: The component means are the ones used # to create the training data.) I like to call # this data the generalization sample, because I # will only use it to get good estimates of the # error rate of various fitted classifiers for # future observations. Some call it a test sample, # but I like to think of a test sample as data I have # during the training/fitting process that I have # set aside so that I can avoid using things like # c-v and AIC for model/parameter selection. Ocounts <- rmultinomial(1000, rep(0.1,10)) Gcounts <- rmultinomial(1000, rep(0.1,10)) Bcounts <- rmultinomial(1000, rep(0.1,10)) Oxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } Gxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Gcounts[i] Gxvals[start:finish,] <- rmvnorm(Gcounts[i], mean=Gmeans[i,], sigma=Sig2) } Bxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oclass <- cbind(0, Oxvals) Gclass <- cbind(1, Gxvals) Bclass <- cbind(2, Bxvals) gendata <- data.frame(rbind(Oclass, Gclass, Bclass)) names(gendata) <- c("g", "x1", "x2") # I'll obtain an estimate of the Bayes error # rate using the large generalization sample. Opdf <-rep(0,3000) Gpdf <-rep(0,3000) Bpdf <-rep(0,3000) for (i in 1:10) { Opdf <- Opdf + dmvnorm(gendata[,2:3], Omeans[i,], Sig2) Gpdf <- Gpdf + dmvnorm(gendata[,2:3], Gmeans[i,], Sig2) Bpdf <- Bpdf + dmvnorm(gendata[,2:3], Bmeans[i,], Sig2) } pred.bayes <- ifelse(Gpdf > Bpdf, 1, 2) pred.bayes <- ifelse(Opdf > Gpdf & Opdf > Bpdf, 0, pred.bayes) est.bayes.rate <- mean(pred.bayes != gendata[,1]) est.bayes.rate # This is the estimated error rate of the Bayes classifier. # Below is a confusion matrix, with true class for the # rows and predicted class for the columns. The Green # class is the most difficult one to correctly predict. table(gendata[,1], pred.bayes) ##### # I'll use c-v to select a good number of nearest neighbors # to use for "vanilla" nearest neighbors classification. cvknn <- train.kknn(as.factor(g) ~ ., trdata, kmax=75, kernel="rectangular", distance=2) plot(c(1:75),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.27,0.45),col="cornflowerblue") which.min(cvknn$MISCLASS) cvknn$MISCLASS[17:30] # k values of 18, 20, and 28 produce a tie for the # fewest misclassifications. Since the "true curve" # (as opposed to the estimated curve we get from c-v) # may be smoother and concave upward with a single # minimum, I'd choose to use k = 19 (or 21) to classify # future observations. (I think an odd value for k is # better, since in regions where just two of the classes # are heavily competing, an odd value of k should lead to # fewer ties being randomly broken.) One can also note # that some of the c-v estimate are lower than the Bayes # rate estimate obtained from the large generalization # sample. #####(go ahead and start next chunk --- plenty of time to look at graph)##### # Let's use the large generalization sample to estimate # the error rates for future obnservations (the general- # ization rates). Also, I'll add a line showing the # (estimated) Bayes error rate. # Note: Since the function train.kknn automatically scales # the data, I'll use the the kknn function to do nearest # neighbors since it scales the data as well. I'll also # try it with unscaled data using the knn function. gen.err.s <- numeric(75) gen.err.u <- numeric(75) for (i in 1:75) { pred.gen.s <- kknn(as.factor(g)~.,train=trdata,test=gendata,k=i) gen.err.s[i] <- mean(pred.gen.s$fit != gendata[,1]) pred.gen.u <- knn(trdata[,-1], gendata[,-1], cl=trdata[,1], k=i) gen.err.u[i] <- mean(pred.gen.u != gendata[,1]) } points(c(1:75), gen.err.s, type="l", col="seagreen") points(c(1:75), gen.err.u, type="l", col="olivedrab") abline(est.bayes.rate, 0, col="violetred") legend(2,0.45,legend=c("generalization (unscaled)", "generalization (scaled)","cross-val","Bayes"), fill=c("olivedrab","seagreen","cornflowerblue","violetred")) # Note that the green curves for scaled and unscaled differ # appreciably for larger values of k. I wouldn't think that # scaling would have a large effect. Let's look at the standard # deviations of the two predictor variables and see how different # they are. sd(trdata[,2]) # sample standard deviation of x1 sd(trdata[,3]) # sample standard deviation of x2 # Not too different. # Below is the error rate for value of k I picked # after lookeding at the cross-validation results. gen.err.s[19] # error rate for 19nn classification # (using scaled data) gen.err.u[19] # error rate for 19nn classification # (using unscaled data) ##### # I'll try using weighted nearest neighbors # (with triangular weight function). cvknn <- train.kknn(as.factor(g) ~ ., trdata, kmax=75, kernel="triangular", distance=2) plot(c(1:75),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates', ylim=c(0.26,0.37),col="cornflowerblue") # k values of 9, 10, 13, and 14 tie for producing # the fewest misclassifications. Based on this, # I'll pick k = 11. Let's see what proportion of # the generalization sample is misclassified with # this choice. weighted11nn <- kknn(as.factor(g) ~ ., train=trdata, test=gendata, k=11, kernel="triangular", distance=2) mean(weighted11nn$fit != gendata[,1]) # A tad worse than vanilla nearest neighbors. ##### # Now let's check out classification using LDA. trlda <- lda(g ~ . , trdata) pred.trlda <- predict(trlda, newdata=gendata) lda.rate <- mean( pred.trlda$class != gendata[,1] ) lda.rate # LDA rate. # (A little worse than 19nn.) ##### # Now QDA. trqda <- qda(g ~ . , trdata) pred.trqda <- predict(trqda, newdata=gendata) qda.rate <- mean( pred.trqda$class != gendata[,1] ) qda.rate # QDA rate. # (A bit better.) ##### # Now I'll try 1st-, 2nd-, and 3rd-order logisitic # regression models. logreg.fit1 <- multinom(as.factor(g) ~ ., data=trdata) summary(logreg.fit1, Wald=T) # Let's see how well the fitted model predicts the # classes for the generalization set. pred.logreg <- predict(logreg.fit1, newdata=gendata, type="class") logreg.rate <- mean( pred.logreg != gendata[,1] ) logreg.rate # error rate for 1st-order logistic regression model # I'll try a full 2nd-order model. logreg.fit2 <- multinom(as.factor(g) ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), data=trdata) summary(logreg.fit2, Wald=T) pred.logreg <- predict(logreg.fit2, newdata=gendata, type="class") logreg.rate <- mean( pred.logreg != gendata[,1] ) logreg.rate # error rate for 2nd-order logistic regression model # Best so far (by just a wee bit). # I'll try a 3rd-order model. logreg.fit3 <- multinom(as.factor(g) ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) + I(x1^3) + I(x2^3) +I(x1*x2^2) + I(x2*x1^2), data=trdata) summary(logreg.fit3, Wald=T) pred.logreg <- predict(logreg.fit3, newdata=gendata, type="class") logreg.rate <- mean( pred.logreg != gendata[,1] ) logreg.rate # error rate for 3rd-order logistic regression model # Let's look at AIC values and error rates together. # model AIC error # ----- ----- ----- # 1st 473 0.372 # 2nd 448 0.335 # 3rd 397 0.345 #####(go ahead and start next chunk --- then look at log reg results)##### # Now I'll define a function that uses c-v with the # K-means clustering method from subsection 13.2.1 # of HTF2. The user supplies a vector of proportions # that determines the numbers of clusters per class # that will be compared using 10-fold c-v estimates # of error rates. The second of the two functions # defined below is the one that does the c-v. The # first of the two functions is used by the second # of the two functions. KMeansClassMod <- function(traindata, testdata, n.comp) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). Remaining col's of each set # should be predictor var's. n.comp should be a vector # containing the numbers of clusters to use for each of # the classes. To avoid an error with the kmeans function, # if the number of clusters specified is less than 2, the # number will be increased to 2. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] num.clusters <- max(n.comp[g-first.class+1], 2) clusters <- kmeans(classdata[,-1],num.clusters,iter.max=15,nstart=5) class.prototypes <- cbind(g, clusters$centers) prototypes <- rbind(prototypes, class.prototypes) } prototypes <- data.frame(prototypes) names(prototypes) <- names(traindata) predict.class <- kknn(as.factor(g)~.,train=prototypes,test=testdata,k=1) test.error <- mean( predict.class$fit != testdata[,1] ) test.error } KMeansClassCV <- function(data, proportions) # The proportions vector determines the numbers of clusters # per class that will be compared using 10-fold c-v estimates # of error rates. E.g., if the proportions vector is c(0.05, # 0.1, 0.2), for each class about 0.05*n_j clusters will be used # for K-means classification the first trial, 0.1*n_j clusters # will be used the second trial, and 0.2*n_j clusters will be # used the third trial, where n_j is the number of training # set cases for the class. (An exception is that if the number # of clusters would be less than 2, it is made to be 2. This # is done to avoid an error with the kmeans function.) The 1st # column of the data set should be class variable (and classes # should be labeled with consectutive nonnegative integers). # Remaining columns should be predictor var's. { n.total <- length(data[,1]) first.class <- min(data[,1]) last.class <- max(data[,1]) num.classes <- last.class - first.class + 1 n.class <- numeric(num.classes) for (j in first.class:last.class) { class.cases <- (data[,1] == j) n.class[j-first.class+1] <- length(data[class.cases,1]) } fold.size <- ceiling(n.total/10) fold.id <- rep(c(1:10), fold.size) fold.id <- sample(fold.id, n.total) num.trials <- length(proportions) errors <- rep(0, num.trials) for (i in 1:num.trials) { num.clusters <- round(n.class*proportions[i]) for (fold in 1:10) { fold.cases <- (fold.id == fold) errors[i] <- errors[i] + length(data[fold.cases,1])*KMeansClassMod(data[!fold.cases,],data[fold.cases,],num.clusters) } } err.rate <- errors/n.total results <- cbind(proportions, err.rate) results } KMeansClassCV(trdata, c(0.05, 0.1, 0.15, 0.2, 0.25)) # Note: Originally I used kkn in my functions, and I # didn't get any warnings. But then I decided to use # kknn (so that the data would be scaled automatically), # which required that I make the set of prototypes into # a data frame, and then I got warnings. I don't think # the warnings indicate a serious problem. # I'll try some smaller proportions. KMeansClassCV(trdata, c(0.01,0.02,0.03,0.04,0.05)) # A proportion of 0.04 corresponds to 4 clusters per class. # But since 4 is what worked best during the 10-fold c-v, # when the average sample size per class was 90, it could # be that with 100 per class in the entire training sample, # 5 clusters per class will work good. Altogether, I'll check # out 3, 4, 5, and 6 clusters per class for a comparison. KMeansClassMod(trdata,gendata,c(3,3,3)) KMeansClassMod(trdata,gendata,c(4,4,4)) KMeansClassMod(trdata,gendata,c(5,5,5)) KMeansClassMod(trdata,gendata,c(6,6,6)) # 4 and 5 clusters per class worked best (of these # choices). The error rates are close to the best # so far, but lots of things seem to be doing about # the same (and giving error rates not too far above # the Bayes rate). ##### # Since some of the following parts take a long # time to run, I'm going to reset the random number # seed so that before the seminar meeting I can run # these parts first, and then go back and run the # earlier stuff during the seminar presentation. set.seed(12) # Now I want to use c-v with the LVQ method # from subsection 13.2.2 of HTF2. LVQCV <- function(data, proportions, num.it, init.ep) # The proportions vector determines the numbers of clusters # per class that will be compared using 10-fold c-v estimates # of error rates. E.g., if the proportions vector is c(0.05, # 0.1, 0.2), for each class about 0.05*n_j clusters will be used # for K-means classification the first trial, 0.1*n_j clusters # will be used the second trial, and 0.2*n_j clusters will be # used the third trial, where n_j is the number of training # set cases for the class. The first column of the data set # should be class variable (and classes should be labeled with # consectutive nonnegative integers). Remaining columns should # be predictor var's. { n.total <- length(data[,1]) first.class <- min(data[,1]) last.class <- max(data[,1]) num.classes <- last.class - first.class + 1 n.class <- numeric(num.classes) for (j in first.class:last.class) { class.cases <- (data[,1] == j) n.class[j-first.class+1] <- length(data[class.cases,1]) } fold.size <- ceiling(n.total/10) fold.id <- rep(c(1:10), fold.size) fold.id <- sample(fold.id, n.total) num.trials <- length(proportions) errors <- rep(0, num.trials) for (i in 1:num.trials) { num.pro <- round(n.class*proportions[i]) for (fold in 1:10) { fold.cases <- (fold.id == fold) errors[i] <- errors[i] + length(data[fold.cases,1])*LVQMod(data[!fold.cases,],data[fold.cases,],num.pro,num.it,init.ep) } } err.rate <- errors/n.total results <- cbind(proportions, err.rate) results } LVQMod <- function(traindata,testdata,n.pro,iterations,initial.epsilon) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). # Remaining col's of each set should be predictor var's. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] n.cases <- length(classdata[,1]) selected <- sample.int(n.cases, n.pro[g-first.class+1]) class.prototypes <- classdata[selected,] prototypes <- rbind(prototypes, class.prototypes) } train.size <- length(traindata[,1]) for (i in 1:iterations) { epsilon <- initial.epsilon/i case <- sample.int(train.size, 1) distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1]))) closest <- as.integer(which.min(distances[-1,1])) ifelse(prototypes[closest,1]==traindata[case,1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1]) } predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1) errors <- mean( predict.class != testdata[,1] ) errors } LVQCV(trdata, c(0.15, 0.16, 0.17, 0.18, 0.19), 700, 0.5) # I tried other combinations of proportions and parameter # settings, and a proportion of 0.16 with the settings as # above worked best. # A proportion of 0.16 corresponds to 16 clusters per class. # But since 16 is what worked best during the 10-fold c-v, # when the average sample size per class was 90, it could # be that with 100 per class in the entire training sample, # 17 pr 18 clusters per class will work better. Altogether, # I'll check out 16, 17, and 18 clusters per class for a # comparison. LVQMod(trdata,gendata,c(16,16,16),700,0.5) LVQMod(trdata,gendata,c(17,17,17),700,0.5) LVQMod(trdata,gendata,c(18,18,18),700,0.5) # 16 prototypes per class worked best. ##### # Now I'll combine the K-means starting points # with the movement of the prototypes used in # the LVQ method. KMeansPlusLVQCV <- function(data,proportions,num.it,init.ep) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). # Remaining col's of each set should be predictor var's. { n.total <- length(data[,1]) first.class <- min(data[,1]) last.class <- max(data[,1]) num.classes <- last.class - first.class + 1 n.class <- numeric(num.classes) for (j in first.class:last.class) { class.cases <- (data[,1] == j) n.class[j-first.class+1] <- length(data[class.cases,1]) } fold.size <- ceiling(n.total/10) fold.id <- rep(c(1:10), fold.size) fold.id <- sample(fold.id, n.total) num.trials <- length(proportions) errors <- rep(0, num.trials) for (i in 1:num.trials) { num.pro <- round(n.class*proportions[i]) for (fold in 1:10) { fold.cases <- (fold.id == fold) errors[i] <- errors[i] + length(data[fold.cases,1])*KMeansPlusLVQMod(data[!fold.cases,],data[fold.cases,],num.pro,num.it,init.ep) } } err.rate <- errors/n.total results <- cbind(proportions, err.rate) results } KMeansPlusLVQMod <- function(traindata,testdata,n.pro,iterations,initial.epsilon) { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] clusters <- kmeans(classdata[,-1],n.pro[g-first.class+1],iter.max=15,nstart=5) class.prototypes <- cbind(g, clusters$centers) prototypes <- rbind(prototypes, class.prototypes) } prototypes <- data.frame(prototypes) train.size <- length(traindata[,1]) for (i in 1:iterations) { epsilon <- initial.epsilon/i case <- sample.int(train.size, 1) distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1]))) closest <- as.integer(which.min(distances[-1,1])) ifelse(prototypes[closest,1]==traindata[case,1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1]) } predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1) test.error <- mean( predict.class != testdata[,1] ) test.error } KMeansPlusLVQCV(trdata,c(0.05,0.1,0.15,0.2),500,0.5) # I tried other combinations of proportions and parameter # settings, and a proportion of 0.04 with the settings as # above worked best. KMeansPlusLVQMod(trdata,gendata,c(4,4,4),500,0.5) KMeansPlusLVQMod(trdata,gendata,c(5,5,5),500,0.5) ########## End ##########