# This example addresses the classification # setting dealt with in Sec. 2.3 of HTF2. # But I'll apply methods from Sec. 13.2 to it in # addition to some methods from Ch. 2 and Ch. 4. # Developed using 2.9.0. # I'll load libraries and set the random number seed. library(MASS) library(mvtnorm) library(multinomRob) library(class) library(kknn) set.seed(7) # First I'll generate the training data as described on pp. 16-17 of HTF2. Sig1 <- diag(1,2) # Sig1 is a variance-covariance matrix. Bmeans <- rmvnorm(10, mean=c(1,0), sigma=Sig1) # B is used for the Blue class. Omeans <- rmvnorm(10, mean=c(0,1), sigma=Sig1) # O is used for the Orange class. # I'll plot the component means. plot(Bmeans[,1],Bmeans[,2],col=4,main='Component Means',xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4)) points(Omeans[,1], Omeans[,2], col="darkorange") ##### # For each class I'll generate 10 counts which sum to 100 # using a multinomial (100,0.1,0.1,...,0.1) distribution. Bcounts <- rmultinomial(100, rep(0.1,10)) Ocounts <- rmultinomial(100, rep(0.1,10)) # Using the generated component means and # the generated counts as sample sizes (for # the various components of the mixture # distributions), I'll generate 100 training # sample obervations for each class. Sig2 <- diag(0.2, 2) Bxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oxvals <- matrix(0, 100, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } # Let's look at the training data. plot(Bxvals[,1],Bxvals[,2],col=4,main='Training Data',xlab='x1',ylab='x2', xlim=c(-3.5,4.5),ylim=c(-3.5,4.5)) points(Oxvals[,1], Oxvals[,2], col="darkorange") ##### # I'll give response value of 0 for Blue class observations, and # 1 for Orange class observations, then combine them together to # form the training data. Bclass <- cbind(0, Bxvals) Oclass <- cbind(1, Oxvals) trdata <- data.frame(rbind(Bclass, Oclass)) names(trdata) <- c("g", "x1", "x2") # This completes the generation of the training data. # I'll generate a test set of 1000 additional # observations from each of the two classes. # (Note: The component means are the ones used # to create the training data.) I like to call # this data the generalization sample, because I # will only use it to get good estimates of the # error rate of various fitted classifiers for # future observations. Some call it a test sample, # but I like to think of a test sample as data I have # during the training/fitting process that I have # set aside so that I can avoid using things like # c-v and AIC for model/parameter selection. Bcounts <- rmultinomial(1000, rep(0.1,10)) Ocounts <- rmultinomial(1000, rep(0.1,10)) Bxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Bcounts[i] Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) } Oxvals <- matrix(0, 1000, 2) finish <- 0 for (i in 1:10) { start <- finish + 1 finish <- finish + Ocounts[i] Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) } Bclass <- cbind(0, Bxvals) Oclass <- cbind(1, Oxvals) gendata <- data.frame(rbind(Bclass, Oclass)) names(gendata) <- c("g", "x1", "x2") # I'll obtain an estimate of the Bayes error # rate using the large generalization sample. Bpdf <-rep(0,2000) Opdf <-rep (0,2000) for (i in 1:10) { Bpdf <- Bpdf + dmvnorm(gendata[,2:3], Bmeans[i,], Sig2) Opdf <- Opdf + dmvnorm(gendata[,2:3], Omeans[i,], Sig2) } pred.bayes <- ifelse(Bpdf > Opdf, 0, 1) est.bayes.rate <- mean(pred.bayes != gendata[,1]) est.bayes.rate # This is the estimated error rate of the Bayes classifier. ##### # I'll use c-v to selct a good number of nearest neighbors # to use for "vanilla" nearest neighbors classification. cvknn <- train.kknn(as.factor(g) ~ ., trdata, kmax=49, kernel="rectangular", distance=2) plot(c(1:49),cvknn$MISCLASS,type="l",xlab='k', ylab='proportion misclassified',main='Estimated Error Rates',col=4) ##### # Looks like k = 29 is a good choice. Let's use the large # generalization sample to estimate the error rates for # future obnservations (the generalization rates). Also, # I'll add a line showing the (estimated) Bayes error rate. gen.err <- numeric(49) for (i in 1:49) { pred.gen <- knn(trdata[,-1], gendata[,-1], cl=trdata[,1], k=i) gen.err[i] <- mean(pred.gen != gendata[,1]) } points(c(1:49), gen.err, type="l", col=3) abline(est.bayes.rate, 0, col="purple") legend(32,0.32,legend=c("cross-val","generalization","Bayes"),fill=c(4,3,"purple")) # The c-v estimates of error are generally optimistic, but the # indicated choice of k=29 isn't bad since it appears that any # k between 5 and 30 will get you fairly close to the Bayes rate # (estimated to be 0.253). ##### # Now I will use the training data to fit a # least squares regression model, using the # 0/1 class labels as the response var, to # determine the predicted class for each # observation using (2.7) on p. 12 of HTF2. ols <- lm(g ~ x1 + x2, data=trdata) # Let's look at the data and the # linear decision boundary together. plot(trdata[1:100,2], trdata[1:100,3], col=4, main='1st-order OLS', xlab='x1',ylab='x2', xlim=c(-2.5,4),ylim=c(-1.2,3.5)) points(trdata[101:200,2], trdata[101:200,3], col="darkorange") abline((0.5 - ols$coef[1])/ols$coef[3], - ols$coef[2]/ols$coef[3], col=8) ##### # A linear classifier is going to have trouble # in this setting. Let's determine the generali- # zation error rate for this classifier. pred.ols <- predict(ols, newdata=gendata) ols.err <- mean( ifelse(pred.ols > 0.5, 1, 0) != gendata[,1] ) ols.err # This is the estimated 1st-order OLS classifier # error rate (on future cases) ... nearly twice # the Bayes rate, and not much better than using # a coin flip to make predictions, or simply # predicting all future cases to be blue (or all # future cases to be orange). ##### # Now let's check out classification using LDA. # Since we have equal sample sizes in the training # data, the classifier should be identical to the # one from the regression method. trlda <- lda(g ~ . , trdata) pred.trlda <- predict(trlda, newdata=gendata) lda.rate <- mean( pred.trlda$class != gendata[,1] ) lda.rate # LDA rate. # (Same as rate from regression fit.) ##### # Now QDA. trqda <- qda(g ~ . , trdata) pred.trqda <- predict(trqda, newdata=gendata) qda.rate <- mean( pred.trqda$class != gendata[,1] ) qda.rate # QDA rate. # (Better.) ##### # Now I'll try 1st-, 2nd-, and 3rd-order logisitic # regression models. logreg1 <- glm(g ~ ., data=trdata, family=binomial) summary(logreg1) pred.logreg1 <- predict(logreg1, newdata=gendata, type="response") err.logreg1 <- mean(round(pred.logreg1) != gendata[,1]) err.logreg1 # error rate for 1st-order logistic regression model # This error rate is similar to LDA error rate. Even though # coefficient for x2 isn't even significant in 1st-order model, # I'll try a full 2nd-order model. logreg2 <- glm(g ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), data=trdata, family=binomial) summary(logreg2) pred.logreg2 <- predict(logreg2, newdata=gendata, type="response") err.logreg2 <- mean(round(pred.logreg2) != gendata[,1]) err.logreg2 # error rate for 2nd-order logistic regression model # Much better. Even though x2 didn't look important # in output from 1st-order model, the 2nd-order terms # involving x2 were higly significant in 2nd-order model. # Let's try a 3rd-order model ... 200 observations ought # to be enough to fit 8 parameters decently. logreg3 <- glm(g ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) + I(x1^3) + I(x2^3) +I(x1*x2^2) + I(x2*x1^2), data=trdata, family=binomial) summary(logreg3) pred.logreg3 <- predict(logreg3, newdata=gendata, type="response") err.logreg3 <- mean(round(pred.logreg3) != gendata[,1]) err.logreg3 # error rate for 3rd-order logistic regression model # Fairly close to the estimated Bayes rate. Let's look # at AIC values and error rates together. # model AIC error # ----- ----- ----- # 1st 240 0.44 # 2nd 199 0.29 # 3rd 179 0.26 # This time we get # the smaller the AIC, the smaller the error rate. ##### # Now I want to try the K-means clustering # method from subsection 13.2.1 of HTF2. KMeansClassRandom <- function(traindata, testdata, n.comp) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). # Remaining col's of each set should be predictor var's. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] clusters <- kmeans(classdata[,-1],n.comp,iter.max=15,nstart=5) class.prototypes <- cbind(g, clusters$centers) prototypes <- rbind(prototypes, class.prototypes) } names(prototypes) <- names(traindata) predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1) test.error <- mean( predict.class != testdata[,1] ) test.error } KMeansClassRandom(trdata, gendata, 10) # error rate for 10 clusters per class KMeansClassRandom(trdata, gendata, 5) # error rate for 5 clusters per class KMeansClassRandom(trdata, gendata, 15) # error rate for 15 clusters per class # The error rate with 5 clusters per class # is great. But how would we pick a good # number of clusters per class to use if # all we have is the training data? # (Experience perhaps; or do we go to the # trouble of writing a c-v routine?) # Before moving on, let's try it with 5 # clusters per class again. (We might # have gotten real lucky with the random # numbers the first time.) KMeansClassRandom(trdata, gendata, 5) # Still pretty good, but not as close to # the Bayes rate as last time. ##### # Now I want to try the LVQ method # from subsection 13.2.2 of HTF2. LVQrandom <- function(traindata,testdata,n.pro,iterations,initial.epsilon) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). # Remaining col's of each set should be predictor var's. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] n.cases <- length(classdata[,1]) selected <- sample.int(n.cases, n.pro) class.prototypes <- classdata[selected,] prototypes <- rbind(prototypes, class.prototypes) } train.size <- length(traindata[,1]) plot(traindata[,2],traindata[,3],col=(2+traindata[,1]),pch=20, main='Drifting Prototypes',xlab='1st predictor',ylab='2nd predictor', xlim=c(min(traindata[,2]),max(traindata[,2])),ylim=c(min(traindata[,3]),max(traindata[,3]))) for (i in 1:iterations) { epsilon <- initial.epsilon/i case <- sample.int(train.size, 1) distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1]))) closest <- as.integer(which.min(distances[-1,1])) ifelse(prototypes[closest,1]==traindata[case,1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1]) points(prototypes[closest,2],prototypes[closest,3],col=(2+prototypes[closest,1])) } predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1) test.error <- mean( predict.class != testdata[,1] ) test.error } LVQrandom(trdata,gendata,10,400,0.5) ##### # Not that good. I'll change the number # of prototypes per class and the value of # initial.epsilon. LVQrandom(trdata,gendata,5,400,0.75) ##### # A lot of the performance may be due to the random selection # of the initial prototypes. I'll try using more of them in # case some of the chosen ones are bad in some sense. LVQrandom(trdata,gendata,15,400,0.75) ##### # Pretty good! I'll try it one more time, only # increasing the number of iterations to make # epsilon decrease towards 0 slower. LVQrandom(trdata,gendata,15,900,0.75) ##### # Not as good. Now I'll go back to just 400 # iterations (since maybe the quicker decrease # is better). LVQrandom(trdata,gendata,15,400,0.75) # Well, isn't that a kick in the pants! # I guess a lesson to be learned here is # that the performance of any method that # uses randomness can really depend on the # random number sequence. In practice, w/o # having a large test/generalization sample # to guide the selection of parameters, how # should we determine what parameter values # to use? (Maybe write a cross validation # routine.) # Idea for *future research*: Since bad randomly # chosen starting prototypes may be a big part of # why the LVQ method may perform poorly at times, # why not combine the K-means starting points of # the previously considered method with the move- # ment of the prototypes away from points in other # classes from this method? ##### # The future is now! Let's do it. KMeansPlusLVQ <- function(traindata,testdata,n.pro,iterations,initial.epsilon) # 1st col. of each data set should be class variable # (and classes should be labeled with consectutive # nonnegative integers). # Remaining col's of each set should be predictor var's. { first.class <- min(traindata[,1]) last.class <- max(traindata[,1]) prototypes <- NULL for (g in first.class:last.class) { cases <- traindata[,1] == g classdata <- traindata[cases,] clusters <- kmeans(classdata[,-1],n.pro,iter.max=15,nstart=5) class.prototypes <- cbind(g, clusters$centers) prototypes <- rbind(prototypes, class.prototypes) } prototypes <- data.frame(prototypes) train.size <- length(traindata[,1]) plot(traindata[,2],traindata[,3],col=(2+traindata[,1]),pch=20, main='Drifting Prototypes',xlab='1st predictor',ylab='2nd predictor', xlim=c(min(traindata[,2]),max(traindata[,2])),ylim=c(min(traindata[,3]),max(traindata[,3]))) points(prototypes[,2],prototypes[,3],col=(2+prototypes[,1]),pch=11) for (i in 1:iterations) { epsilon <- initial.epsilon/i case <- sample.int(train.size, 1) distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1]))) closest <- as.integer(which.min(distances[-1,1])) ifelse(prototypes[closest,1]==traindata[case,1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1], prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1]) points(prototypes[closest,2],prototypes[closest,3],col=(2+prototypes[closest,1])) } predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1) test.error <- mean( predict.class != testdata[,1] ) test.error } KMeansPlusLVQ(trdata,gendata,10,400,0.75) # Does anyone know what the warning message means, # and if it indicates a real problem? ########## End ##########