# This example deals with the classification setting
# described on p. 17 of HTF, and considered in various
# parts of HTF.
# (I would like to acknowledge the assistance of Li Li.
# Most of what I have below was developed with great help
# from her as I learned how to use R.)
#
# First I'll generate the data as described on p. 17 of HTF.
# This will be done in a number of steps.
#
# Step 1: For each class, generate 10 observations from
# the proper bivariate normal distribution to serve as
# means for the components of the mixture distribution.
#
# I will make use of some functions from various libraries,
# which may need to be installed if you haven't used them
# yet.  Once installed, they do not need to be installed
# again (unless something goes wrong with your copy of R),
# but one does need to load the packages each session.
library(MASS)
library(stats)
library(mvtnorm)      # for generating multivariate normal values
library(multinomRob)  # for generating multinomial dist'n values
library(class)        # for nearest neighbors methods (knn)
library(kknn)         # for knn with easy cross-validation ability, and weights
library(klaR)         # for naive Bayes classification
library(e1071)        # for support vector machines
library(boost)        # for adaboost
#
# I'll set the seed for the random number generator.
set.seed(3)
#
gmeans <- rmvnorm(10, mean=c(1,0), sigma=diag(1,2))  # g is used for the green group
rmeans <- rmvnorm(10, mean=c(0,1), sigma=diag(1,2))  # r is used for the red group
#
# I'll plot the means.
plot(gmeans[,1], gmeans[,2], col=3, main='component means',
     xlab='x1', ylab='x2', xlim=c(-3,4), ylim=c(-3,4))
points(rmeans[,1], rmeans[,2], col=2)
#
# Step 2: For each class, generate 10 counts which sum to 100,
# using a multinomial (100, 0.1, 0.1, ..., 0.1) distribution.
gcounts <- rmultinomial(100, rep(0.1,10))
rcounts <- rmultinomial(100, rep(0.1,10))
#
# Step 3: Using the component means generated in step 1,
# and the counts generated in step 2 as sample sizes (for
# the various components of the mixture distributions),
# generate 100 training sample observations for each class.
gclass <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + gcounts[i]
  gclass[start:finish,] <- rmvnorm(gcounts[i], mean=gmeans[i,], sigma=diag(0.2,2))
}
rclass <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + rcounts[i]
  rclass[start:finish,] <- rmvnorm(rcounts[i], mean=rmeans[i,], sigma=diag(0.2,2))
}
#
# Step 4: Give the response value for the green class as 0
# and for the red class as 1, then combine them to form
# the training data.
gclass <- cbind(gclass, 0)
rclass <- cbind(rclass, 1)
trdata <- rbind(gclass, rclass)
trdata <- data.frame(trdata)
names(trdata) <- c("x1", "x2", "y")
plot(trdata[,1], trdata[,2], col=3-trdata[,3], main='training data',
     xlab='x1', ylab='x2')
# This completes the generation of the training data.
#
# Now I will estimate the Bayes error rate using the
# training sample.  We can see how well the optimal Bayes
# classifier could classify the training sample observations
# without using the known class information.  This is done
# by comparing the "height" of the green dist'n pdf to the
# "height" of the red dist'n pdf at each (x1,x2) for the
# cases of the training sample, classifying a case as green
# if the green pdf value is the greater of the two, and
# classifying it as red otherwise.  Then this predicted
# class is compared to the known class.  The proportion of
# the 200 training sample cases for which the predicted
# class differs from the known class is the observed error
# rate for the Bayes classifier.
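#
# (A quick optional sketch: the Bayes decision boundary itself can be
# drawn by evaluating the two mixture densities on a grid and plotting
# the contour where they are equal.  The grid limits and grid size
# used here are arbitrary choices.)
xgrid <- seq(-3, 4, length=200)
ygrid <- seq(-3, 4, length=200)
bgrid <- as.matrix(expand.grid(x1=xgrid, x2=ygrid))
ggrid <- rep(0, nrow(bgrid))
rgrid <- rep(0, nrow(bgrid))
for (i in 1:10) {
  ggrid <- ggrid + dmvnorm(bgrid, gmeans[i,], diag(0.2,2))
  rgrid <- rgrid + dmvnorm(bgrid, rmeans[i,], diag(0.2,2))
}
plot(trdata[,1], trdata[,2], col=3-trdata[,3],
     main='Bayes decision boundary', xlab='x1', ylab='x2')
contour(xgrid, ygrid, matrix(ggrid - rgrid, 200, 200), levels=0,
        drawlabels=FALSE, col=8, add=TRUE)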
gpdf <- rep(0, 200)
rpdf <- rep(0, 200)
for (i in 1:10) {
  gpdf <- gpdf + dmvnorm(trdata[,1:2], gmeans[i,], diag(0.2,2))
  rpdf <- rpdf + dmvnorm(trdata[,1:2], rmeans[i,], diag(0.2,2))
}
pred.bayes.class <- ifelse(gpdf > rpdf, 0, 1)
est.bayes.rate <- mean(pred.bayes.class != trdata[,3])
est.bayes.rate
# This is the error rate of the Bayes classifier
# on the training data.
#
# Now I will use the training data to fit a
# least squares regression model, using the
# 0/1 class labels as the response variable, to
# determine the predicted class for each
# observation using (2.7) on p. 13 of HTF.
attach(trdata)
ols1 <- lm(y ~ x1 + x2)
pred.ols1 <- predict(ols1, newdata=trdata)
tr.err.ols1 <- mean( ifelse(pred.ols1 > 0.5, 1, 0) != trdata[,3] )
tr.err.ols1
# This is the 1st-order OLS classifier error rate.
plot(x1, x2, col=3-y, main='1st-order OLS', xlab='x1', ylab='x2')
abline((0.5 - ols1$coef[1])/ols1$coef[3], -ols1$coef[2]/ols1$coef[3], col=8)
#
# Now I'll try a 2nd-order regression fit.
trdata2 <- cbind(trdata[,1], trdata[,2], trdata[,1]^2, trdata[,2]^2,
                 trdata[,1]*trdata[,2], trdata[,3])
trdata2 <- data.frame(trdata2)
names(trdata2) <- c("x1", "x2", "x1^2", "x2^2", "x1*x2", "y")
ols2 <- lm(y ~ ., data=trdata2)
pred.ols2 <- ifelse(predict(ols2, newdata=trdata2) <= 0.5, 0, 1)
tr.err.ols2 <- mean( pred.ols2 != trdata2[,6] )
tr.err.ols2
# This is the 2nd-order OLS classifier error rate.
#
# Now I'll use k-nearest-neighbors classification to get
# class predictions for each case in the training sample.
# (Note: Typically one would standardize the predictors first,
# by dividing by the sample standard deviation, but with this
# data I don't think that it would make a lot of difference.)
# (Note: The class label should not be included in the
# variables used by the knn function to determine distances.)
#
# I'll use 1, 3, 5, ..., 59 as the values for k.
I <- 30
tr.err.knn <- numeric(I)
for (i in 1:I) {
  pred.knn <- knn(trdata[,-3], trdata[,-3], cl=trdata[,3], k=(2*i - 1))
  tr.err.knn[i] <- mean( (as.numeric(pred.knn) - 1) != trdata[,3] )
}
plot(2*seq(1:I)-1, tr.err.knn, type="l", xlab='k',
     ylab='proportion misclassified', main='k-nearest-neighbors',
     col=3, ylim=c(0,0.4))
# It should be noted that, as is usually the case with
# classifiers, using the same data both to classify points
# and to estimate the accuracy of the classifier can lead to
# overoptimistic estimates of accuracy.  This is particularly
# evident for the case of k = 1.
#
# Next I'll use two-dimensional kernel density estimates
# to classify by directly approximating the Bayes classifier.
# (Note: The kde2d function used below will only work
# if there are two predictors.  If there is just one
# predictor, then R's density function can be used to
# obtain a density estimate.  Unfortunately, there
# doesn't seem to be a function which will do kernel
# density estimation for dimensions greater than 2
# (although I suppose that in a lot of cases the method
# doesn't work well in higher dimensions compared to
# other methods).  Let me know if you know of some function
# that will do density estimation in dimensions greater than 2.)
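# (One possible answer: the kde() function in the ks package is
# reported to handle multivariate kernel density estimation in more
# than two dimensions.  A rough sketch of its use, assuming the
# package is installed --- it isn't needed for anything below, and is
# shown with the two predictors at hand, but the same call is said to
# work when x has more columns.)
library(ks)
H <- Hpi(as.matrix(trdata[1:100, 1:2]))   # plug-in bandwidth matrix
gden.ks <- kde(as.matrix(trdata[1:100, 1:2]), H=H,
               eval.points=as.matrix(trdata[, 1:2]))
gden.ks$estimate   # estimated green-class density at the 200 training points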
x1limL <- min(trdata[,1]) - 2
x1limU <- max(trdata[,1]) + 2
x2limL <- min(trdata[,2]) - 2
x2limU <- max(trdata[,2]) + 2
gden <- kde2d(trdata[1:100,1], trdata[1:100,2], n=300,
              lims=c(x1limL, x1limU, x2limL, x2limU))
image(gden)
# This is a representation of the estimated density for the
# green class (a kernel density estimate of its normal mixture pdf).
#
# Here's the red pdf.
rden <- kde2d(trdata[101:200,1], trdata[101:200,2], n=300,
              lims=c(x1limL, x1limU, x2limL, x2limU))
image(rden)
#
denrat <- rden$z/gden$z
contour(gden$x, gden$y, denrat, levels=c(0.5, 1, 2))
# The contour labeled with 1 is the decision boundary.
# Outside of the wavy band, one estimated density is
# more than twice as high as the other one.
#
pred.kern.den <- ifelse(denrat < 1, 0, 1)
image(gden$x, gden$y, pred.kern.den)
# Another representation of the decision boundary
# determined from the training data.
#
x1index <- round(1 + 299*(trdata[,1] - min(gden$x))/(max(gden$x) - min(gden$x)))
x2index <- round(1 + 299*(trdata[,2] - min(gden$y))/(max(gden$y) - min(gden$y)))
pred.kern.den <- rep(0, 200)
for (i in 1:200) pred.kern.den[i] <- denrat[x1index[i], x2index[i]]
tr.err.kern.den <- mean( ifelse(pred.kern.den > 1, 1, 0) != trdata[,3] )
tr.err.kern.den
# The training error for the kernel density estimation
# method is quite small.
#
# Now I will use naive Bayes classifiers.  Although the
# method is based on an assumption of independence, which
# may be grossly violated, a nice thing is that this
# method can be done using R for dimensions greater than 2.
#
# R's NaiveBayes function apparently needs to be given
# data with the class variable being a "factor" with
# "levels" specified in order for the predict function
# to work properly on the object.  Therefore, I create
# a new data frame to use at this point.  (Note: wrapping
# the columns in cbind would coerce the factor back to
# numbers, so the columns are passed to data.frame directly.)
alttrdata <- data.frame(trdata[,1], trdata[,2], factor(trdata[,3]))
names(alttrdata) <- c("x1", "x2", "y")
levels(alttrdata$y) <- c("1", "2")
#
# I'll start with a version of the method that assumes
# the marginal dist'ns are normal, so that instead of
# using a nonparametric density estimator, a parametric
# model is used, and the data are used to estimate the
# unknown parameters.
nbnorm <- NaiveBayes(y ~ ., data=alttrdata)
pred.nbnorm <- predict(nbnorm, newdata=alttrdata)
tr.err.nbnorm <- mean( pred.nbnorm$class != alttrdata[,3] )
tr.err.nbnorm
#
# Next, I will use what I consider to be the standard
# way of doing naive Bayes classification, with a kernel
# density estimator being used to obtain a nonparametric
# density estimate for each predictor.
nbkern <- NaiveBayes(y ~ ., data=alttrdata, usekernel=TRUE)
pred.nbkern <- predict(nbkern, newdata=alttrdata)
tr.err.nbkern <- mean( pred.nbkern$class != alttrdata[,3] )
tr.err.nbkern
#
### Beginning of 1st part of support vector machine stuff
### added 7/17.
# Now I will try some support vector machines.
# The default kernel is the radial kernel.
xxx <- subset(alttrdata, select = -y)
yyy <- as.character(alttrdata$y)
# Note: I had a hard time getting this to work so
# that I could get class predictions.  Just making
# the response a factor didn't work (the predictions
# were numerical), but it worked when I made the
# response a character variable that was then clearly
# recognized to be a categorical variable.
svmrad1 <- svm(xxx, yyy, type="C-classification", cross=5)
# (Note: The c-v isn't used to fit the model;
# it's just used to estimate the accuracy of
# the fitted model.  It would be nice if c-v
# were used to determine a good value of the
# gamma argument of the svm function (which
# I suppose is a tuning parameter).  Because
# this is not the case, it might be good to
# try several different values of gamma and
# pick the one which has the smallest estimated
# error rate (using the c-v estimates).)
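# (A sketch of one way to let cross-validation choose gamma
# automatically, using the tune.svm function from e1071; by default
# it uses 10-fold c-v.  The gamma grid below is an arbitrary choice.)
svmtune <- tune.svm(xxx, as.factor(yyy), gamma=c(0.3, 0.4, 0.5, 0.6, 0.7))
summary(svmtune)
svmtune$best.parameters   # value of gamma with the smallest c-v error estimate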
summary(svmrad1)
# The c-v estimate of the error rate is 0.11 (0.12).
# (NOTE: Rather oddly, I didn't get the same result
# when I ran it two different times, using the same
# initial seed.  It seems that the c-v routine used
# by the svm function doesn't maintain the random
# number sequence in the usual way.  Below, I give
# two numbers for each estimate --- these come from
# two different runs (starting w/ the same seed).  It
# should also be noted that unless you start at the
# beginning of all of the stuff, and don't skip anything,
# your results may be rather different.)
# This results from using the default value of
# gamma, which is 1/(# of predictors) = 0.5.
# I'll try using a value a tad larger, 0.6, and
# a value a tad smaller, 0.4, and see which one
# results in the smallest c-v error rate.
svmrad2 <- svm(xxx, yyy, type="C-classification", cross=5, gamma=0.6)
summary(svmrad2)
svmrad3 <- svm(xxx, yyy, type="C-classification", cross=5, gamma=0.4)
summary(svmrad3)
# The estimated error rates are 0.115 / 0.12 (for 0.4),
# 0.11 / 0.12 (for 0.5), and 0.125 / 0.12 (for 0.6).  Because
# these are estimates, based on a training set of
# only 200 points (100 of each class), the small
# differences shouldn't be taken too seriously ---
# given a different random number seed, the
# results could be different.
# I'll try two more values, and see what I get.
svmrad4 <- svm(xxx, yyy, type="C-classification", cross=5, gamma=0.7)
summary(svmrad4)
svmrad5 <- svm(xxx, yyy, type="C-classification", cross=5, gamma=0.3)
summary(svmrad5)
# The est. error rates are 0.125 / 0.12 (for 0.7)
# and 0.12 / 0.115 (for 0.3).
#
# Now I'll try a polynomial kernel, using the default
# degree of 3.
svmpoly1 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", cross=5)
summary(svmpoly1)
# The estimated error rate is 0.14 / 0.145.
# Rather than alter gamma at this point, I'll try
# using degree = 2 and degree = 4.
svmpoly2 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", degree=2, cross=5)
summary(svmpoly2)
svmpoly3 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", degree=4, cross=5)
summary(svmpoly3)
# The number of support vectors went way up,
# compared with the other runs.  Also, the
# est. error rates are horrible.  So I'll go
# back to using degree = 3 and alter gamma.
# (Note: I think that it is very strange that
# changing the degree from 3 to 2 and 4 led to
# such bad performance.)
svmpoly4 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", gamma=0.6, cross=5)
summary(svmpoly4)
svmpoly5 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", gamma=0.4, cross=5)
summary(svmpoly5)
# I'll try two more values.
svmpoly6 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", gamma=0.7, cross=5)
summary(svmpoly6)
svmpoly7 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", gamma=0.3, cross=5)
summary(svmpoly7)
# The radial kernel seems better (in this case)
# than the polynomial kernels.
### END of 1st part of svm stuff added 7/17.
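#
# (A quick sketch of the decision regions implied by the first
# radial-kernel fit, in the same spirit as the earlier image plots;
# the grid used here is an arbitrary choice.)
svmgrid <- expand.grid(x1=seq(x1limL, x1limU, length=200),
                       x2=seq(x2limL, x2limU, length=200))
svmgridpred <- as.numeric(predict(svmrad1, svmgrid))
image(seq(x1limL, x1limU, length=200), seq(x2limL, x2limU, length=200),
      matrix(svmgridpred, 200, 200))
points(trdata[,1], trdata[,2], col=3-trdata[,3])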
#
# Now I'll generate a large sample (5000 cases from
# each class) with which to estimate the generalization
# error rates of the various classifiers.
gcounts <- rmultinomial(5000, rep(0.1, 10))
rcounts <- rmultinomial(5000, rep(0.1, 10))
gclass <- matrix(0, 5000, 2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + gcounts[i]
  gclass[start:finish,] <- rmvnorm(gcounts[i], mean=gmeans[i,], sigma=diag(0.2,2))
}
rclass <- matrix(0, 5000, 2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + rcounts[i]
  rclass[start:finish,] <- rmvnorm(rcounts[i], mean=rmeans[i,], sigma=diag(0.2,2))
}
gclass <- cbind(gclass, 0)
rclass <- cbind(rclass, 1)
gendata <- rbind(gclass, rclass)
gendata <- data.frame(gendata)
names(gendata) <- names(trdata)
#
# I'll use this large data set to get a better
# estimate of the Bayes error rate.
gpdf <- rep(0, 10000)
rpdf <- rep(0, 10000)
for (i in 1:10) {
  gpdf <- gpdf + dmvnorm(gendata[,1:2], gmeans[i,], diag(0.2,2))
  rpdf <- rpdf + dmvnorm(gendata[,1:2], rmeans[i,], diag(0.2,2))
}
pred.bayes.class <- ifelse(gpdf > rpdf, 0, 1)
est.bayes.rate <- mean(pred.bayes.class != gendata[,3])
est.bayes.rate
# This is a good estimate of the error rate
# of the Bayes classifier.
# (It's a Monte Carlo estimate.)
#
# Since the optimal Bayes classifier has a larger
# error rate than those estimated from the training
# data for the other classifiers, it appears that
# the estimates obtained using the training data
# were indeed optimistic.
#
# Now I'll use the new large sample to estimate
# the generalization error rates of the classifiers
# that were constructed using the training data.
#
# First I'll consider the rule created from the
# 1st-order OLS fit.
pred.ols1 <- predict(ols1, newdata=gendata)
gen.err.ols1 <- mean( ifelse(pred.ols1 <= 0.5, 0, 1) != gendata[,3] )
gen.err.ols1   # estimate of gen. error rate --- 1st-order OLS
#
# Next I'll consider the rule created from the
# 2nd-order OLS fit.
gendata2 <- cbind(gendata[,1], gendata[,2], gendata[,1]^2, gendata[,2]^2,
                  gendata[,1]*gendata[,2], gendata[,3])
gendata2 <- data.frame(gendata2)
names(gendata2) <- names(trdata2)
pred.ols2 <- ifelse(predict(ols2, newdata=gendata2) <= 0.5, 0, 1)
gen.err.ols2 <- mean( pred.ols2 != gendata2[,6] )
gen.err.ols2   # estimate of gen. error rate --- 2nd-order OLS
#
# Next let's consider the straightforward density estimation
# method (which directly approximates the Bayes classifier) ---
# I'll estimate the gen. error for the 2-D kernel density
# estimation method.
x1index <- round(1 + 299*(gendata[,1] - min(gden$x))/(max(gden$x) - min(gden$x)))
x2index <- round(1 + 299*(gendata[,2] - min(gden$y))/(max(gden$y) - min(gden$y)))
pred.kern.den <- rep(0, 10000)
for (i in 1:10000) pred.kern.den[i] <- denrat[x1index[i], x2index[i]]
gen.err.kern.den <- mean( ifelse(pred.kern.den > 1, 1, 0) != gendata[,3] )
gen.err.kern.den
# It did better than the regression models.
#
# Now I will get the estimates of the gen. error
# for the two naive Bayes classifiers.
altgendata <- data.frame(gendata[,1], gendata[,2], factor(gendata[,3]))
names(altgendata) <- c("x1", "x2", "y")
levels(altgendata$y) <- c("1", "2")
#
# First the parametric normal model version.
pred.nbnorm <- predict(nbnorm, newdata=altgendata)
gen.err.nbnorm <- mean( pred.nbnorm$class != altgendata[,3] )
gen.err.nbnorm
#
# Then the nonparametric version.
pred.nbkern <- predict(nbkern, newdata=altgendata)
gen.err.nbkern <- mean( pred.nbkern$class != altgendata[,3] )
gen.err.nbkern
# The naive Bayes methods didn't work as well as
# the kernel density estimation method.
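#
# (A small side calculation: each of these generalization error
# estimates is a proportion based on 10,000 cases, so its approximate
# standard error is sqrt(p*(1-p)/10000) --- roughly 0.003 for error
# rates near 0.10 to 0.15.  For example:)
sqrt(gen.err.kern.den*(1 - gen.err.kern.den)/10000)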
#
# Now I'll check the k-nearest-neighbors classifiers.
gen.err.knn <- numeric(I)
for (i in 1:I) {
  pred.knn <- knn(trdata[,-3], gendata[,-3], cl=trdata[,3], k=(2*i - 1))
  gen.err.knn[i] <- mean( (as.numeric(pred.knn) - 1) != gendata[,3] )
}
plot(2*seq(1:I)-1, tr.err.knn, type="l", xlab='k',
     ylab='proportion misclassified', main='k-nearest-neighbors',
     col=3, ylim=c(0,0.4))
points(2*seq(1:I)-1, gen.err.knn, type="l", col=2)
abline(h=est.bayes.rate, col=8)
legend(5, 0.35, legend=c("training", "generalization", "c-v", "Bayes"),
       fill=c(3,2,4,8))
#
# If you change the seed for the random number
# generator to 632, then you will observe very
# different behavior --- the regression methods
# are no longer close in performance to the
# best nearest-neighbors classifiers.
# (But if you change the seed to any number
# other than 3, my comments may no longer be
# sensible.)
#
# Now I want to explore k-nearest-neighbors
# classification a bit more.
#
# It can be seen that the error rates estimated
# from the training sample can be very misleading ---
# for one thing, k = 1 produced 0 misclassifications.
#
# Using the huge "generalization set" one can
# determine which value of k worked best, and
# can get a good estimate of the generalization
# error rate for this value of k.
bestk <- 2*which.min(gen.err.knn) - 1
bestk   # best value of k
gen.err.knn[which.min(gen.err.knn)]   # estimate of gen. error rate --- best knn
#
# Of course, in practice we don't always have a huge
# data set to help us select the best value of k to
# use as was done above (and if we did have a lot of
# data like that, it would be better to use a large
# portion of it in the training sample).
#
# In practice, say if we only had the training set of
# 200 cases to work with, we could use cross-validation
# to determine a good value of k to use when classifying
# cases with unknown class.  Although for some purposes
# using n-fold c-v may take an excessively long time
# and may be worse than using a smaller number of folds,
# with k-nearest-neighbors n-fold c-v is simple and
# relatively quick.  (It's simple because with n-fold
# c-v, to get the predicted class of a case, the k nearest
# _other_ cases are the neighbors used ... so the k+1
# nearest neighbors can be determined in the usual way
# and then the nearest neighbor (the case itself) is
# ignored.  The train.kknn function, as used below, will
# do this.)
cvknn <- train.kknn(as.factor(y) ~ ., trdata, kmax=59,
                    kernel="rectangular", distance=2)
# (Note: If the response is coded numerically, it must be
# converted to make it be recognized as being nominal
# by the kknn function.)
cv.err.knn <- numeric(I)
for (i in 1:I) cv.err.knn[i] <- cvknn$MISCLASS[2*i-1]
points(2*seq(1:I)-1, cv.err.knn, type="l", col=4)
bestk <- 2*which.min(cv.err.knn) - 1
bestk   # best value of k
cv.err.knn[which.min(cv.err.knn)]   # c-v estimate of gen. error rate --- best knn
#
# It can be seen that while the actual c-v estimates
# aren't very close to the true generalization error
# rates (assuming that the estimates of the generalization
# error rates from the huge sample are good), the
# shape of the c-v error rate curve is not misleading as
# to which values of k are good, and that's the important
# thing!
#
gen.err.knn[which.min(cv.err.knn)]
# Here is a better estimate of the gen. err. for the
# knn classifier selected using cross-validation.
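#
# (The same leave-one-out idea can also be checked directly with the
# knn.cv function from the class package --- a quick sketch, shown
# here just for the value of k chosen by c-v above.)
pred.loo <- knn.cv(trdata[,-3], cl=trdata[,3], k=bestk)
mean( (as.numeric(pred.loo) - 1) != trdata[,3] )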
#
# One can also use a weighted nearest neighbors method.
# (This will be covered in the July 7 class meeting.)
# I'll start by using the Epanechnikov kernel (with the
# default value of the smoothing parameter).
cvwknn <- train.kknn(as.factor(y) ~ ., trdata, kmax=59,
                     kernel="epanechnikov", distance=2)
cv.err.wknn <- numeric(I)
for (i in 1:I) cv.err.wknn[i] <- cvwknn$MISCLASS[2*i-1]
plot(2*seq(1:I)-1, cv.err.wknn, type="l", xlab='k',
     ylab='proportion misclassified', main='wt k-nearest-neighbors',
     ylim=c(0,0.4), col=4)
bestk <- 2*which.min(cv.err.wknn) - 1
bestk   # best value of k
cv.err.wknn[which.min(cv.err.wknn)]   # c-v estimate of gen. error rate --- best weighted (Epanechnikov) knn
#
gen.err.wknn <- numeric(I-1)
for (i in 1:(I-1)) {
  currentk <- 2*i + 1
  pred.wknn <- kknn(y ~ ., trdata, gendata, k=currentk, distance=2,
                    kernel="epanechnikov")
  gen.err.wknn[i] <- mean( round(pred.wknn$fitted.values) != gendata[,3] )
}
# (Note: kknn is somewhat screwy, although in the end
# I think that I got it working the way I wanted it
# to work.  Whether the response variable was of
# type factor or numeric, the predicted values were
# given as noninteger numbers.  So what worked was
# to let y be numeric and round the fitted values
# to make them integers.)
points(2*seq(1:(I-1))+1, gen.err.wknn, type="l", col=2)
abline(h=est.bayes.rate, col=8)
legend(5, 0.35, legend=c("generalization", "c-v", "Bayes"), fill=c(2,4,8))
#
# Here is the est. gen. error for the value
# of k chosen by cross-validation.
gen.err.wknn[which.min(cv.err.wknn) - 1]
# estimate of gen. error rate --- best weighted (Epanechnikov) knn, as chosen by c-v
#
# Here is the est. gen. error for the actual
# best k for weighted knn.
gen.err.wknn[which.min(gen.err.wknn)]
#
### The rest of the stuff added 7/17 is below.
# Now I'll get the generalization error estimates
# for the various svm fits.
gxxx <- subset(altgendata, select = -y)
gyyy <- as.character(altgendata$y)
gen.err.svm.r1 <- mean( gyyy != predict(svmrad1, gxxx) )
gen.err.svm.r2 <- mean( gyyy != predict(svmrad2, gxxx) )
gen.err.svm.r3 <- mean( gyyy != predict(svmrad3, gxxx) )
gen.err.svm.r4 <- mean( gyyy != predict(svmrad4, gxxx) )
gen.err.svm.r5 <- mean( gyyy != predict(svmrad5, gxxx) )
gen.err.svm.p1 <- mean( gyyy != predict(svmpoly1, gxxx) )
gen.err.svm.p2 <- mean( gyyy != predict(svmpoly2, gxxx) )
gen.err.svm.p3 <- mean( gyyy != predict(svmpoly3, gxxx) )
gen.err.svm.p4 <- mean( gyyy != predict(svmpoly4, gxxx) )
gen.err.svm.p5 <- mean( gyyy != predict(svmpoly5, gxxx) )
gen.err.svm.p6 <- mean( gyyy != predict(svmpoly6, gxxx) )
gen.err.svm.p7 <- mean( gyyy != predict(svmpoly7, gxxx) )
gen.err.svm.r1
gen.err.svm.r2
gen.err.svm.r3
gen.err.svm.r4
gen.err.svm.r5
gen.err.svm.p1
gen.err.svm.p2
gen.err.svm.p3
gen.err.svm.p4
gen.err.svm.p5
gen.err.svm.p6
gen.err.svm.p7
# Three of these error rates are lower than the error rates
# from *all* of the other methods.
#
###### boosting part added Thursday, 7/21
# (Note: The adaboost function is sensitive to how the
# data is given --- trying to just use columns from a
# data frame can cause problems, so I go through some
# manipulations to put the x values into a plain
# matrix.  Also note that the y values must be a vector
# of 0 and 1 values.  (Type ?adaboost to get more info.))
tx <- matrix(unlist(trdata[,-3]), nrow=nrow(trdata))
ty <- as.numeric(trdata[,3])
gx <- matrix(unlist(gendata[,-3]), nrow=nrow(gendata))
gy <- as.numeric(gendata[,3])
boostpred <- adaboost(tx, ty, gx)
summarize(boostpred, gy)
# These results don't seem reasonable --- in other examples
# I tried, the error rate went down ... it was not constant.
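#
# (As a rough cross-check on the adaboost results above, here is a
# sketch using boosted trees with the exponential (AdaBoost) loss via
# the gbm package --- this assumes that package is installed, and the
# number of trees, tree depth, and shrinkage used here are arbitrary
# choices, not tuned values.)
library(gbm)
gbmfit <- gbm(y ~ x1 + x2, data=trdata, distribution="adaboost",
              n.trees=500, interaction.depth=2, shrinkage=0.05)
# predict gives values on the link scale; positive values correspond
# to class 1 (red), so threshold at 0.
pred.gbm <- ifelse(predict(gbmfit, newdata=gendata, n.trees=500) > 0, 1, 0)
mean( pred.gbm != gendata[,3] )   # estimated generalization error rate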