# This example deals with the classification setting
# described on p. 17 of HTF, and considered in various
# parts of HTF.
# (I would like to acknowledge the assistance of Li Li.
# Most of what I have below was developed with great help
# from her as I learned how to use R.)
#
# First I'll generate the data as described on p. 17 of HTF.
# This will be done in a number of steps.
#
# Step 1: For each class, generate 10 observations from
# the proper bivariate normal distribution to serve as
# means for the components of the mixture distribution.
#
# I will make use of some functions from various libraries,
# which may need to be installed if you haven't used them
# yet.  Once installed, they do not need to be installed
# again (unless something goes wrong with your copy of R),
# but one does need to load the packages each session.
library(MASS)
library(stats)
library(mvtnorm)      # for generating multivariate normal values
library(multinomRob)  # for generating multinomial dist'n values
library(class)        # for nearest neighbors methods (knn)
library(kknn)         # for knn with easy cross-validation ability, and weights
library(klaR)         # for naive Bayes classification
library(e1071)        # for support vector machines
library(boost)        # for adaboost
#
# I'll set the seed for the random number generator.
set.seed(3)
#
gmeans <- rmvnorm(10, mean=c(1,0), sigma=diag(1,2))  # g is used for the green group
rmeans <- rmvnorm(10, mean=c(0,1), sigma=diag(1,2))  # r is used for the red group
#
# I'll plot the means.
plot(gmeans[,1], gmeans[,2], col=3, main='component means',
     xlab='x1', ylab='x2', xlim=c(-3,4), ylim=c(-3,4))
points(rmeans[,1], rmeans[,2], col=2)
#
# Step 2: For each class, generate 10 counts which sum to 100,
# using a multinomial (100, 0.1, 0.1, ..., 0.1) distribution.
gcounts <- rmultinomial(100, rep(0.1,10))
rcounts <- rmultinomial(100, rep(0.1,10))
#
# Step 3: Using the component means generated in step 1,
# and the counts generated in step 2 as sample sizes (for
# the various components of the mixture distributions),
# generate 100 training sample observations for each class.
gclass <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + gcounts[i]
  gclass[start:finish,] <- rmvnorm(gcounts[i], mean=gmeans[i,], sigma=diag(0.2,2))
}
rclass <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + rcounts[i]
  rclass[start:finish,] <- rmvnorm(rcounts[i], mean=rmeans[i,], sigma=diag(0.2,2))
}
#
# Step 4: Give the response value for the green class as 0
# and for the red class as 1, then combine them to form
# the training data.
gclass <- cbind(gclass, 0)
rclass <- cbind(rclass, 1)
trdata <- rbind(gclass, rclass)
trdata <- data.frame(trdata)
names(trdata) <- c("x1", "x2", "y")
plot(trdata[,1], trdata[,2], col=3-trdata[,3], main='training data',
     xlab='x1', ylab='x2')
# This completes the generation of the training data.
#
# Now I will estimate the Bayes error rate using the
# training sample.  We can see how well the optimal Bayes
# classifier could classify the training sample observations
# without using the known class information.  This is done
# by comparing the "height" of the green dist'n pdf to the
# "height" of the red dist'n pdf at each (x1,x2) for the
# cases of the training sample, classifying a case as green
# if the green pdf value is the greater of the two, and
# classifying it as red otherwise.  Then this predicted
# class is compared to the known class.  The proportion of
# the 200 training sample cases for which the predicted
# class differs from the known class is the observed error
# rate for the Bayes classifier.
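#
# (A quick optional sketch: the Bayes decision boundary itself can be
# drawn by evaluating the two mixture densities on a grid and plotting
# the contour where they are equal.  The grid limits and grid size
# used here are arbitrary choices.)
xgrid <- seq(-3, 4, length=200)
ygrid <- seq(-3, 4, length=200)
bgrid <- as.matrix(expand.grid(x1=xgrid, x2=ygrid))
ggrid <- rep(0, nrow(bgrid))
rgrid <- rep(0, nrow(bgrid))
for (i in 1:10) {
  ggrid <- ggrid + dmvnorm(bgrid, gmeans[i,], diag(0.2,2))
  rgrid <- rgrid + dmvnorm(bgrid, rmeans[i,], diag(0.2,2))
}
plot(trdata[,1], trdata[,2], col=3-trdata[,3],
     main='Bayes decision boundary', xlab='x1', ylab='x2')
contour(xgrid, ygrid, matrix(ggrid - rgrid, 200, 200), levels=0,
        drawlabels=FALSE, col=8, add=TRUE)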
gpdf <- rep(0, 200)
rpdf <- rep(0, 200)
for (i in 1:10) {
  gpdf <- gpdf + dmvnorm(trdata[,1:2], gmeans[i,], diag(0.2,2))
  rpdf <- rpdf + dmvnorm(trdata[,1:2], rmeans[i,], diag(0.2,2))
}
pred.bayes.class <- ifelse(gpdf > rpdf, 0, 1)
est.bayes.rate <- mean(pred.bayes.class != trdata[,3])
est.bayes.rate
# This is the error rate of the Bayes classifier
# on the training data.
#
# Now I will use the training data to fit a
# least squares regression model, using the
# 0/1 class labels as the response variable, to
# determine the predicted class for each
# observation using (2.7) on p. 13 of HTF.
attach(trdata)
ols1 <- lm(y ~ x1 + x2)
pred.ols1 <- predict(ols1, newdata=trdata)
tr.err.ols1 <- mean( ifelse(pred.ols1 > 0.5, 1, 0) != trdata[,3] )
tr.err.ols1
# This is the 1st-order OLS classifier error rate.
plot(x1, x2, col=3-y, main='1st-order OLS', xlab='x1', ylab='x2')
abline((0.5 - ols1$coef[1])/ols1$coef[3], -ols1$coef[2]/ols1$coef[3], col=8)
#
# Now I'll try a 2nd-order regression fit.
trdata2 <- cbind(trdata[,1], trdata[,2], trdata[,1]^2, trdata[,2]^2,
                 trdata[,1]*trdata[,2], trdata[,3])
trdata2 <- data.frame(trdata2)
names(trdata2) <- c("x1", "x2", "x1^2", "x2^2", "x1*x2", "y")
ols2 <- lm(y ~ ., data=trdata2)
pred.ols2 <- ifelse(predict(ols2, newdata=trdata2) <= 0.5, 0, 1)
tr.err.ols2 <- mean( pred.ols2 != trdata2[,6] )
tr.err.ols2
# This is the 2nd-order OLS classifier error rate.
#
# Now I'll use k-nearest-neighbors classification to get
# class predictions for each case in the training sample.
# (Note: Typically one would standardize the predictors first,
# by dividing by the sample standard deviation, but with this
# data I don't think that it would make a lot of difference.)
# (Note: The class label should not be included in the
# variables used by the knn function to determine distances.)
#
# I'll use 1, 3, 5, ..., 59 as the values for k.
I <- 30
tr.err.knn <- numeric(I)
for (i in 1:I) {
  pred.knn <- knn(trdata[,-3], trdata[,-3], cl=trdata[,3], k=(2*i - 1))
  tr.err.knn[i] <- mean( (as.numeric(pred.knn) - 1) != trdata[,3] )
}
plot(2*seq(1:I)-1, tr.err.knn, type="l", xlab='k',
     ylab='proportion misclassified', main='k-nearest-neighbors',
     col=3, ylim=c(0,0.4))
# It should be noted that, as is usually the case with
# classifiers, using the same data both to classify points
# and to estimate the accuracy of the classifier can lead to
# overoptimistic estimates of accuracy.  This is particularly
# evident for the case of k = 1.
#
# Next I'll use two-dimensional kernel density estimates
# to classify by directly approximating the Bayes classifier.
# (Note: The kde2d function used below will only work
# if there are two predictors.  If there is just one
# predictor, then R's density function can be used to
# obtain a density estimate.  Unfortunately, there
# doesn't seem to be a function which will do kernel
# density estimation for dimensions greater than 2
# (although I suppose that in a lot of cases the method
# doesn't work well in higher dimensions compared to
# other methods).  Let me know if you know of some function
# that will do density estimation in dimensions greater than 2.)
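# (One possible answer: the kde() function in the ks package is
# reported to handle multivariate kernel density estimation in more
# than two dimensions.  A rough sketch of its use, assuming the
# package is installed --- it isn't needed for anything below, and is
# shown with the two predictors at hand, but the same call is said to
# work when x has more columns.)
library(ks)
H <- Hpi(as.matrix(trdata[1:100, 1:2]))   # plug-in bandwidth matrix
gden.ks <- kde(as.matrix(trdata[1:100, 1:2]), H=H,
               eval.points=as.matrix(trdata[, 1:2]))
gden.ks$estimate   # estimated green-class density at the 200 training points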
x1limL <- min(trdata[,1]) - 2
x1limU <- max(trdata[,1]) + 2
x2limL <- min(trdata[,2]) - 2
x2limU <- max(trdata[,2]) + 2
gden <- kde2d(trdata[1:100,1], trdata[1:100,2], n=300,
              lims=c(x1limL, x1limU, x2limL, x2limU))
image(gden)
# This is a representation of the estimated density for the
# green class (a kernel density estimate of its normal mixture pdf).
#
# Here's the red pdf.
rden <- kde2d(trdata[101:200,1], trdata[101:200,2], n=300,
              lims=c(x1limL, x1limU, x2limL, x2limU))
image(rden)
#
denrat <- rden$z/gden$z
contour(gden$x, gden$y, denrat, levels=c(0.5, 1, 2))
# The contour labeled with 1 is the decision boundary.
# Outside of the wavy band, one estimated density is
# more than twice as high as the other one.
#
pred.kern.den <- ifelse(denrat < 1, 0, 1)
image(gden$x, gden$y, pred.kern.den)
# Another representation of the decision boundary
# determined from the training data.
#
x1index <- round(1 + 299*(trdata[,1] - min(gden$x))/(max(gden$x) - min(gden$x)))
x2index <- round(1 + 299*(trdata[,2] - min(gden$y))/(max(gden$y) - min(gden$y)))
pred.kern.den <- rep(0, 200)
for (i in 1:200) pred.kern.den[i] <- denrat[x1index[i], x2index[i]]
tr.err.kern.den <- mean( ifelse(pred.kern.den > 1, 1, 0) != trdata[,3] )
tr.err.kern.den
# The training error for the kernel density estimation
# method is quite small.
#
# Now I will use naive Bayes classifiers.  Although the
# method is based on an assumption of independence, which
# may be grossly violated, a nice thing is that this
# method can be done using R for dimensions greater than 2.
#
# R's NaiveBayes function apparently needs to be given
# data with the class variable being a "factor" with
# "levels" specified in order for the predict function
# to work properly on the object.  Therefore, I create
# a new data frame to use at this point.  (Note: wrapping
# the columns in cbind would coerce the factor back to
# numbers, so the columns are passed to data.frame directly.)
alttrdata <- data.frame(trdata[,1], trdata[,2], factor(trdata[,3]))
names(alttrdata) <- c("x1", "x2", "y")
levels(alttrdata$y) <- c("1", "2")
#
# I'll start with a version of the method that assumes
# the marginal dist'ns are normal, so that instead of
# using a nonparametric density estimator, a parametric
# model is used, and the data are used to estimate the
# unknown parameters.
nbnorm <- NaiveBayes(y ~ ., data=alttrdata)
pred.nbnorm <- predict(nbnorm, newdata=alttrdata)
tr.err.nbnorm <- mean( pred.nbnorm$class != alttrdata[,3] )
tr.err.nbnorm
#
# Next, I will use what I consider to be the standard
# way of doing naive Bayes classification, with a kernel
# density estimator being used to obtain a nonparametric
# density estimate for each predictor.
nbkern <- NaiveBayes(y ~ ., data=alttrdata, usekernel=TRUE)
pred.nbkern <- predict(nbkern, newdata=alttrdata)
tr.err.nbkern <- mean( pred.nbkern$class != alttrdata[,3] )
tr.err.nbkern
#
### Beginning of 1st part of support vector machine stuff
### added 7/17.
# Now I will try some support vector machines.
# The default kernel is the radial kernel.
xxx <- subset(alttrdata, select = -y)
yyy <- as.character(alttrdata$y)
# Note: I had a hard time getting this to work so
# that I could get class predictions.  Just making
# the response a factor didn't work (the predictions
# were numerical), but it worked when I made the
# response a character variable that was then clearly
# recognized to be a categorical variable.
svmrad1 <- svm(xxx, yyy, type="C-classification", cross=5)
# (Note: The c-v isn't used to fit the model;
# it's just used to estimate the accuracy of
# the fitted model.  It would be nice if c-v
# were used to determine a good value of the
# gamma argument of the svm function (which
# I suppose is a tuning parameter).  Because
# this is not the case, it might be good to
# try several different values of gamma and
# pick the one which has the smallest estimated
# error rate (using the c-v estimates).)
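# (A sketch of one way to let cross-validation choose gamma
# automatically, using the tune.svm function from e1071; by default
# it uses 10-fold c-v.  The gamma grid below is an arbitrary choice.)
svmtune <- tune.svm(xxx, as.factor(yyy), gamma=c(0.3, 0.4, 0.5, 0.6, 0.7))
summary(svmtune)
svmtune$best.parameters   # value of gamma with the smallest c-v error estimate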
summary(svmrad1)
# The c-v estimate of the error rate is 0.11 (0.12).
# (NOTE: Rather oddly, I didn't get the same result
# when I ran it two different times, using the same
# initial seed.  It seems that the c-v routine used
# by the svm function doesn't maintain the random
# number sequence in the usual way.  Below, I give
# two numbers for each estimate --- these come from
# two different runs (starting w/ the same seed).  It
# should also be noted that unless you start at the
# beginning of all of the stuff, and don't skip anything,
# your results may be rather different.)
# This results from using the default value of
# gamma, which is 1/(# of predictors) = 0.5.
# I'll try using a value a tad larger, 0.6, and
# a value a tad smaller, 0.4, and see which one
# results in the smallest c-v error rate.
svmrad2 <- svm(xxx, yyy, type="C-classification", cross=5, gamma=0.6)
summary(svmrad2)
svmrad3 <- svm(xxx, yyy, type="C-classification", cross=5, gamma=0.4)
summary(svmrad3)
# The estimated error rates are 0.115 / 0.12 (for 0.4),
# 0.11 / 0.12 (for 0.5), and 0.125 / 0.12 (for 0.6).  Because
# these are estimates, based on a training set of
# only 200 points (100 of each class), the small
# differences shouldn't be taken too seriously ---
# given a different random number seed, the
# results could be different.
# I'll try two more values, and see what I get.
svmrad4 <- svm(xxx, yyy, type="C-classification", cross=5, gamma=0.7)
summary(svmrad4)
svmrad5 <- svm(xxx, yyy, type="C-classification", cross=5, gamma=0.3)
summary(svmrad5)
# The est. error rates are 0.125 / 0.12 (for 0.7)
# and 0.12 / 0.115 (for 0.3).
#
# Now I'll try a polynomial kernel, using the default
# degree of 3.
svmpoly1 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", cross=5)
summary(svmpoly1)
# The estimated error rate is 0.14 / 0.145.
# Rather than alter gamma at this point, I'll try
# using degree = 2 and degree = 4.
svmpoly2 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", degree=2, cross=5)
summary(svmpoly2)
svmpoly3 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", degree=4, cross=5)
summary(svmpoly3)
# The number of support vectors went way up,
# compared with the other runs.  Also, the
# est. error rates are horrible.  So I'll go
# back to using degree = 3 and alter gamma.
# (Note: I think that it is very strange that
# changing the degree from 3 to 2 and 4 led to
# such bad performance.)
svmpoly4 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", gamma=0.6, cross=5)
summary(svmpoly4)
svmpoly5 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", gamma=0.4, cross=5)
summary(svmpoly5)
# I'll try two more values.
svmpoly6 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", gamma=0.7, cross=5)
summary(svmpoly6)
svmpoly7 <- svm(xxx, yyy, type="C-classification", kernel="polynomial", gamma=0.3, cross=5)
summary(svmpoly7)
# The radial kernel seems better (in this case)
# than the polynomial kernels.
### END of 1st part of svm stuff added 7/17.
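#
# (A quick sketch of the decision regions implied by the first
# radial-kernel fit, in the same spirit as the earlier image plots;
# the grid used here is an arbitrary choice.)
svmgrid <- expand.grid(x1=seq(x1limL, x1limU, length=200),
                       x2=seq(x2limL, x2limU, length=200))
svmgridpred <- as.numeric(predict(svmrad1, svmgrid))
image(seq(x1limL, x1limU, length=200), seq(x2limL, x2limU, length=200),
      matrix(svmgridpred, 200, 200))
points(trdata[,1], trdata[,2], col=3-trdata[,3])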
#
# Now I'll generate a large sample (5000 cases from
# each class) with which to estimate the generalization
# error rates of the various classifiers.
gcounts <- rmultinomial(5000, rep(0.1, 10))
rcounts <- rmultinomial(5000, rep(0.1, 10))
gclass <- matrix(0, 5000, 2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + gcounts[i]
  gclass[start:finish,] <- rmvnorm(gcounts[i], mean=gmeans[i,], sigma=diag(0.2,2))
}
rclass <- matrix(0, 5000, 2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + rcounts[i]
  rclass[start:finish,] <- rmvnorm(rcounts[i], mean=rmeans[i,], sigma=diag(0.2,2))
}
gclass <- cbind(gclass, 0)
rclass <- cbind(rclass, 1)
gendata <- rbind(gclass, rclass)
gendata <- data.frame(gendata)
names(gendata) <- names(trdata)
#
# I'll use this large data set to get a better
# estimate of the Bayes error rate.
gpdf <- rep(0, 10000)
rpdf <- rep(0, 10000)
for (i in 1:10) {
  gpdf <- gpdf + dmvnorm(gendata[,1:2], gmeans[i,], diag(0.2,2))
  rpdf <- rpdf + dmvnorm(gendata[,1:2], rmeans[i,], diag(0.2,2))
}
pred.bayes.class <- ifelse(gpdf > rpdf, 0, 1)
est.bayes.rate <- mean(pred.bayes.class != gendata[,3])
est.bayes.rate
# This is a good estimate of the error rate
# of the Bayes classifier.
# (It's a Monte Carlo estimate.)
#
# Since the optimal Bayes classifier has a larger
# error rate than those estimated from the training
# data for the other classifiers, it appears that
# the estimates obtained using the training data
# were indeed optimistic.
#
# Now I'll use the new large sample to estimate
# the generalization error rates of the classifiers
# that were constructed using the training data.
#
# First I'll consider the rule created from the
# 1st-order OLS fit.
pred.ols1 <- predict(ols1, newdata=gendata)
gen.err.ols1 <- mean( ifelse(pred.ols1 <= 0.5, 0, 1) != gendata[,3] )
gen.err.ols1   # estimate of gen. error rate --- 1st-order OLS
#
# Next I'll consider the rule created from the
# 2nd-order OLS fit.
gendata2 <- cbind(gendata[,1], gendata[,2], gendata[,1]^2, gendata[,2]^2,
                  gendata[,1]*gendata[,2], gendata[,3])
gendata2 <- data.frame(gendata2)
names(gendata2) <- names(trdata2)
pred.ols2 <- ifelse(predict(ols2, newdata=gendata2) <= 0.5, 0, 1)
gen.err.ols2 <- mean( pred.ols2 != gendata2[,6] )
gen.err.ols2   # estimate of gen. error rate --- 2nd-order OLS
#
# Next let's consider the straightforward density estimation
# method (which directly approximates the Bayes classifier) ---
# I'll estimate the gen. error for the 2-D kernel density
# estimation method.
x1index <- round(1 + 299*(gendata[,1] - min(gden$x))/(max(gden$x) - min(gden$x)))
x2index <- round(1 + 299*(gendata[,2] - min(gden$y))/(max(gden$y) - min(gden$y)))
pred.kern.den <- rep(0, 10000)
for (i in 1:10000) pred.kern.den[i] <- denrat[x1index[i], x2index[i]]
gen.err.kern.den <- mean( ifelse(pred.kern.den > 1, 1, 0) != gendata[,3] )
gen.err.kern.den
# It did better than the regression models.
#
# Now I will get the estimates of the gen. error
# for the two naive Bayes classifiers.
altgendata <- data.frame(gendata[,1], gendata[,2], factor(gendata[,3]))
names(altgendata) <- c("x1", "x2", "y")
levels(altgendata$y) <- c("1", "2")
#
# First the parametric normal model version.
pred.nbnorm <- predict(nbnorm, newdata=altgendata)
gen.err.nbnorm <- mean( pred.nbnorm$class != altgendata[,3] )
gen.err.nbnorm
#
# Then the nonparametric version.
pred.nbkern <- predict(nbkern, newdata=altgendata)
gen.err.nbkern <- mean( pred.nbkern$class != altgendata[,3] )
gen.err.nbkern
# The naive Bayes methods didn't work as well as
# the kernel density estimation method.
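#
# (A small side calculation: each of these generalization error
# estimates is a proportion based on 10,000 cases, so its approximate
# standard error is sqrt(p*(1-p)/10000) --- roughly 0.003 for error
# rates near 0.10 to 0.15.  For example:)
sqrt(gen.err.kern.den*(1 - gen.err.kern.den)/10000)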
#
# Now I'll check the k-nearest-neighbors classifiers.
gen.err.knn <- numeric(I)
for (i in 1:I) {
  pred.knn <- knn(trdata[,-3], gendata[,-3], cl=trdata[,3], k=(2*i - 1))
  gen.err.knn[i] <- mean( (as.numeric(pred.knn) - 1) != gendata[,3] )
}
plot(2*seq(1:I)-1, tr.err.knn, type="l", xlab='k',
     ylab='proportion misclassified', main='k-nearest-neighbors',
     col=3, ylim=c(0,0.4))
points(2*seq(1:I)-1, gen.err.knn, type="l", col=2)
abline(h=est.bayes.rate, col=8)
legend(5, 0.35, legend=c("training", "generalization", "c-v", "Bayes"),
       fill=c(3,2,4,8))
#
# If you change the seed for the random number
# generator to 632, then you will observe very
# different behavior --- the regression methods
# are no longer close in performance to the
# best nearest-neighbors classifiers.
# (But if you change the seed to any number
# other than 3, my comments may no longer be
# sensible.)
#
# Now I want to explore k-nearest-neighbors
# classification a bit more.
#
# It can be seen that the error rates estimated
# from the training sample can be very misleading ---
# for one thing, k = 1 produced 0 misclassifications.
#
# Using the huge "generalization set" one can
# determine which value of k worked best, and
# can get a good estimate of the generalization
# error rate for this value of k.
bestk <- 2*which.min(gen.err.knn) - 1
bestk   # best value of k
gen.err.knn[which.min(gen.err.knn)]   # estimate of gen. error rate --- best knn
#
# Of course, in practice we don't always have a huge
# data set to help us select the best value of k to
# use as was done above (and if we did have a lot of
# data like that, it would be better to use a large
# portion of it in the training sample).
#
# In practice, say if we only had the training set of
# 200 cases to work with, we could use cross-validation
# to determine a good value of k to use when classifying
# cases with unknown class.  Although for some purposes
# using n-fold c-v may take an excessively long time
# and may be worse than using a smaller number of folds,
# with k-nearest-neighbors n-fold c-v is simple and
# relatively quick.  (It's simple because with n-fold
# c-v, to get the predicted class of a case, the k nearest
# _other_ cases are the neighbors used ... so the k+1
# nearest neighbors can be determined in the usual way
# and then the nearest neighbor (the case itself) is
# ignored.  The train.kknn function, as used below, will
# do this.)
cvknn <- train.kknn(as.factor(y) ~ ., trdata, kmax=59,
                    kernel="rectangular", distance=2)
# (Note: If the response is coded numerically, it must be
# converted to make it be recognized as being nominal
# by the kknn function.)
cv.err.knn <- numeric(I)
for (i in 1:I) cv.err.knn[i] <- cvknn$MISCLASS[2*i-1]
points(2*seq(1:I)-1, cv.err.knn, type="l", col=4)
bestk <- 2*which.min(cv.err.knn) - 1
bestk   # best value of k
cv.err.knn[which.min(cv.err.knn)]   # c-v estimate of gen. error rate --- best knn
#
# It can be seen that while the actual c-v estimates
# aren't very close to the true generalization error
# rates (assuming that the estimates of the generalization
# error rates from the huge sample are good), the
# shape of the c-v error rate curve is not misleading as
# to which values of k are good, and that's the important
# thing!
#
gen.err.knn[which.min(cv.err.knn)]
# Here is a better estimate of the gen. err. for the
# knn classifier selected using cross-validation.
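#
# (The same leave-one-out idea can also be checked directly with the
# knn.cv function from the class package --- a quick sketch, shown
# here just for the value of k chosen by c-v above.)
pred.loo <- knn.cv(trdata[,-3], cl=trdata[,3], k=bestk)
mean( (as.numeric(pred.loo) - 1) != trdata[,3] )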
#
# One can also use a weighted nearest neighbors method.
# (This will be covered in the July 7 class meeting.)
# I'll start by using the Epanechnikov kernel (with the
# default value of the smoothing parameter).
cvwknn <- train.kknn(as.factor(y) ~ ., trdata, kmax=59,
                     kernel="epanechnikov", distance=2)
cv.err.wknn <- numeric(I)
for (i in 1:I) cv.err.wknn[i] <- cvwknn$MISCLASS[2*i-1]
plot(2*seq(1:I)-1, cv.err.wknn, type="l", xlab='k',
     ylab='proportion misclassified', main='wt k-nearest-neighbors',
     ylim=c(0,0.4), col=4)
bestk <- 2*which.min(cv.err.wknn) - 1
bestk   # best value of k
cv.err.wknn[which.min(cv.err.wknn)]   # c-v estimate of gen. error rate --- best weighted (Epanechnikov) knn
#
gen.err.wknn <- numeric(I-1)
for (i in 1:(I-1)) {
  currentk <- 2*i + 1
  pred.wknn <- kknn(y ~ ., trdata, gendata, k=currentk, distance=2,
                    kernel="epanechnikov")
  gen.err.wknn[i] <- mean( round(pred.wknn$fitted.values) != gendata[,3] )
}
# (Note: kknn is somewhat screwy, although in the end
# I think that I got it working the way I wanted it
# to work.  Whether the response variable was of
# type factor or numeric, the predicted values were
# given as noninteger numbers.  So what worked was
# to let y be numeric and round the fitted values
# to make them integers.)
points(2*seq(1:(I-1))+1, gen.err.wknn, type="l", col=2)
abline(h=est.bayes.rate, col=8)
legend(5, 0.35, legend=c("generalization", "c-v", "Bayes"), fill=c(2,4,8))
#
# Here is the est. gen. error for the value
# of k chosen by cross-validation.
gen.err.wknn[which.min(cv.err.wknn) - 1]
# estimate of gen. error rate --- best weighted (Epanechnikov) knn, as chosen by c-v
#
# Here is the est. gen. error for the actual
# best k for weighted knn.
gen.err.wknn[which.min(gen.err.wknn)]
#
### The rest of the stuff added 7/17 is below.
# Now I'll get the generalization error estimates
# for the various svm fits.
gxxx <- subset(altgendata, select = -y)
gyyy <- as.character(altgendata$y)
gen.err.svm.r1 <- mean( gyyy != predict(svmrad1, gxxx) )
gen.err.svm.r2 <- mean( gyyy != predict(svmrad2, gxxx) )
gen.err.svm.r3 <- mean( gyyy != predict(svmrad3, gxxx) )
gen.err.svm.r4 <- mean( gyyy != predict(svmrad4, gxxx) )
gen.err.svm.r5 <- mean( gyyy != predict(svmrad5, gxxx) )
gen.err.svm.p1 <- mean( gyyy != predict(svmpoly1, gxxx) )
gen.err.svm.p2 <- mean( gyyy != predict(svmpoly2, gxxx) )
gen.err.svm.p3 <- mean( gyyy != predict(svmpoly3, gxxx) )
gen.err.svm.p4 <- mean( gyyy != predict(svmpoly4, gxxx) )
gen.err.svm.p5 <- mean( gyyy != predict(svmpoly5, gxxx) )
gen.err.svm.p6 <- mean( gyyy != predict(svmpoly6, gxxx) )
gen.err.svm.p7 <- mean( gyyy != predict(svmpoly7, gxxx) )
gen.err.svm.r1
gen.err.svm.r2
gen.err.svm.r3
gen.err.svm.r4
gen.err.svm.r5
gen.err.svm.p1
gen.err.svm.p2
gen.err.svm.p3
gen.err.svm.p4
gen.err.svm.p5
gen.err.svm.p6
gen.err.svm.p7
# Three of these error rates are lower than the error rates
# from *all* of the other methods.
#
###### boosting part added Thursday, 7/21
# (Note: The adaboost function is sensitive to how the
# data is given --- trying to just use columns from a
# data frame can cause problems, so I go through some
# manipulations to put the x values into a plain
# matrix.  Also note that the y values must be a vector
# of 0 and 1 values.  (Type ?adaboost to get more info.))
tx <- matrix(unlist(trdata[,-3]), nrow=nrow(trdata))
ty <- as.numeric(trdata[,3])
gx <- matrix(unlist(gendata[,-3]), nrow=nrow(gendata))
gy <- as.numeric(gendata[,3])
boostpred <- adaboost(tx, ty, gx)
summarize(boostpred, gy)
# These results don't seem reasonable --- in other examples
# I tried, the error rate went down ... it was not constant.
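#
# (As a rough cross-check on the adaboost results above, here is a
# sketch using boosted trees with the exponential (AdaBoost) loss via
# the gbm package --- this assumes that package is installed, and the
# number of trees, tree depth, and shrinkage used here are arbitrary
# choices, not tuned values.)
library(gbm)
gbmfit <- gbm(y ~ x1 + x2, data=trdata, distribution="adaboost",
              n.trees=500, interaction.depth=2, shrinkage=0.05)
# predict gives values on the link scale; positive values correspond
# to class 1 (red), so threshold at 0.
pred.gbm <- ifelse(predict(gbmfit, newdata=gendata, n.trees=500) > 0, 1, 0)
mean( pred.gbm != gendata[,3] )   # estimated generalization error rate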