# This example deals with the classification setting 
# dealt with in Sec. 2.3 of HTF2.  I'll use SVMs in
# in addition to some methods from Ch. 2, Ch. 4, and
# Sec. 13.2.

# Developed using 2.9.0.


# I'll load libraries and set the random number seed.

library(MASS)     
library(mvtnorm)     
library(multinomRob) 
library(class)       
library(kknn)  
library(e1071)    # for svm f'n (for support vector machines)

       

set.seed(18)     


# First I'll generate and plot the component means.

Sig1 <- diag(1,2)  # Sig1 is a variance-covariance matrix.  
Omeans <- rmvnorm(10, mean=c(0,1), sigma=Sig1) # O is used for the Orange class.
Bmeans <- rmvnorm(10, mean=c(1,0), sigma=Sig1) # B is used for the Blue class.

plot(Omeans[,1],Omeans[,2],col="darkorange",main='Component Means',xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4))
points(Bmeans[,1], Bmeans[,2], col="royalblue")

#####

# For each class I'll generate 10 counts which sum to 100  
# using a multinomial (100,0.1,0.1,...,0.1) distribution.
 
Ocounts <- rmultinomial(100, rep(0.1,10))
Bcounts <- rmultinomial(100, rep(0.1,10))

# Using the generated component means and 
# the generated counts as sample sizes (for
# the various components of the mixture 
# distributions), I'll generate 100 training 
# sample obervations for each class. 

Sig2 <- diag(0.2, 2)
Oxvals <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Ocounts[i]
  Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) 
}
Bxvals <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Bcounts[i]
  Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) 
}

# Let's look at the training data.

plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data',xlab='x1',ylab='x2',
      xlim=c(-3.5,4.5),ylim=c(-3.5,4.5))
points(Bxvals[,1], Bxvals[,2], col="royalblue")

#####

# I'll give response value of 0 for Orange class observations, 
# and 1 for the Blue class observations, and then combine them 
# together to form the training data.

Oclass <- cbind(0, Oxvals)
Bclass <- cbind(1, Bxvals)
trdata <- data.frame(rbind(Oclass, Bclass))
names(trdata) <- c("g", "x1", "x2")  

# This completes the generation of the training data.

# I'll generate a test set of 1000 additional 
# observations from each of the two classes.
# (Note: The component means are the ones used 
# to create the training data.)  I like to call
# this data the generalization sample, because I 
# will only use it to get good estimates of the 
# error rate of various fitted classifiers for 
# future observations.  Some call it a test sample, 
# but I like to think of a test sample as data I have 
# during the training/fitting process that I have
# set aside so that I can avoid using things like 
# c-v and AIC for model/parameter selection. 

Ocounts <- rmultinomial(1000, rep(0.1,10))
Bcounts <- rmultinomial(1000, rep(0.1,10))
Oxvals <- matrix(0, 1000, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Ocounts[i]
  Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) 
}
Bxvals <- matrix(0, 1000, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Bcounts[i]
  Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) 
}
Oclass <- cbind(0, Oxvals)
Bclass <- cbind(1, Bxvals)
gendata <- data.frame(rbind(Oclass, Bclass))
names(gendata) <- c("g", "x1", "x2")

# I'll obtain an estimate of the Bayes error 
# rate using the large generalization sample.

Opdf <-rep(0,2000)
Bpdf <-rep(0,2000)
for (i in 1:10) 
{
  Opdf <- Opdf + dmvnorm(gendata[,2:3], Omeans[i,], Sig2) 
  Bpdf <- Bpdf + dmvnorm(gendata[,2:3], Bmeans[i,], Sig2)
}
pred.bayes <- ifelse(Opdf > Bpdf, 0, 1)
est.bayes.rate <- mean(pred.bayes != gendata[,1])

est.bayes.rate
# This is the estimated error rate of the Bayes classifier.

# Below is a confusion matrix, with true class for the 
# rows and predicted class for the columns.  Below that
# is a plot showing the decision boundaries of the
# Bayes classifier.

table(gendata[,1], pred.bayes)

plot(gendata[,2],gendata[,3],col=pred.bayes+2,main='Bayes Classifier',xlab='x1',ylab='x2',
      xlim=c(-3.5,4.5),ylim=c(-3.5,4.5))

#####

# I'll use c-v to select a good number of nearest neighbors
# to use for "vanilla" nearest neighbors classification.
cvknn <- train.kknn(as.factor(g) ~ ., trdata, 
          kmax=50, kernel="rectangular", distance=2) 
plot(c(1:50),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.2,0.5),col="cornflowerblue") 

which.min(cvknn$MISCLASS)
cvknn$MISCLASS[9:35]

# Several values of k ranging from 9 to 31 tie for producing  
# the fewest misclassifications.  Since the "true curve"
# (as opposed to the estimated curve we get from c-v)
# may be smoother and concave upward with a single
# minimum, I'd choose to use k = 21 (or 19) to classify 
# future observations.  (I think an odd value for k is 
# better since it should lead to fewer ties being randomly 
# broken.)  

# I'll use the large generalization sample to estimate 
# the error rate for the 21 nearest neighbors classifier.
  
pred.21nn <- kknn(as.factor(g)~.,train=trdata,test=gendata,k=21)
gen.err.21nn <- mean(pred.21nn$fit != gendata[,1])
gen.err.21nn
# est. gen. err. rate for 21 nearest neighbors

#####

# Now let's check out classification using LDA and QDA.

trlda <- lda(g ~ . , trdata)
pred.trlda <- predict(trlda, newdata=gendata)
lda.rate <- mean( pred.trlda$class != gendata[,1] )

lda.rate
# LDA rate.

trqda <- qda(g ~ . , trdata)
pred.trqda <- predict(trqda, newdata=gendata)
qda.rate <- mean( pred.trqda$class != gendata[,1] )

qda.rate
# QDA rate.
# (Better than LDA, but not as good as 21 nearest neighbors.)

#####

# Now I'll try 1st-, 2nd-, and 3rd-order logisitic 
# regression models.

logreg.fit1 <- glm(as.factor(g) ~ ., data=trdata, family=binomial)
summary(logreg.fit1)

# Let's see how well the fitted model predicts the  
# classes for the generalization set.

pred.logreg1 <- predict(logreg.fit1, newdata=gendata, type="response")
logreg1.rate <-  mean( round(pred.logreg1) != gendata[,1] )
logreg1.rate
# error rate for 1st-order logistic regression model
# (very close to LDA error rate)

# I'll try a full 2nd-order model.
 
logreg.fit2 <- glm(as.factor(g) ~ x1 + x2 + I(x1^2) + I(x2^2)
                                      + I(x1*x2), data=trdata, family=binomial)
summary(logreg.fit2)
pred.logreg2 <- predict(logreg.fit2, newdata=gendata, type="response")
logreg2.rate <-  mean( round(pred.logreg2) != gendata[,1] )
logreg2.rate
# error rate for 2nd-order logistic regression model

# I'll try a 3rd-order model.

logreg.fit3 <- glm(as.factor(g) ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) 
                    + I(x1^3) + I(x2^3) +I(x1*x2^2) 
                    + I(x2*x1^2), data=trdata, family=binomial)
summary(logreg.fit3)
pred.logreg3 <- predict(logreg.fit3, newdata=gendata, type="response")
logreg3.rate <-  mean( round(pred.logreg3) != gendata[,1] )
logreg3.rate
# error rate for 3rd-order logistic regression model

#####

# Now I'll define a function that uses c-v with the 
# K-means clustering method from subsection 13.2.1 
# of HTF2.  The user supplies a vector of proportions
# that determines the numbers of clusters per class
# that will be compared using 10-fold c-v estimates 
# of error rates.  The second of the two functions 
# defined below is the one that does the c-v.  The 
# first of the two functions is used by the second
# of the two functions.

KMeansClassMod <- function(traindata, testdata, n.comp)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).  Remaining col's of each set 
# should be predictor var's.  n.comp should be a vector
# containing the numbers of clusters to use for each of 
# the classes.  To avoid an error with the kmeans function,
# if the number of clusters specified is less than 2, the
# number will be increased to 2.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    num.clusters <- max(n.comp[g-first.class+1], 2)
    clusters <- kmeans(classdata[,-1],num.clusters,iter.max=15,nstart=5)
    class.prototypes <- cbind(g, clusters$centers)
    prototypes <- rbind(prototypes, class.prototypes)
  }
  prototypes <- data.frame(prototypes)
  names(prototypes) <- names(traindata)
  predict.class <- kknn(as.factor(g)~.,train=prototypes,test=testdata,k=1)
  test.error <-  mean( predict.class$fit != testdata[,1] )
  test.error
}

KMeansClassCV <- function(data, proportions)
# The proportions vector determines the numbers of clusters
# per class that will be compared using 10-fold c-v estimates
# of error rates.  E.g., if the proportions vector is c(0.05,
# 0.1, 0.2), for each class about 0.05*n_j clusters will be used 
# for K-means classification the first trial, 0.1*n_j clusters
# will be used the second trial, and 0.2*n_j clusters will be
# used the third trial, where n_j is the number of training
# set cases for the class.  (An exception is that if the number
# of clusters would be less than 2, it is made to be 2.  This 
# is done to avoid an error with the kmeans function.)  The 1st 
# column of the data set should be class variable (and classes 
# should be labeled with consectutive nonnegative integers).  
# Remaining columns should be predictor var's.
{
  n.total <- length(data[,1])
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  num.classes <- last.class - first.class + 1
  n.class <- numeric(num.classes)
  for (j in first.class:last.class)
  {
    class.cases <- (data[,1] == j)
    n.class[j-first.class+1] <- length(data[class.cases,1])

  }
  fold.size <- ceiling(n.total/10)
  fold.id <- rep(c(1:10), fold.size)
  fold.id <- sample(fold.id, n.total)
  num.trials <- length(proportions)
  errors <- rep(0, num.trials)
  for (i in 1:num.trials)
  {  
    num.clusters <- round(n.class*proportions[i])
    for (fold in 1:10)
    {
      fold.cases <- (fold.id == fold)
      errors[i] <- errors[i] +
                   length(data[fold.cases,1])*KMeansClassMod(data[!fold.cases,],data[fold.cases,],num.clusters)
    }
  }
  err.rate <- errors/n.total
  results <- cbind(proportions, err.rate)
  results
}
    
KMeansClassCV(trdata, c(0.05, 0.1, 0.15, 0.2, 0.25))
# Note: Originally I used kkn in my functions, and I 
# didn't get any warnings.  But then I decided to use
# kknn (so that the data would be scaled automatically),
# which required that I make the set of prototypes into
# a data frame, and then I got warnings.  I don't think
# the warnings indicate a serious problem.

# I'll try some more proportions.

KMeansClassCV(trdata, c(0.08,0.1,0.12,0.14))

# And some more.

KMeansClassCV(trdata, c(0.06,0.07,0.08,0.09,0.1,0.11,0.12))
 
# It looks like 8 prototypes per class may be good.
# Let's see what the estimated generalization error 
# is for that choice.
 
KMeansClassMod(trdata,gendata,c(8,8))

# Not bad, but not as good as using 21 nearest neighbors.

#####

# I next used c-v with the LVQ method 
# from subsection 13.2.2 of HTF2, but
# since it took a long time to run 
# several times as I searched for good
# parameter values to use, I'll skip
# that part here and below just get
# the estimated generalization error
# for the choices that I ultimately made.

LVQCV <- function(data, proportions, num.it, init.ep)
# The proportions vector determines the numbers of clusters
# per class that will be compared using 10-fold c-v estimates
# of error rates.  E.g., if the proportions vector is c(0.05,
# 0.1, 0.2), for each class about 0.05*n_j clusters will be used 
# for K-means classification the first trial, 0.1*n_j clusters
# will be used the second trial, and 0.2*n_j clusters will be
# used the third trial, where n_j is the number of training
# set cases for the class.  The first column of the data set 
# should be class variable (and classes should be labeled with 
# consectutive nonnegative integers).  Remaining columns should 
# be predictor var's.
{
  n.total <- length(data[,1])
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  num.classes <- last.class - first.class + 1
  n.class <- numeric(num.classes)
  for (j in first.class:last.class)
  {
    class.cases <- (data[,1] == j)
    n.class[j-first.class+1] <- length(data[class.cases,1])
  }
  fold.size <- ceiling(n.total/10)
  fold.id <- rep(c(1:10), fold.size)
  fold.id <- sample(fold.id, n.total)
  num.trials <- length(proportions)
  errors <- rep(0, num.trials)
  for (i in 1:num.trials)
  {  
    num.pro <- round(n.class*proportions[i])
    for (fold in 1:10)
    {
      fold.cases <- (fold.id == fold)
      errors[i] <- errors[i] +
                   length(data[fold.cases,1])*LVQMod(data[!fold.cases,],data[fold.cases,],num.pro,num.it,init.ep)
    }
  }
  err.rate <- errors/n.total
  results <- cbind(proportions, err.rate)
  results
}

LVQMod <- function(traindata,testdata,n.pro,iterations,initial.epsilon)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    n.cases <- length(classdata[,1])
    selected <- sample.int(n.cases, n.pro[g-first.class+1])
    class.prototypes <- classdata[selected,]
    prototypes <- rbind(prototypes, class.prototypes)
  }
  train.size <- length(traindata[,1])
  for (i in 1:iterations)
  {
    epsilon <- initial.epsilon/i
    case <- sample.int(train.size, 1)
    distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1])))
    closest <- as.integer(which.min(distances[-1,1]))
    ifelse(prototypes[closest,1]==traindata[case,1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1])
  }   
  predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1)
  errors <- mean( predict.class != testdata[,1] )
  errors
}

# I tried several combinations of proportions and parameter
# settings, and a proportion of 0.15 worked best.

LVQMod(trdata, gendata, c(15,15,15), 500, 0.67)

#####

# Now I'll combine the K-means starting points  
# with the movement of the prototypes used in
# the LVQ method.

KMeansPlusLVQCV <- function(data,proportions,num.it,init.ep)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).
# Remaining col's of each set should be predictor var's.
{
  n.total <- length(data[,1])
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  num.classes <- last.class - first.class + 1
  n.class <- numeric(num.classes)
  for (j in first.class:last.class)
  {
    class.cases <- (data[,1] == j)
    n.class[j-first.class+1] <- length(data[class.cases,1])
  }
  fold.size <- ceiling(n.total/10)
  fold.id <- rep(c(1:10), fold.size)
  fold.id <- sample(fold.id, n.total)
  num.trials <- length(proportions)
  errors <- rep(0, num.trials)
  for (i in 1:num.trials)
  {  
    num.pro <- round(n.class*proportions[i])
    for (fold in 1:10)
    {
      fold.cases <- (fold.id == fold)
      errors[i] <- errors[i] +
                   length(data[fold.cases,1])*KMeansPlusLVQMod(data[!fold.cases,],data[fold.cases,],num.pro,num.it,init.ep)

    }
  }
  err.rate <- errors/n.total
  results <- cbind(proportions, err.rate)
  results
}
 
KMeansPlusLVQMod <- function(traindata,testdata,n.pro,iterations,initial.epsilon)
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    clusters <- kmeans(classdata[,-1],n.pro[g-first.class+1],iter.max=15,nstart=5)
    class.prototypes <- cbind(g, clusters$centers)
    prototypes <- rbind(prototypes, class.prototypes)
  }
  prototypes <- data.frame(prototypes)
  train.size <- length(traindata[,1])
  for (i in 1:iterations)
  {
    epsilon <- initial.epsilon/i
    case <- sample.int(train.size, 1)
    distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1])))
    closest <- as.integer(which.min(distances[-1,1]))
    ifelse(prototypes[closest,1]==traindata[case,1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1])
      }   
  predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1)
  test.error <-  mean( predict.class != testdata[,1] )
  test.error
}

# Without showing details of my search, I found that the 
# parameter settings used below looked promising.

KMeansPlusLVQMod(trdata,gendata,c(6,6),500,0.5)

# 2nd best result so far. 
# (21 nearest neighbors was better.)

#####

# Now I will try some support vector machines.  
# The default kernel is the radial kernel.  
# Using the svm function, one has to supply a 
# value for the gamma parameter, or else rely
# on the default, which is the inverse of the 
# number of predictors, or 1/2 in this case.
# The help page stresses that one should tune,
# and so a variety of gamma values should be 
# considered.  The svm function has a cross-
# validation feature which can be used to obtain 
# an estimate of generalization error rate from
# the training sample for the particular value of 
# gamma which is currently being used.  However, 
# even if the same random number seed is used,
# the c-v estimates for a particular value of
# gamma can differ from trial to trial.  This 
# prevents me from doing a "trial-and-error" search 
# for a good gamma and making comments here, 
# because when this script is run I can't know 
# in advance what the outcomes will be.  So I'll 
# just try a bunch of different gamma values 
# and obtain c-v estimates of the error rates, as 
# well as estimates of the error rates using the 
# fitted models to make predictions for the large
# generalization sample.

num.trials <- 15
gamma.val <- (1:15)/10
error.cv <- numeric(15)
error.gen <- numeric(15)
for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trdata, cross=10, gamma=gamma.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendata) != as.factor(gendata[,1]) )
}
error.cv <- 1 - error.cv/100

plot(gamma.val,error.gen,type="l",xlab='gamma',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, radial kernel)',
      ylim=c(0.2,0.5),col="seagreen")
points(gamma.val,error.cv,type="l",col="tomato")
abline(est.bayes.rate, 0, col="purple")
legend(0.15,0.5,legend=c("cross-val","generalization","Bayes"),
       fill=c("tomato","seagreen","purple"))

# Let's look at the misclassification proportions 
# for the large generalization sample.

error.gen

# For one time I did it, cross-validation indicated 
# setting gamma to 0.9 would be good.  Using 0.9, 
# only about 24% percent of the generalization sample
# was misclassified.  (Another time I did it the c-v
# indicated gamma equal to 1.0 would be good, and yet
# another time the indication was that gamma should be 
# 1.5.  All of these choices result in good error rates.

#####

# Below I'll demonstate the plot command applied to an svm object.
# (It corresponds to the last svm fit above, for gamma = 1.5.)

plot(svm.rad, trdata)

summary(svm.rad)

#####

# Here's the plot showing the Bayes decision boundaries again.

plot(gendata[,2],gendata[,3],col=pred.bayes+2,main='Bayes Classifier',xlab='x1',ylab='x2',
      xlim=c(-3.5,4.5),ylim=c(-3.5,4.5))

#####

# Now I'll try using the polynomial kernel.  To tune,
# I'll start by considering various values for the 
# degree, keeping the other parameters at their default
# settings.

svm.poly2 <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=2)
summary(svm.poly2)

svm.poly3 <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=3)
summary(svm.poly3)

svm.poly4 <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial", degree=4)
summary(svm.poly4)

# C-v results indicate setting the degree to 3 
# works worst, with a degree of 2 and 4 both 
# both working better.  But even the best degree,
# 2, doesn't look very promising.   Nevertheless, 
# I'll try some different values of gamma to see 
# if I can get improvement with the degree equal 2.

for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial",
                  degree=2, gamma=gamma.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendata) != as.factor(gendata[,1]) )
}
error.cv <- 1 - error.cv/100

plot(gamma.val,error.gen,type="l",xlab='gamma',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 2 kernel)',
      ylim=c(0.2,0.5),col="seagreen")
points(gamma.val,error.cv,type="l",col="tomato")
abline(est.bayes.rate, 0, col="purple")
legend(0.15,0.5,legend=c("cross-val","generalization","Bayes"),
       fill=c("tomato","seagreen","purple"))

#####

# To be thorough, I'll now try altering the coef0
# parameter.  (The default value is 0.)  I'll set
# gamma to 1.5.

num.trials <- 15
coef0.val <- (-7:7)/5
error.cv <- numeric(15)
error.gen <- numeric(15)
for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial",
                  degree=2, gamma=1.5, coef0=coef0.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendata) != as.factor(gendata[,1]) )
}
error.cv <- 1 - error.cv/100

plot(coef0.val,error.gen,type="l",xlab='coef0',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 2 kernel)',
      ylim=c(0.2,0.70),col="seagreen")
points(coef0.val,error.cv,type="l",col="tomato")
abline(est.bayes.rate, 0, col="purple")
legend(0,0.7,legend=c("cross-val","generalization","Bayes"),
       fill=c("tomato","seagreen","purple"))

#####

# Let's try some larger values for coef0.

num.trials <- 15
coef0.val <- 2 + (-7:7)/5
error.cv <- numeric(15)
error.gen <- numeric(15)
for (i in 1:num.trials)
{
  svm.rad <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="polynomial",
                  degree=2, gamma=1.0, coef0=coef0.val[i])
  error.cv[i] <- svm.rad$tot.accuracy
  error.gen[i] <- mean( predict(svm.rad, gendata) != as.factor(gendata[,1]) )
}
error.cv <- 1 - error.cv/100

plot(coef0.val,error.gen,type="l",xlab='coef0',
      ylab='proportion misclassified',main='Estimated Error Rates (SVM, poly 2 kernel)',
      ylim=c(0.2,0.70),col="seagreen")
points(coef0.val,error.cv,type="l",col="tomato")
abline(est.bayes.rate, 0, col="purple")
legend(1.0,0.7,legend=c("cross-val","generalization","Bayes"),
       fill=c("tomato","seagreen","purple"))

#####

# Before quitting, I'll try the linear kernel.

svm.lin <- svm( as.factor(g) ~ ., trdata, cross=10, kernel="linear")
summary(svm.lin)

# The c-v estimate isn't promising, but I'll check the
# performance on the generalization set anyway.

mean( predict(svm.lin, gendata) != as.factor(gendata[,1]) )
# Not good.

plot(svm.lin, trdata)

#####

# Here is a summary of classification 
# performance on the generalization set.

#         Method           est. error rate
# ------------------------ ---------------

# LDA                           0.385
# SVM (linear)                  0.3845
# logistic reg (1st order)      0.3845
# LVQ                           0.3035
# K Means                       0.2755
# QDA                           0.2755
# logistic reg (3rd order)      0.275
# logitsic reg (2nd order)      0.272
# SVM (polynomial, 2)           0.27
# K Means + LVQ                 0.265
# 21 nearest neighbors          0.252
# SVM (radial)                  0.24   (about, it depends on what c-v leads you to select)

# Bayes                         0.2295


########## End ##########