# This example deals with an extension of the 
# classification setting dealt with in Sec. 2.3 
# of HTF2.  I'll apply methods from Sec. 13.2 to 
# it in addition to some methods from Ch. 2 and 
# Ch. 4.

# Developed using 2.9.0.


# I'll load libraries and set the random number seed.

library(MASS)     
library(mvtnorm)     
library(multinomRob) 
library(class)       
library(kknn)  
library(nnet)
       

set.seed(7)     


# First I'll generate and plot the component means.

Sig1 <- diag(1,2)  # Sig1 is a variance-covariance matrix.  
Omeans <- rmvnorm(10, mean=c(0,1), sigma=Sig1) # O is used for the Orange class.
Gmeans <- rmvnorm(10, mean=c(0,0), sigma=Sig1) # G is used for the Green class.
Bmeans <- rmvnorm(10, mean=c(1,0), sigma=Sig1) # B is used for the Blue class.

plot(Omeans[,1],Omeans[,2],col="darkorange",main='Component Means',xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4))
points(Gmeans[,1], Gmeans[,2], col="limegreen")
points(Bmeans[,1], Bmeans[,2], col="royalblue")

#####

# For each class I'll generate 10 counts which sum to 100  
# using a multinomial (100,0.1,0.1,...,0.1) distribution.
 
Ocounts <- rmultinomial(100, rep(0.1,10))
Gcounts <- rmultinomial(100, rep(0.1,10))
Bcounts <- rmultinomial(100, rep(0.1,10))

# Using the generated component means and 
# the generated counts as sample sizes (for
# the various components of the mixture 
# distributions), I'll generate 100 training 
# sample obervations for each class. 

Sig2 <- diag(0.2, 2)
Oxvals <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Ocounts[i]
  Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) 
}
Gxvals <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Gcounts[i]
  Gxvals[start:finish,] <- rmvnorm(Gcounts[i], mean=Gmeans[i,], sigma=Sig2) 
}
Bxvals <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Bcounts[i]
  Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) 
}

# Let's look at the training data.

plot(Oxvals[,1],Oxvals[,2],col="darkorange",main='Training Data',xlab='x1',ylab='x2',
      xlim=c(-3.5,4.5),ylim=c(-3.5,4.5))
points(Gxvals[,1], Gxvals[,2], col="limegreen")
points(Bxvals[,1], Bxvals[,2], col="royalblue")

#####

# I'll give response value of 0 for Orange class observations, 
# 1 for Green class observations, and 2 for the Blue class
# observations, and then combine them together to form the 
# training data.

Oclass <- cbind(0, Oxvals)
Gclass <- cbind(1, Gxvals)
Bclass <- cbind(2, Bxvals)
trdata <- data.frame(rbind(Oclass, Gclass, Bclass))
names(trdata) <- c("g", "x1", "x2")  

# This completes the generation of the training data.

# I'll generate a test set of 1000 additional 
# observations from each of the three classes.
# (Note: The component means are the ones used 
# to create the training data.)  I like to call
# this data the generalization sample, because I 
# will only use it to get good estimates of the 
# error rate of various fitted classifiers for 
# future observations.  Some call it a test sample, 
# but I like to think of a test sample as data I have 
# during the training/fitting process that I have
# set aside so that I can avoid using things like 
# c-v and AIC for model/parameter selection. 

Ocounts <- rmultinomial(1000, rep(0.1,10))
Gcounts <- rmultinomial(1000, rep(0.1,10))
Bcounts <- rmultinomial(1000, rep(0.1,10))
Oxvals <- matrix(0, 1000, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Ocounts[i]
  Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) 
}
Gxvals <- matrix(0, 1000, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Gcounts[i]
  Gxvals[start:finish,] <- rmvnorm(Gcounts[i], mean=Gmeans[i,], sigma=Sig2) 
}
Bxvals <- matrix(0, 1000, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Bcounts[i]
  Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) 
}
Oclass <- cbind(0, Oxvals)
Gclass <- cbind(1, Gxvals)
Bclass <- cbind(2, Bxvals)
gendata <- data.frame(rbind(Oclass, Gclass, Bclass))
names(gendata) <- c("g", "x1", "x2")

# I'll obtain an estimate of the Bayes error 
# rate using the large generalization sample.

Opdf <-rep(0,3000)
Gpdf <-rep(0,3000)
Bpdf <-rep(0,3000)
for (i in 1:10) 
{
  Opdf <- Opdf + dmvnorm(gendata[,2:3], Omeans[i,], Sig2) 
  Gpdf <- Gpdf + dmvnorm(gendata[,2:3], Gmeans[i,], Sig2)
  Bpdf <- Bpdf + dmvnorm(gendata[,2:3], Bmeans[i,], Sig2)
}
pred.bayes <- ifelse(Gpdf > Bpdf, 1, 2)
pred.bayes <- ifelse(Opdf > Gpdf & Opdf > Bpdf, 0, pred.bayes)
est.bayes.rate <- mean(pred.bayes != gendata[,1])

est.bayes.rate
# This is the estimated error rate of the Bayes classifier.

# Below is a confusion matrix, with true class for the 
# rows and predicted class for the columns.  The Green
# class is the most difficult one to correctly predict.

table(gendata[,1], pred.bayes)

#####

# I'll use c-v to select a good number of nearest neighbors
# to use for "vanilla" nearest neighbors classification.
cvknn <- train.kknn(as.factor(g) ~ ., trdata, 
          kmax=75, kernel="rectangular", distance=2) 
plot(c(1:75),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.27,0.45),col="cornflowerblue") 

which.min(cvknn$MISCLASS)
cvknn$MISCLASS[17:30]
# k values of 18, 20, and 28 produce a tie for the 
# fewest misclassifications.  Since the "true curve"
# (as opposed to the estimated curve we get from c-v)
# may be smoother and concave upward with a single
# minimum, I'd choose to use k = 19 (or 21) to classify 
# future observations.  (I think an odd value for k is 
# better, since in regions where just two of the classes 
# are heavily competing, an odd value of k should lead to
# fewer ties being randomly broken.)  One can also note
# that some of the c-v estimate are lower than the Bayes
# rate estimate obtained from the large generalization
# sample.

#####(go ahead and start next chunk --- plenty of time to look at graph)#####

# Let's use the large generalization sample to estimate 
# the error rates for future obnservations (the general-
# ization rates).  Also, I'll add a line showing the 
# (estimated) Bayes error rate.

# Note: Since the function train.kknn automatically scales 
# the data, I'll use the the kknn function to do nearest 
# neighbors since it scales the data as well.  I'll also 
# try it with unscaled data using the knn function.

gen.err.s <- numeric(75)
gen.err.u <- numeric(75)
for (i in 1:75)
{
  pred.gen.s <- kknn(as.factor(g)~.,train=trdata,test=gendata,k=i)
  gen.err.s[i] <- mean(pred.gen.s$fit != gendata[,1])
  pred.gen.u <- knn(trdata[,-1], gendata[,-1], cl=trdata[,1], k=i)
  gen.err.u[i] <- mean(pred.gen.u != gendata[,1])

}
points(c(1:75), gen.err.s, type="l", col="seagreen") 
points(c(1:75), gen.err.u, type="l", col="olivedrab")
abline(est.bayes.rate, 0, col="violetred")
legend(2,0.45,legend=c("generalization (unscaled)",
       "generalization (scaled)","cross-val","Bayes"),
       fill=c("olivedrab","seagreen","cornflowerblue","violetred"))

# Note that the green curves for scaled and unscaled differ
# appreciably for larger values of k.  I wouldn't think that 
# scaling would have a large effect.  Let's look at the standard 
# deviations of the two predictor variables and see how different 
# they are.

sd(trdata[,2])
# sample standard deviation of x1

sd(trdata[,3])
# sample standard deviation of x2

# Not too different.

# Below is the error rate for value of k I picked
# after lookeding at the cross-validation results.

gen.err.s[19]
# error rate for 19nn classification
# (using scaled data)

gen.err.u[19]
# error rate for 19nn classification
# (using unscaled data)

#####

# I'll try using weighted nearest neighbors
# (with triangular weight function).

cvknn <- train.kknn(as.factor(g) ~ ., trdata, 
          kmax=75, kernel="triangular", distance=2) 
plot(c(1:75),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',
      ylim=c(0.26,0.37),col="cornflowerblue") 

# k values of 9, 10, 13, and 14 tie for producing 
# the fewest misclassifications.  Based on this,
# I'll pick k = 11.  Let's see what proportion of
# the generalization sample is misclassified with 
# this choice.

weighted11nn <- kknn(as.factor(g) ~ ., train=trdata, test=gendata, 
          k=11, kernel="triangular", distance=2)
mean(weighted11nn$fit != gendata[,1])

# A tad worse than vanilla nearest neighbors.

#####

# Now let's check out classification using LDA.

trlda <- lda(g ~ . , trdata)
pred.trlda <- predict(trlda, newdata=gendata)
lda.rate <- mean( pred.trlda$class != gendata[,1] )

lda.rate
# LDA rate.
# (A little worse than 19nn.)

#####

# Now QDA.

trqda <- qda(g ~ . , trdata)
pred.trqda <- predict(trqda, newdata=gendata)
qda.rate <- mean( pred.trqda$class != gendata[,1] )

qda.rate
# QDA rate.
# (A bit better.)

#####

# Now I'll try 1st-, 2nd-, and 3rd-order logisitic 
# regression models.

logreg.fit1 <- multinom(as.factor(g) ~ ., data=trdata)
summary(logreg.fit1, Wald=T)

# Let's see how well the fitted model predicts the  
# classes for the generalization set.

pred.logreg <- predict(logreg.fit1, newdata=gendata, type="class")
logreg.rate <-  mean( pred.logreg != gendata[,1] )
logreg.rate
# error rate for 1st-order logistic regression model

# I'll try a full 2nd-order model.
 
logreg.fit2 <- multinom(as.factor(g) ~ x1 + x2 + I(x1^2) + I(x2^2)
                                      + I(x1*x2), data=trdata)
summary(logreg.fit2, Wald=T)
pred.logreg <- predict(logreg.fit2, newdata=gendata, type="class")
logreg.rate <-  mean( pred.logreg != gendata[,1] )
logreg.rate
# error rate for 2nd-order logistic regression model

# Best so far (by just a wee bit).
 
# I'll try a 3rd-order model.

logreg.fit3 <- multinom(as.factor(g) ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) 
                    + I(x1^3) + I(x2^3) +I(x1*x2^2) 
                    + I(x2*x1^2), data=trdata)
summary(logreg.fit3, Wald=T)
pred.logreg <- predict(logreg.fit3, newdata=gendata, type="class")
logreg.rate <-  mean( pred.logreg != gendata[,1] )
logreg.rate
# error rate for 3rd-order logistic regression model

# Let's look at AIC values and error rates together.

#        model   AIC   error
#        -----  -----  -----
#         1st    473   0.372
#         2nd    448   0.335
#         3rd    397   0.345

#####(go ahead and start next chunk --- then look at log reg results)#####

# Now I'll define a function that uses c-v with the 
# K-means clustering method from subsection 13.2.1 
# of HTF2.  The user supplies a vector of proportions
# that determines the numbers of clusters per class
# that will be compared using 10-fold c-v estimates 
# of error rates.  The second of the two functions 
# defined below is the one that does the c-v.  The 
# first of the two functions is used by the second
# of the two functions.

KMeansClassMod <- function(traindata, testdata, n.comp)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).  Remaining col's of each set 
# should be predictor var's.  n.comp should be a vector
# containing the numbers of clusters to use for each of 
# the classes.  To avoid an error with the kmeans function,
# if the number of clusters specified is less than 2, the
# number will be increased to 2.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    num.clusters <- max(n.comp[g-first.class+1], 2)
    clusters <- kmeans(classdata[,-1],num.clusters,iter.max=15,nstart=5)
    class.prototypes <- cbind(g, clusters$centers)
    prototypes <- rbind(prototypes, class.prototypes)
  }
  prototypes <- data.frame(prototypes)
  names(prototypes) <- names(traindata)
  predict.class <- kknn(as.factor(g)~.,train=prototypes,test=testdata,k=1)
  test.error <-  mean( predict.class$fit != testdata[,1] )
  test.error
}

KMeansClassCV <- function(data, proportions)
# The proportions vector determines the numbers of clusters
# per class that will be compared using 10-fold c-v estimates
# of error rates.  E.g., if the proportions vector is c(0.05,
# 0.1, 0.2), for each class about 0.05*n_j clusters will be used 
# for K-means classification the first trial, 0.1*n_j clusters
# will be used the second trial, and 0.2*n_j clusters will be
# used the third trial, where n_j is the number of training
# set cases for the class.  (An exception is that if the number
# of clusters would be less than 2, it is made to be 2.  This 
# is done to avoid an error with the kmeans function.)  The 1st 
# column of the data set should be class variable (and classes 
# should be labeled with consectutive nonnegative integers).  
# Remaining columns should be predictor var's.
{
  n.total <- length(data[,1])
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  num.classes <- last.class - first.class + 1
  n.class <- numeric(num.classes)
  for (j in first.class:last.class)
  {
    class.cases <- (data[,1] == j)
    n.class[j-first.class+1] <- length(data[class.cases,1])

  }
  fold.size <- ceiling(n.total/10)
  fold.id <- rep(c(1:10), fold.size)
  fold.id <- sample(fold.id, n.total)
  num.trials <- length(proportions)
  errors <- rep(0, num.trials)
  for (i in 1:num.trials)
  {  
    num.clusters <- round(n.class*proportions[i])
    for (fold in 1:10)
    {
      fold.cases <- (fold.id == fold)
      errors[i] <- errors[i] +
                   length(data[fold.cases,1])*KMeansClassMod(data[!fold.cases,],data[fold.cases,],num.clusters)
    }
  }
  err.rate <- errors/n.total
  results <- cbind(proportions, err.rate)
  results
}
    
KMeansClassCV(trdata, c(0.05, 0.1, 0.15, 0.2, 0.25))
# Note: Originally I used kkn in my functions, and I 
# didn't get any warnings.  But then I decided to use
# kknn (so that the data would be scaled automatically),
# which required that I make the set of prototypes into
# a data frame, and then I got warnings.  I don't think
# the warnings indicate a serious problem.

# I'll try some smaller proportions.

KMeansClassCV(trdata, c(0.01,0.02,0.03,0.04,0.05))

# A proportion of 0.04 corresponds to 4 clusters per class. 
# But since 4 is what worked best during the 10-fold c-v, 
# when the average sample size per class was 90, it could 
# be that with 100 per class in the entire training sample,
# 5 clusters per class will work good.  Altogether, I'll check
# out 3, 4, 5, and 6 clusters per class for a comparison.

KMeansClassMod(trdata,gendata,c(3,3,3))
KMeansClassMod(trdata,gendata,c(4,4,4))
KMeansClassMod(trdata,gendata,c(5,5,5))
KMeansClassMod(trdata,gendata,c(6,6,6))

# 4 and 5 clusters per class worked best (of these 
# choices).  The error rates are close to the best 
# so far, but lots of things seem to be doing about
# the same (and giving error rates not too far above
# the Bayes rate).

#####

# Since some of the following parts take a long 
# time to run, I'm going to reset the random number 
# seed so that before the seminar meeting I can run 
# these parts first, and then go back and run the 
# earlier stuff during the seminar presentation.

set.seed(12)

# Now I want to use c-v with the LVQ method 
# from subsection 13.2.2 of HTF2.

LVQCV <- function(data, proportions, num.it, init.ep)
# The proportions vector determines the numbers of clusters
# per class that will be compared using 10-fold c-v estimates
# of error rates.  E.g., if the proportions vector is c(0.05,
# 0.1, 0.2), for each class about 0.05*n_j clusters will be used 
# for K-means classification the first trial, 0.1*n_j clusters
# will be used the second trial, and 0.2*n_j clusters will be
# used the third trial, where n_j is the number of training
# set cases for the class.  The first column of the data set 
# should be class variable (and classes should be labeled with 
# consectutive nonnegative integers).  Remaining columns should 
# be predictor var's.
{
  n.total <- length(data[,1])
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  num.classes <- last.class - first.class + 1
  n.class <- numeric(num.classes)
  for (j in first.class:last.class)
  {
    class.cases <- (data[,1] == j)
    n.class[j-first.class+1] <- length(data[class.cases,1])
  }
  fold.size <- ceiling(n.total/10)
  fold.id <- rep(c(1:10), fold.size)
  fold.id <- sample(fold.id, n.total)
  num.trials <- length(proportions)
  errors <- rep(0, num.trials)
  for (i in 1:num.trials)
  {  
    num.pro <- round(n.class*proportions[i])
    for (fold in 1:10)
    {
      fold.cases <- (fold.id == fold)
      errors[i] <- errors[i] +
                   length(data[fold.cases,1])*LVQMod(data[!fold.cases,],data[fold.cases,],num.pro,num.it,init.ep)
    }
  }
  err.rate <- errors/n.total
  results <- cbind(proportions, err.rate)
  results
}

LVQMod <- function(traindata,testdata,n.pro,iterations,initial.epsilon)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    n.cases <- length(classdata[,1])
    selected <- sample.int(n.cases, n.pro[g-first.class+1])
    class.prototypes <- classdata[selected,]
    prototypes <- rbind(prototypes, class.prototypes)
  }
  train.size <- length(traindata[,1])
  for (i in 1:iterations)
  {
    epsilon <- initial.epsilon/i
    case <- sample.int(train.size, 1)
    distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1])))
    closest <- as.integer(which.min(distances[-1,1]))
    ifelse(prototypes[closest,1]==traindata[case,1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1])
  }   
  predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1)
  errors <- mean( predict.class != testdata[,1] )
  errors
}

LVQCV(trdata, c(0.15, 0.16, 0.17, 0.18, 0.19), 700, 0.5)

# I tried other combinations of proportions and parameter
# settings, and a proportion of 0.16 with the settings as
# above worked best.

# A proportion of 0.16 corresponds to 16 clusters per class. 
# But since 16 is what worked best during the 10-fold c-v, 
# when the average sample size per class was 90, it could 
# be that with 100 per class in the entire training sample,
# 17 pr 18 clusters per class will work better.  Altogether, 
# I'll check out 16, 17, and 18 clusters per class for a 
# comparison.

LVQMod(trdata,gendata,c(16,16,16),700,0.5)
LVQMod(trdata,gendata,c(17,17,17),700,0.5)
LVQMod(trdata,gendata,c(18,18,18),700,0.5)

# 16 prototypes per class worked best.

#####

# Now I'll combine the K-means starting points  
# with the movement of the prototypes used in
# the LVQ method.

KMeansPlusLVQCV <- function(data,proportions,num.it,init.ep)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).
# Remaining col's of each set should be predictor var's.
{
  n.total <- length(data[,1])
  first.class <- min(data[,1])
  last.class <- max(data[,1])
  num.classes <- last.class - first.class + 1
  n.class <- numeric(num.classes)
  for (j in first.class:last.class)
  {
    class.cases <- (data[,1] == j)
    n.class[j-first.class+1] <- length(data[class.cases,1])
  }
  fold.size <- ceiling(n.total/10)
  fold.id <- rep(c(1:10), fold.size)
  fold.id <- sample(fold.id, n.total)
  num.trials <- length(proportions)
  errors <- rep(0, num.trials)
  for (i in 1:num.trials)
  {  
    num.pro <- round(n.class*proportions[i])
    for (fold in 1:10)
    {
      fold.cases <- (fold.id == fold)
      errors[i] <- errors[i] +
                   length(data[fold.cases,1])*KMeansPlusLVQMod(data[!fold.cases,],data[fold.cases,],num.pro,num.it,init.ep)

    }
  }
  err.rate <- errors/n.total
  results <- cbind(proportions, err.rate)
  results
}
 
KMeansPlusLVQMod <- function(traindata,testdata,n.pro,iterations,initial.epsilon)
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    clusters <- kmeans(classdata[,-1],n.pro[g-first.class+1],iter.max=15,nstart=5)
    class.prototypes <- cbind(g, clusters$centers)
    prototypes <- rbind(prototypes, class.prototypes)
  }
  prototypes <- data.frame(prototypes)
  train.size <- length(traindata[,1])
  for (i in 1:iterations)
  {
    epsilon <- initial.epsilon/i
    case <- sample.int(train.size, 1)
    distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1])))
    closest <- as.integer(which.min(distances[-1,1]))
    ifelse(prototypes[closest,1]==traindata[case,1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1])
      }   
  predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1)
  test.error <-  mean( predict.class != testdata[,1] )
  test.error
}

KMeansPlusLVQCV(trdata,c(0.05,0.1,0.15,0.2),500,0.5)

# I tried other combinations of proportions and parameter
# settings, and a proportion of 0.04 with the settings as
# above worked best.

KMeansPlusLVQMod(trdata,gendata,c(4,4,4),500,0.5)
KMeansPlusLVQMod(trdata,gendata,c(5,5,5),500,0.5)


########## End ##########