# This example addresses the classification 
# setting dealt with in Sec. 2.3 of HTF2.
# But I'll apply methods from Sec. 13.2 to it in
# addition to some methods from Ch. 2 and Ch. 4.

# Developed using 2.9.0.


# I'll load libraries and set the random number seed.

library(MASS)     
library(mvtnorm)     
library(multinomRob) 
library(class)       
library(kknn)         

set.seed(7)     


# First I'll generate the training data as described on pp. 16-17 of HTF2.

Sig1 <- diag(1,2)  # Sig1 is a variance-covariance matrix.  
Bmeans <- rmvnorm(10, mean=c(1,0), sigma=Sig1) # B is used for the Blue class.
Omeans <- rmvnorm(10, mean=c(0,1), sigma=Sig1) # O is used for the Orange class.

# I'll plot the component means.

plot(Bmeans[,1],Bmeans[,2],col=4,main='Component Means',xlab='x1',ylab='x2',xlim=c(-3,4),ylim=c(-3,4))
points(Omeans[,1], Omeans[,2], col="darkorange")

#####

# For each class I'll generate 10 counts which sum to 100  
# using a multinomial (100,0.1,0.1,...,0.1) distribution. 

Bcounts <- rmultinomial(100, rep(0.1,10))
Ocounts <- rmultinomial(100, rep(0.1,10))

# Using the generated component means and 
# the generated counts as sample sizes (for
# the various components of the mixture 
# distributions), I'll generate 100 training 
# sample obervations for each class. 

Sig2 <- diag(0.2, 2)
Bxvals <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Bcounts[i]
  Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) 
}
Oxvals <- matrix(0, 100, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Ocounts[i]
  Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) 
}

# Let's look at the training data.

plot(Bxvals[,1],Bxvals[,2],col=4,main='Training Data',xlab='x1',ylab='x2',
      xlim=c(-3.5,4.5),ylim=c(-3.5,4.5))
points(Oxvals[,1], Oxvals[,2], col="darkorange")

#####

# I'll give response value of 0 for Blue class observations, and
# 1 for Orange class observations, then combine them together to
# form the training data.

Bclass <- cbind(0, Bxvals)
Oclass <- cbind(1, Oxvals)
trdata <- data.frame(rbind(Bclass, Oclass))
names(trdata) <- c("g", "x1", "x2")  

# This completes the generation of the training data.

# I'll generate a test set of 1000 additional 
# observations from each of the two classes.
# (Note: The component means are the ones used 
# to create the training data.)  I like to call
# this data the generalization sample, because I 
# will only use it to get good estimates of the 
# error rate of various fitted classifiers for 
# future observations.  Some call it a test sample, 
# but I like to think of a test sample as data I have 
# during the training/fitting process that I have
# set aside so that I can avoid using things like 
# c-v and AIC for model/parameter selection. 

Bcounts <- rmultinomial(1000, rep(0.1,10))
Ocounts <- rmultinomial(1000, rep(0.1,10))
Bxvals <- matrix(0, 1000, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Bcounts[i]
  Bxvals[start:finish,] <- rmvnorm(Bcounts[i], mean=Bmeans[i,], sigma=Sig2) 
}
Oxvals <- matrix(0, 1000, 2)
finish <- 0
for (i in 1:10) 
{
  start <- finish + 1
  finish <- finish + Ocounts[i]
  Oxvals[start:finish,] <- rmvnorm(Ocounts[i], mean=Omeans[i,], sigma=Sig2) 
}
Bclass <- cbind(0, Bxvals)
Oclass <- cbind(1, Oxvals)
gendata <- data.frame(rbind(Bclass, Oclass))
names(gendata) <- c("g", "x1", "x2")

# I'll obtain an estimate of the Bayes error 
# rate using the large generalization sample.

Bpdf <-rep(0,2000)
Opdf <-rep (0,2000)
for (i in 1:10) 
{
  Bpdf <- Bpdf + dmvnorm(gendata[,2:3], Bmeans[i,], Sig2)
  Opdf <- Opdf + dmvnorm(gendata[,2:3], Omeans[i,], Sig2)
}
pred.bayes <- ifelse(Bpdf > Opdf, 0, 1)
est.bayes.rate <- mean(pred.bayes != gendata[,1])

est.bayes.rate
# This is the estimated error rate of the Bayes classifier.

#####

# I'll use c-v to selct a good number of nearest neighbors
# to use for "vanilla" nearest neighbors classification.
cvknn <- train.kknn(as.factor(g) ~ ., trdata, 
          kmax=49, kernel="rectangular", distance=2) 
plot(c(1:49),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',col=4) 

#####

# Looks like k = 29 is a good choice.  Let's use the large
# generalization sample to estimate the error rates for 
# future obnservations (the generalization rates).  Also,
# I'll add a line showing the (estimated) Bayes error rate.

gen.err <- numeric(49)
for (i in 1:49)
{
  pred.gen <- knn(trdata[,-1], gendata[,-1], cl=trdata[,1], k=i)
  gen.err[i] <- mean(pred.gen != gendata[,1])
}
points(c(1:49), gen.err, type="l", col=3) 
abline(est.bayes.rate, 0, col="purple")
legend(32,0.32,legend=c("cross-val","generalization","Bayes"),fill=c(4,3,"purple"))

# The c-v estimates of error are generally optimistic, but the
# indicated choice of k=29 isn't bad since it appears that any
# k between 5 and 30 will get you fairly close to the Bayes rate
# (estimated to be 0.253).

#####

# Now I will use the training data to fit a 
# least squares regression model, using the 
# 0/1 class labels as the response var, to
# determine the predicted class for each 
# observation using (2.7) on p. 12 of HTF2.

ols <- lm(g ~ x1 + x2, data=trdata)

# Let's look at the data and the 
# linear decision boundary together.

plot(trdata[1:100,2], trdata[1:100,3], col=4, main='1st-order OLS', 
      xlab='x1',ylab='x2', xlim=c(-2.5,4),ylim=c(-1.2,3.5))
points(trdata[101:200,2], trdata[101:200,3], col="darkorange") 
abline((0.5 - ols$coef[1])/ols$coef[3], - ols$coef[2]/ols$coef[3], col=8)

#####

# A linear classifier is going to have trouble 
# in this setting.  Let's determine the generali-
# zation error rate for this classifier.

pred.ols <- predict(ols, newdata=gendata)
ols.err <- mean( ifelse(pred.ols > 0.5, 1, 0) != gendata[,1] ) 

ols.err
# This is the estimated 1st-order OLS classifier 
# error rate (on future cases) ... nearly twice 
# the Bayes rate, and not much better than using
# a coin flip to make predictions, or simply 
# predicting all future cases to be blue (or all
# future cases to be orange).

#####

# Now let's check out classification using LDA.
# Since we have equal sample sizes in the training 
# data, the classifier should be identical to the 
# one from the regression method.

trlda <- lda(g ~ . , trdata)
pred.trlda <- predict(trlda, newdata=gendata)
lda.rate <- mean( pred.trlda$class != gendata[,1] )

lda.rate
# LDA rate.
# (Same as rate from regression fit.)

#####

# Now QDA.

trqda <- qda(g ~ . , trdata)
pred.trqda <- predict(trqda, newdata=gendata)
qda.rate <- mean( pred.trqda$class != gendata[,1] )

qda.rate
# QDA rate.
# (Better.)

#####

# Now I'll try 1st-, 2nd-, and 3rd-order logisitic 
# regression models.

logreg1 <- glm(g ~ ., data=trdata, family=binomial)
summary(logreg1)
pred.logreg1 <- predict(logreg1, newdata=gendata, type="response")
err.logreg1 <-  mean(round(pred.logreg1) != gendata[,1])

err.logreg1
# error rate for 1st-order logistic regression model

# This error rate is similar to LDA error rate.  Even though
# coefficient for x2 isn't even significant in 1st-order model,
# I'll try a full 2nd-order model.
 
logreg2 <- glm(g ~ x1 + x2 + I(x1^2) + I(x2^2)
                    + I(x1*x2), data=trdata, family=binomial)
summary(logreg2)
pred.logreg2 <- predict(logreg2, newdata=gendata, type="response")
err.logreg2 <-  mean(round(pred.logreg2) != gendata[,1])

err.logreg2
# error rate for 2nd-order logistic regression model

# Much better.  Even though x2 didn't look important 
# in output from 1st-order model, the 2nd-order terms 
# involving x2 were higly significant in 2nd-order model.

# Let's try a 3rd-order model ... 200 observations ought  
# to be enough to fit 8 parameters decently.

logreg3 <- glm(g ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2) 
                    + I(x1^3) + I(x2^3) +I(x1*x2^2) 
                    + I(x2*x1^2), data=trdata, family=binomial)
summary(logreg3)
pred.logreg3 <- predict(logreg3, newdata=gendata, type="response")
err.logreg3 <-  mean(round(pred.logreg3) != gendata[,1])

err.logreg3
# error rate for 3rd-order logistic regression model

# Fairly close to the estimated Bayes rate.  Let's look 
# at AIC values and error rates together.

#        model   AIC   error
#        -----  -----  -----
#         1st    240   0.44
#         2nd    199   0.29
#         3rd    179   0.26

# This time we get
#    the smaller the AIC, the smaller the error rate.

#####

# Now I want to try the K-means clustering 
# method from subsection 13.2.1 of HTF2.

KMeansClassRandom <- function(traindata, testdata, n.comp)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    clusters <- kmeans(classdata[,-1],n.comp,iter.max=15,nstart=5)
    class.prototypes <- cbind(g, clusters$centers)
    prototypes <- rbind(prototypes, class.prototypes)
  }
  names(prototypes) <- names(traindata)
  predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1)
  test.error <-  mean( predict.class != testdata[,1] )
  test.error
}

KMeansClassRandom(trdata, gendata, 10)
# error rate for 10 clusters per class

KMeansClassRandom(trdata, gendata, 5)
# error rate for 5 clusters per class

KMeansClassRandom(trdata, gendata, 15)
# error rate for 15 clusters per class

# The error rate with 5 clusters per class
# is great.  But how would we pick a good
# number of clusters per class to use if
# all we have is the training data?  
# (Experience perhaps; or do we go to the
# trouble of writing a c-v routine?)

# Before moving on, let's try it with 5 
# clusters per class again.  (We might 
# have gotten real lucky with the random 
# numbers the first time.)

KMeansClassRandom(trdata, gendata, 5)

# Still pretty good, but not as close to 
# the Bayes rate as last time.

#####

# Now I want to try the LVQ method 
# from subsection 13.2.2 of HTF2.

LVQrandom <- function(traindata,testdata,n.pro,iterations,initial.epsilon)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    n.cases <- length(classdata[,1])
    selected <- sample.int(n.cases, n.pro)
    class.prototypes <- classdata[selected,]
    prototypes <- rbind(prototypes, class.prototypes)
  }


  train.size <- length(traindata[,1])
  plot(traindata[,2],traindata[,3],col=(2+traindata[,1]),pch=20,
      main='Drifting Prototypes',xlab='1st predictor',ylab='2nd predictor',
      xlim=c(min(traindata[,2]),max(traindata[,2])),ylim=c(min(traindata[,3]),max(traindata[,3])))
  for (i in 1:iterations)
  {
    epsilon <- initial.epsilon/i
    case <- sample.int(train.size, 1)
    distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1])))
    closest <- as.integer(which.min(distances[-1,1]))
    ifelse(prototypes[closest,1]==traindata[case,1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1])
    points(prototypes[closest,2],prototypes[closest,3],col=(2+prototypes[closest,1]))
  }   
  predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1)
  test.error <-  mean( predict.class != testdata[,1] )
  test.error
}

LVQrandom(trdata,gendata,10,400,0.5)

#####

# Not that good.  I'll change the number
# of prototypes per class and the value of 
# initial.epsilon.

LVQrandom(trdata,gendata,5,400,0.75)

#####

# A lot of the performance may be due to the random selection 
# of the initial prototypes.  I'll try using more of them in
# case some of the chosen ones are bad in some sense.

LVQrandom(trdata,gendata,15,400,0.75)

#####

# Pretty good!  I'll try it one more time, only
# increasing the number of iterations to make
# epsilon decrease towards 0 slower.

LVQrandom(trdata,gendata,15,900,0.75)

#####

# Not as good.  Now I'll go back to just 400
# iterations (since maybe the quicker decrease 
# is better).

LVQrandom(trdata,gendata,15,400,0.75)

# Well, isn't that a kick in the pants!
# I guess a lesson to be learned here is 
# that the performance of any method that 
# uses randomness can really depend on the 
# random number sequence.  In practice, w/o
# having a large test/generalization sample 
# to guide the selection of parameters, how 
# should we determine what parameter values
# to use?  (Maybe write a cross validation
# routine.)

# Idea for *future research*: Since bad randomly 
# chosen starting prototypes may be a big part of 
# why the LVQ method may perform poorly at times, 
# why not combine the K-means starting points of 
# the previously considered method with the move-
# ment of the prototypes away from points in other 
# classes from this method?

#####

# The future is now!  Let's do it.

KMeansPlusLVQ <- function(traindata,testdata,n.pro,iterations,initial.epsilon)
# 1st col. of each data set should be class variable
# (and classes should be labeled with consectutive
# nonnegative integers).
# Remaining col's of each set should be predictor var's.
{
  first.class <- min(traindata[,1])
  last.class <- max(traindata[,1])
  prototypes <- NULL
  for (g in first.class:last.class)
  {
    cases <- traindata[,1] == g
    classdata <- traindata[cases,]  
    clusters <- kmeans(classdata[,-1],n.pro,iter.max=15,nstart=5)
    class.prototypes <- cbind(g, clusters$centers)
    prototypes <- rbind(prototypes, class.prototypes)
  }
  prototypes <- data.frame(prototypes)
  train.size <- length(traindata[,1])
  plot(traindata[,2],traindata[,3],col=(2+traindata[,1]),pch=20,
      main='Drifting Prototypes',xlab='1st predictor',ylab='2nd predictor',
      xlim=c(min(traindata[,2]),max(traindata[,2])),ylim=c(min(traindata[,3]),max(traindata[,3])))
  points(prototypes[,2],prototypes[,3],col=(2+prototypes[,1]),pch=11)
  for (i in 1:iterations)
  {
    epsilon <- initial.epsilon/i
    case <- sample.int(train.size, 1)
    distances <- as.matrix(dist(rbind(traindata[case,-1],prototypes[,-1])))
    closest <- as.integer(which.min(distances[-1,1]))
    ifelse(prototypes[closest,1]==traindata[case,1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] + epsilon*traindata[case,-1],
      prototypes[closest,-1] <- (1 - epsilon)*prototypes[closest,-1] - epsilon*traindata[case,-1])
    points(prototypes[closest,2],prototypes[closest,3],col=(2+prototypes[closest,1]))
  }   
  predict.class <- knn(prototypes[,-1], testdata[,-1], prototypes[,1], k=1)
  test.error <-  mean( predict.class != testdata[,1] )
  test.error
}

KMeansPlusLVQ(trdata,gendata,10,400,0.75)

# Does anyone know what the warning message means,
# and if it indicates a real problem?


########## End ##########