# R Code (and assorted comments) for Overview of Classification Methods


library(mvtnorm)      # for generating multivariate normal values
library(multinomRob ) # for generating multinomial dist'n values
library(klaR)         # for naive Bayes classification
library(kknn)         # for kknn and train.kknn f'ns (for KNN)

# I'll set the seed for the random number generator.
set.seed(17)  


# Before listing (and grouping) some classification methods, I think 
# it'll be good to point out that for iid observations
#                        the Bayes classifier
# is an optimal classifier.  For each query point x it gives the 
# predicted class to be
#                        argmax f_j ( x ) * pi_j,
# where
#        f_j   is the pdf for the jth class
#     &  pi_j  is the a priori probability that an obs comes from class j.

# For example, in a two-class problem for which an observation is equally
# likely to come from either class, the Bayes classifier 
#        predicts class 1  if  f_1 ( x )  >  f_2 ( x )
#     &  predicts class 2  if  f_2 ( 2 )  >  f_1 ( x )
# (and arbitrarily predicts a class if f_1(x) = f_2(x)).  But if an obser-
# vation is twice as likely to come from class 1 as class 2,  i.e.,
#                         pi_1 = 2/3  &  pi_2 = 1/3,
# the Bayes classifier predicts class 1 if 
#                         2 f_1(x)  >  f_2(x).


# For a specific example, let 
#                             pi_1 = pi_2 = 1/2,
# and let the density underlying each class be a bivariate normal pdf 
# having covariance matrix I (the identity matrix).  For class 1, let 
# the mean vector be 
#                                ( 2, 0 )
# and for class 2 let the mean vector be 
#                                ( 0, 2 ).

# Let's plot the two pdfs.
x1plot <- seq(-3, 8, by=0.1)
x2plot <- seq(-3, 8, by=0.1)
Sig <- diag(1,2)
maxpdf <- function(x1, x2){mat=cbind(x1,x2); pmax(dmvnorm(mat, c(2,0), Sig),
                                                  dmvnorm(mat, c(0,2), Sig))}
pdfplot <- outer(x1plot, x2plot, maxpdf)
persp(x1plot, x2plot, pdfplot, xlab="x1", ylab="x2", zlab="pdf", 
theta=-30, col="gold")

###

## Possibly Skip ##
 # Below is a simpler example of using the perspective plot function.
   x1plot <- seq(1, 3, by=0.05)
   x2plot <- seq(1, 3, by=0.05)
   mresp <- function(x1,x2) { sqrt(x2/x1) }
   mrplot <- outer(x1plot, x2plot, mresp)
   persp(x1plot, x2plot, mrplot, xlab="x1", ylab="x2", zlab="mean response", 
         theta=60, col="gold")
## End of Possible Skip ##

# For this simple two-class problem involving bivariate normal populations,
# one could determine the error rate of the Bayes classifier (aka the Bayes 
# rate) analytically.  We can also obtain a Monte Carlo estimate (done below).

x <- rmvnorm(10000, mean=c(2,0), sigma=Sig) 
mean( x[,1] < x[,2] )
# Monte Carlo estimate of Bayes rate

pnorm( -sqrt( 2 ) )
# Bayes rate (exact)

###

# It should be noted that while the Bayes classifier is optimal, 
# we usually don't have all of the information we need to implement 
# it; the class densities and/or the prior probabilities are typically
# unknown.

# The 1984 CART book (p. 15) states 
#   "The three most commonly used classification procedures
#    
#    Disriminant analysis
#    Kernel density estimation
#    Kth nearest neighbor
#
#    attempt, in different ways, to approximate the Bayes rule".

# Classification using density estimation is the most direct way 
# approximate the Bayesian classifier; one simply substitutes 
# estimates of the densities and prior probabilities for the 
# unknowns.  


# I'll generate data for the simple classification problem
# introduced above.  I'll first generate a training sample 
# of 100 observations.

n1 <- rbinom(1, 100, 0.5)
n1
# Number of Class 1 observations in training sample.

X1 <- rmvnorm(n1, mean=c(2,0), sigma=Sig)
X2 <- rmvnorm(100-n1, mean=c(0,2), sigma=Sig)
X <- rbind(X1, X2)
g <- c( rep(1, n1), rep(2, 100-n1) )
trndat <- data.frame( cbind(X, g) )
names(trndat) <- c("x1", "x2", "g")

dim(trndat)
head(trndat)
plot(trndat[,1], trndat[,2], col=4-trndat[,3], main='training data', xlab='x1', ylab='x2')

# Now I'll generate some data (the same way, but independently) 
# to serve as a "generalization sample" (to provide estimates
# of the error rates for the various classification methods
# considered).  I'll put 5000 observations from each class in
# the generalization sample.  (This seems better than making 
# the class sample sizes random.) 

X1 <- rmvnorm(5000, mean=c(2,0), sigma=Sig)
X2 <- rmvnorm(5000, mean=c(0,2), sigma=Sig)
X <- rbind(X1, X2)
g <- c( rep(1, 5000), rep(2, 5000) )
gendat <- data.frame( cbind(X, g) )
names(gendat) <- names(trndat)
 
###

# I'll use two-dimensional kernel density estimates
# to classify by directly approximating the Bayes 
# classifier.
# (Note: The kde2d function used below will only work
# if there are two predictors.  If there is just one
# predictor, then R's density function can be used to
# obtain a density estimate.  Unfortunately, there 
# doesn't seem to be a function which will do kernel
# density estimation for dimensions greater than 2
# (although in a lot of cases I suppose it is the case
# that for dimensions greater than 2 the method doesn't
# work well compared to other methods).  Let me know if 
# you know of some function that will do density estimation
# in dimensions greater than 2.)

# Let's look at an estimate of the Class 1 (green) density. 

x1limL <- min(trndat[,1]) - 2
x1limU <- max(trndat[,1]) + 2
x2limL <- min(trndat[,2]) - 2
x2limU <- max(trndat[,2]) + 2
gden <- kde2d(trndat[1:n1,1], trndat[1:n1,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU))
image(gden)

# Let's take a look at the Help info for the function kde2d.
?kde2d

###

# Here's the Class 2 (red) density estimate.

rden <- kde2d(trndat[(n1+1):100,1], trndat[(n1+1):100,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU))
image(rden)

###

# Now let's look at the decision boundary 
# (on which *weighted* densities are equal).

denrat <- (100 - n1)*rden$z/(n1*gden$z)
contour(gden$x, gden$y, denrat, levels=c(0.5,1,2))

# The contour labeled with 1 is the decision boundary.
# Outside of the wavy band, one estimated density is
# more than twice as high as the one one. 

###

# Here's another representation of the decision boundary
# determined from the training data.

pred.kern.den <- ifelse(denrat < 1, 2, 1)
image(gden$x, gden$y, pred.kern.den)


# Now I'll obtain an estimated error rate for this 
# classification method from the training data.
 
x1index <- round(1 + 300*(trndat[,1] - min(gden$x))/(max(gden$x) - min(gden$x)))
x2index <- round(1 + 300*(trndat[,2] - min(gden$y))/(max(gden$y) - min(gden$y)))
pred.denrat <- rep(0, 100)
for (i in 1:100) pred.denrat[i] <- denrat[x1index[i],x2index[i]]
trn.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != trndat[,3] ) 
trn.err.kern.den

# The training error for the kernel density estimation
# method is relatively small.

# Note: Error rates obtained from the training data are
# often optimistic!

# Let's now get a better estimate of the error rate for 
# this density estimation method using the generalization
# sample.

# Note: One has to take care that the subscript does not 
# fall outside of the set {1, 2, 3, ..., 300, 301}.

x1index <- round(1 + 300*(gendat[,1] - min(gden$x))/(max(gden$x) - min(gden$x)))
x1index <- ifelse( x1index < 1, 1, x1index )
x1index <- ifelse( x1index > 301, 301, x1index )
x2index <- round(1 + 300*(gendat[,2] - min(gden$y))/(max(gden$y) - min(gden$y)))
x2index <- ifelse( x2index < 1, 1, x2index )
x2index <- ifelse( x2index > 301, 301, x2index )

pred.denrat <- rep(0, 10000)
for (i in 1:10000) pred.denrat[i] <- denrat[x1index[i],x2index[i]]
gen.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != gendat[,3] ) 
gen.err.kern.den

# A tad larger than the Bayes rate, which seems reasonable.
# (The actual error rate definitely cannot be smaller than
# the Bayes rate.  But perhaps it shouldn't be too much greater
# since the density estimation method ought to work decently
# in this simple setting.) 


# A big drawback to this very straightforward approximation of the 
# Bayes classifier is that density estimates can be very unreliable 
# when the dimension is not real small.  (A practical drawback is 
# that here doesn't seem to be a good way to estimate densities using
# R when the dimension is greater than 2.  (Note: I haven't researched
# this thoroughly.))
 

# A "work around" when the dimension is not small is to assume 
# independence and use a naive Bayes classifier.  (Note: I don't
# like R's default, which is to assume normality along with independence.
# I think in this case one might often be better off using either LDA or 
# QDA (although maybe not is the dimension is too large).)

# Although the method is based on an assumption of independence, 
# which may be grossly violated, a nice thing is that this 
# method may be done using R for dimensions greater than 2.

# R's NaiveBayes function apparently needs to be given
# data with the class variable being a "factor" with 
# "levels" specified in order for the predict function 
# to work properly on the object.  Therefore, I created
# a new data frame to use at this point.

alttrndat <- data.frame(cbind(trndat[,1],trndat[,2],factor(trndat[,3])))
names(alttrndat) <- c("x1","x2","g")
levels(alttrndat$g) <- c("1","2")

# I'll also modify the generalization data the same way.

altgendat <- data.frame(cbind(gendat[,1],gendat[,2],factor(gendat[,3])))
names(altgendat) <- c("x1","x2","g")
levels(altgendat$g) <- c("1","2")


# I'll start with a version of the method that assumes
# the marginal dist'ns are normal, and so instead of 
# using a nonparametric density estimator, a parametric
# model is used, and the data is used to estimate the 
# unknown parameters.

nbnorm <- NaiveBayes(g ~ ., data=alttrndat) 

pred.nbnorm <- predict(nbnorm, newdata=alttrndat)
trn.err.nbnorm <- mean( pred.nbnorm$class != alttrndat[,3] ) 
trn.err.nbnorm
# Est. error rate based on training data.

pred.nbnorm <- predict(nbnorm, newdata=altgendat)
gen.err.nbnorm <- mean( pred.nbnorm$class != altgendat[,3] ) 
gen.err.nbnorm
# Est. error rate based on generalization data.


# Next, I will use what I consider to be the standard
# way of doing naive Bayes classification, with a kernel
# density estimator being used to obtain a nonparametric
# density estimate for each predictor.

nbkern <- NaiveBayes(g ~ ., data=alttrndat, usekernel=TRUE) 

pred.nbkern <- predict(nbkern, newdata=alttrndat)
tr.err.nbkern <- mean( pred.nbkern$class != alttrndat[,3] ) 
tr.err.nbkern
# Est. error rate based on training data.

pred.nbkern <- predict(nbkern, newdata=altgendat)
gen.err.nbkern <- mean( pred.nbkern$class != altgendat[,3] ) 
gen.err.nbkern
# Est. error rate based on generalization data.
# (A little worse this way ... not surprising since class
# densities are in fact normal.)


# Since there can be problems with trying to directly approximate
# Bayes rule, one might attempt less direct approximations.  Assuming
# the densities don't vary widely in smallish regions, an indirect
# nonparametric approximation is to use a nearest neighbors method.

# I'll try ordinary nearest neighbors, using 
# train.kknn to select a good value of k.

cvknn <- train.kknn(as.factor(g) ~ ., trndat, 
          kmax=50, kernel="rectangular", distance=2) 

plot(c(1:50),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',col="maroon")

which.min(cvknn$MISCLASS)

# Since there are many values of k which tie for the 
# smallest c-v error rate, I'll use k = 23 (near middle
# of range of good values).

pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=23,kernel="rectangular")
gen.err.knn <- mean(pred.gen.knn$fit != gendat[,3])

gen.err.knn
# Error rate for 23 nearest neighbors.
# (A tad larger than density estimation rate (and Bayes rate).)


# Alternatively, one can assume the density for each class is a 
# multivariate normal density and use either LDA or QDA.  If the
# true class densities are multivariate normal, these methods can
# provide an excellent approximation of the Bayes classifier.  Even 
# if the densities are rather different from normal, these methods
# can often do amazingly well compared to other methods.  (In some
# nonnormal cases, first transforming the predictors may be helpful.)


# I'll try LDA.

lda.fit <- lda(g ~ ., trndat)
pred.lda <- predict(lda.fit, newdata=gendat)
lda.rate <- mean( pred.lda$class != gendat[,3] )

lda.rate
# LDA rate


# Now I'll try QDA.

qda.fit <- qda(g ~ ., trndat)
pred.qda <- predict(qda.fit, newdata=gendat)
qda.rate <- mean( pred.qda$class != gendat[,3] )

qda.rate
# QDA rate 
# (Lowest so far.  (Data was generated so that LDA should
# work very well ... better than QDA.  But really, a number
# of reasonable methods work very well (getting us close to 
# accuracy of Bayes classifier).  Things tend to get more
# interesting when class densities are more complex, and when
# Bayes error rate isn't so low.))


# Nearest neighbors methods are *local* methods; they only use observations
# close to a query point to classify the query point.  On the other hand, LDA
# and QDA are global methods; they use all of the data to fit a model, and then 
# all query points are classified using the fitted model.  (We can also refer to
# LDA and QDA as model-based methods.)

# Some classification methods which are more global than local are:
#    LDA,
#    QDA,
#    RDA,
#    logistic regression,
#    SVMs,
#    neural networks.
# (We can divide these by whether or not they form linear boundaries,
# and whether or not they are based on parametric models for the class
# densities.)

# Some classification methods which are more local than global are:
#    nearest neighbors methods (incl. common KNN but also variations like DANN),
#    methods based on nonparametric density estimates.
# (I'd say that CART is more local than global, but it's not as purely 
# local as the methods listed immediately above.)