# 2nd example for 6/18/10 
# (like the 1st, but means closer together)


library(mvtnorm)      # for generating multivariate normal values
library(multinomRob ) # for generating multinomial dist'n values
library(klaR)         # for naive Bayes classification
library(kknn)         # for kknn and train.kknn f'ns (for KNN)

# I'll set the seed for the random number generator.
set.seed(17)  


# Let 
#                             pi_1 = pi_2 = 1/2,
# and let the density underlying each class be a bivariate normal pdf 
# having covariance matrix I (the identity matrix).  For class 1, let 
# the mean vector be 
#                                ( 1, 0 )
# and for class 2 let the mean vector be 
#                                ( 0, 1 ).

# Let's plot the two pdfs.
x1plot <- seq(-3, 8, by=0.1)
x2plot <- seq(-3, 8, by=0.1)
Sig <- diag(1,2)
maxpdf <- function(x1, x2){mat=cbind(x1,x2); pmax(dmvnorm(mat, c(1,0), Sig),
                                                  dmvnorm(mat, c(0,1), Sig))}
pdfplot <- outer(x1plot, x2plot, maxpdf)
persp(x1plot, x2plot, pdfplot, xlab="x1", ylab="x2", zlab="pdf", 
theta=-30, col="gold")

# For this simple two-class problem involving bivariate normal populations,
# one could determine the error rate of the Bayes classifier (aka the Bayes 
# rate) analytically.  We can also obtain a Monte Carlo estimate (done below).

x <- rmvnorm(10000, mean=c(1,0), sigma=Sig) 
mean( x[,1] < x[,2] )
# Monte Carlo estimate of Bayes rate

pnorm( -1/sqrt( 2 ) )
# Bayes rate (exact)

###

# I'll generate data for the simple classification problem
# introduced above.  I'll first generate a training sample 
# of 100 observations.

n1 <- rbinom(1, 100, 0.5)
n1
# Number of Class 1 observations in training sample.

X1 <- rmvnorm(n1, mean=c(1,0), sigma=Sig)
X2 <- rmvnorm(100-n1, mean=c(0,1), sigma=Sig)
X <- rbind(X1, X2)
g <- c( rep(1, n1), rep(2, 100-n1) )
trndat <- data.frame( cbind(X, g) )
names(trndat) <- c("x1", "x2", "g")

plot(trndat[,1], trndat[,2], col=4-trndat[,3], main='training data', xlab='x1', ylab='x2')

# Now I'll generate some data (the same way, but independently) 
# to serve as a "generalization sample" (to provide estimates
# of the error rates for the various classification methods
# considered).  I'll put 5000 observations from each class in
# the generalization sample.  (This seems better than making 
# the class sample sizes random.) 

X1 <- rmvnorm(5000, mean=c(1,0), sigma=Sig)
X2 <- rmvnorm(5000, mean=c(0,1), sigma=Sig)
X <- rbind(X1, X2)
g <- c( rep(1, 5000), rep(2, 5000) )
gendat <- data.frame( cbind(X, g) )
names(gendat) <- names(trndat)
 
###

# I'll use two-dimensional kernel density estimates
# to classify by directly approximating the Bayes 
# classifier.

x1limL <- min(trndat[,1]) - 2
x1limU <- max(trndat[,1]) + 2
x2limL <- min(trndat[,2]) - 2
x2limU <- max(trndat[,2]) + 2
gden <- kde2d(trndat[1:n1,1], trndat[1:n1,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU))
rden <- kde2d(trndat[(n1+1):100,1], trndat[(n1+1):100,2], n=301, lims=c(x1limL,x1limU,x2limL,x2limU))

# Now let's look at the decision boundary 
# (on which *weighted* densities are equal).

denrat <- (100 - n1)*rden$z/(n1*gden$z)
contour(gden$x, gden$y, denrat, levels=c(0.5,1,2))

x1index <- round(1 + 300*(trndat[,1] - min(gden$x))/(max(gden$x) - min(gden$x)))
x2index <- round(1 + 300*(trndat[,2] - min(gden$y))/(max(gden$y) - min(gden$y)))
pred.denrat <- rep(0, 100)
for (i in 1:100) pred.denrat[i] <- denrat[x1index[i],x2index[i]]
trn.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != trndat[,3] ) 
trn.err.kern.den

# Let's now get a better estimate of the error rate for 
# this density estimation method using the generalization
# sample.

# Note: One has to take care that the subscript does not 
# fall outside of the set {1, 2, 3, ..., 300, 301}.

x1index <- round(1 + 300*(gendat[,1] - min(gden$x))/(max(gden$x) - min(gden$x)))
x1index <- ifelse( x1index < 1, 1, x1index )
x1index <- ifelse( x1index > 301, 301, x1index )
x2index <- round(1 + 300*(gendat[,2] - min(gden$y))/(max(gden$y) - min(gden$y)))
x2index <- ifelse( x2index < 1, 1, x2index )
x2index <- ifelse( x2index > 301, 301, x2index )

pred.denrat <- rep(0, 10000)
for (i in 1:10000) pred.denrat[i] <- denrat[x1index[i],x2index[i]]
gen.err.kern.den <- mean( ifelse(pred.denrat > 1, 2, 1) != gendat[,3] ) 
gen.err.kern.den


# Now I'll try naive Bayes.

alttrndat <- data.frame(cbind(trndat[,1],trndat[,2],factor(trndat[,3])))
names(alttrndat) <- c("x1","x2","g")
levels(alttrndat$g) <- c("1","2")

altgendat <- data.frame(cbind(gendat[,1],gendat[,2],factor(gendat[,3])))
names(altgendat) <- c("x1","x2","g")
levels(altgendat$g) <- c("1","2")

# I'll start with a version of the method that assumes
# the marginal densities are normal.

nbnorm <- NaiveBayes(g ~ ., data=alttrndat) 

pred.nbnorm <- predict(nbnorm, newdata=alttrndat)
trn.err.nbnorm <- mean( pred.nbnorm$class != alttrndat[,3] ) 
trn.err.nbnorm
# Est. error rate based on training data.

pred.nbnorm <- predict(nbnorm, newdata=altgendat)
gen.err.nbnorm <- mean( pred.nbnorm$class != altgendat[,3] ) 
gen.err.nbnorm
# Est. error rate based on generalization data.

# Next, I will use what I consider to be the standard
# way of doing naive Bayes classification, with a kernel
# density estimator being used to obtain a nonparametric
# density estimate for each predictor.

nbkern <- NaiveBayes(g ~ ., data=alttrndat, usekernel=TRUE) 

pred.nbkern <- predict(nbkern, newdata=alttrndat)
tr.err.nbkern <- mean( pred.nbkern$class != alttrndat[,3] ) 
tr.err.nbkern
# Est. error rate based on training data.

pred.nbkern <- predict(nbkern, newdata=altgendat)
gen.err.nbkern <- mean( pred.nbkern$class != altgendat[,3] ) 
gen.err.nbkern
# Est. error rate based on generalization data.


# I'll try ordinary nearest neighbors, using 
# train.kknn to select a good value of k.

cvknn <- train.kknn(as.factor(g) ~ ., trndat, 
          kmax=50, kernel="rectangular", distance=2) 

plot(c(1:50),cvknn$MISCLASS,type="l",xlab='k',
      ylab='proportion misclassified',main='Estimated Error Rates',col="maroon")

which.min(cvknn$MISCLASS)

# I'll use k = 39.

pred.gen.knn <- kknn(as.factor(g)~.,train=trndat,test=gendat,
                     k=39,kernel="rectangular")
gen.err.knn <- mean(pred.gen.knn$fit != gendat[,3])

gen.err.knn
# Error rate for 39 nearest neighbors.


# I'll try LDA.

lda.fit <- lda(g ~ ., trndat)
pred.lda <- predict(lda.fit, newdata=gendat)
lda.rate <- mean( pred.lda$class != gendat[,3] )

lda.rate
# LDA rate

# Now I'll try QDA.

qda.fit <- qda(g ~ ., trndat)
pred.qda <- predict(qda.fit, newdata=gendat)
qda.rate <- mean( pred.qda$class != gendat[,3] )

qda.rate
# QDA rate 

###########################  SUMMARY  ###############################

# 1st Example (from other file)    2nd Example (this file)


# Bayes rate                       0.07865          0.23975

# kernel density                   0.084  (0.02)    0.248  (0.16)
# naive Bayes (normal model)       0.085  (0.03)    0.251  (0.20)
# naive Bayes (kern. den. est.)    0.087  (0.03)    0.245  (0.18)
# KNN (K selected by c-v)          0.084  (0.03)    0.243  (0.19)
# LDA                              0.085            0.244
# QDA                              0.081            0.244


# With the normal class densities, it makes little difference which
# method is used.  Accuracy doesn't seem to appreciably suffer if we
# don't assume normality (even though assumption is true in this case).

# It'll be interesting to consider some nonnormal cases.