# First I'll bring in the necessary libraries.
# (Note: It may be that not all of these are needed.)
library(class)
library(MASS)
library(Hmisc)
library(classPP)
library(klaR)
library(e1071)
library(kknn)
library(rpart)
library(boost)
library(mvtnorm)
library(multinomRob)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it allows others
# to reproduce the results.
set.seed(632)
#
samp1 <- rmvnorm(50, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(50, mean=c(0,1), sigma=diag(1,2))
cl1 <- rep(1, 50)
cl2 <- rep(2, 50)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
trdata <- data.frame(rbind(samp1, samp2))
names(trdata) <- c("x1", "x2", "pop")
attach(trdata)
trdata
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
# I will get an estimate of the Bayes error rate.
bayes <- ifelse( x1 > x2, 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate
#
trlda <- lda(pop ~ x1 + x2)
summary(trlda)
trlda
#
predtr.trlda <- predict(trlda, newdata=trdata)
predtr.trlda
#
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
popm1 <- pop - 1   # Response values should be 0 and 1 (not 1 and 2)
                   # for logistic regression.
trlr1st <- glm(popm1 ~ x1 + x2, family=binomial)
trlr1st
summary(trlr1st)
#
predtr.trlr1st <- predict(trlr1st, newdata=trdata, type="response")
predtr.trlr1st
tr.lr1st.rate <- mean( round(predtr.trlr1st) != popm1 )
tr.lr1st.rate
plot(x1, x2, col=3-round(predtr.trlr1st), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
# I'll consider a 2nd-order logistic regression model.
trlr2nd <- glm(popm1 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), family=binomial)
trlr2nd
summary(trlr2nd)
#
predtr.trlr2nd <- predict(trlr2nd, newdata=trdata, type="response")
tr.lr2nd.rate <- mean( round(predtr.trlr2nd) != popm1 )
tr.lr2nd.rate
plot(x1, x2, col=3-round(predtr.trlr2nd), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
# I'll do a test to see if the set of three 2nd-order terms
# is statistically significant.
tocompare <- anova(trlr1st, trlr2nd)
tocompare
# Unfortunately, this didn't give the p-value that I want
# (which is odd, since the anova function gives a p-value
# when used to compare two OLS regression models). So I
# will produce the p-value as shown below.
p.value <- 1 - pchisq( tocompare$Deviance[2], df=3 )
p.value
# Somewhat surprisingly, the 2nd-order terms are significant
# (even though the training sample error rate isn't better).
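#
# (Two asides on the above. First, anova() for glm fits will
# report the p-value directly if a test is requested:
anova(trlr1st, trlr2nd, test="Chisq")
# Second, since the class means are sqrt(2) apart, with identity
# covariances and equal priors, the exact Bayes error rate is
# available in closed form as a check on est.bayes.rate above:
pnorm( -sqrt(2)/2 )   # exact Bayes error rate, about 0.24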
#
# Now I'll generate a large new set of data and estimate
# the generalization error of the LDA classifier, and
# the classifier obtained from logistic regression.
samp1 <- rmvnorm(5000, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(5000, mean=c(0,1), sigma=diag(1,2))
cl1 <- rep(1, 5000)
cl2 <- rep(2, 5000)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
gendata <- data.frame(rbind(samp1, samp2))
names(gendata) <- c("x1", "x2", "pop")
detach(trdata)
attach(gendata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
bayes <- ifelse( x1 > x2, 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate
#
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
popm1 <- pop - 1
predgen.trlr1st <- predict(trlr1st, newdata=gendata, type="response")
gen.lr1st.rate <- mean( round(predgen.trlr1st) != popm1 )
gen.lr1st.rate
plot(x1, x2, col=3-round(predgen.trlr1st), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
predgen.trlr2nd <- predict(trlr2nd, newdata=gendata, type="response")
gen.lr2nd.rate <- mean( round(predgen.trlr2nd) != popm1 )
gen.lr2nd.rate
plot(x1, x2, col=3-round(predgen.trlr2nd), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
# Now I will generate a new training sample having twice as
# many 1s as 2s (and it will be assumed that there will be
# twice as many 1s as 2s to be classified in the future).
samp1 <- rmvnorm(100, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(50, mean=c(0,1), sigma=diag(1,2))
cl1 <- rep(1, 100)
cl2 <- rep(2, 50)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
trdata <- data.frame(rbind(samp1, samp2))
names(trdata) <- c("x1", "x2", "pop")
detach(gendata)
attach(trdata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
abline(log(2), 1, col=8)
#
bayes <- ifelse( x2 < x1 + log(2), 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate
#
trlda <- lda(pop ~ x1 + x2)
trlda
#
predtr.trlda <- predict(trlda, newdata=trdata)
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
abline(log(2), 1, col=8)
#
# Now I'll generate a large new set of data and estimate
# the generalization error of the LDA classifier.
samp1 <- rmvnorm(5000, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(2500, mean=c(0,1), sigma=diag(1,2))
cl1 <- rep(1, 5000)
cl2 <- rep(2, 2500)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
gendata <- data.frame(rbind(samp1, samp2))
names(gendata) <- c("x1", "x2", "pop")
detach(trdata)
attach(gendata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
abline(log(2), 1, col=8)
#
bayes <- ifelse( x2 < x1 + log(2), 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate
#
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
abline(log(2), 1, col=8)
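#
# (Aside: the exact Bayes error rate under these unequal priors
# can also be worked out as a check. The difference x2 - x1 is
# N(-1, 2) for class 1 and N(1, 2) for class 2, and the boundary
# is at x2 - x1 = log(2), so:
(2/3)*(1 - pnorm( (1 + log(2))/sqrt(2) )) +
  (1/3)*pnorm( (log(2) - 1)/sqrt(2) )   # about 0.215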
#
# Now I'll go back to using 50 in each class for the training
# sample, and 5000 in each class to obtain a Monte Carlo
# estimate of the generalization error. Only now, I will give
# class 2 a different covariance matrix, and I will also use
# QDA and RDA to obtain alternative classifiers.
samp1 <- rmvnorm(50, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(50, mean=c(0,1), sigma=diag(0.25,2))
cl1 <- rep(1, 50)
cl2 <- rep(2, 50)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
trdata <- data.frame(rbind(samp1, samp2))
names(trdata) <- c("x1", "x2", "pop")
detach(gendata)
attach(trdata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
#
trlda <- lda(pop ~ x1 + x2)
predtr.trlda <- predict(trlda, newdata=trdata)
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
#
trqda <- qda(pop ~ x1 + x2)
predtr.trqda <- predict(trqda, newdata=trdata)
tr.qda.rate <- mean( predtr.trqda$class != pop )
tr.qda.rate
plot(x1, x2, col=4-as.numeric(predtr.trqda$class), xlim=c(-4,4), ylim=c(-4,4))
#
?rda   # Just a reminder that you can use ? to get
       # information about a function.
trrda <- rda(pop ~ x1 + x2, crossval=TRUE, fold=10, gamma=0)
# I set gamma equal to 0 to make it match the
# version I described in class.
trrda
# The estimated lambda value being close to 0 means that
# the fit should be very close to the QDA fit.
predtr.trrda <- predict(trrda, newdata=trdata)
tr.rda.rate <- mean( predtr.trrda$class != pop )
tr.rda.rate
plot(x1, x2, col=4-as.numeric(predtr.trrda$class), xlim=c(-4,4), ylim=c(-4,4))
#
# I'll consider a 1st-order logistic regression model.
popm1 <- pop - 1
trlr1st <- glm(popm1 ~ x1 + x2, family=binomial)
summary(trlr1st)
predtr.trlr1st <- predict(trlr1st, newdata=trdata, type="response")
tr.lr1st.rate <- mean( round(predtr.trlr1st) != popm1 )
tr.lr1st.rate
plot(x1, x2, col=3-round(predtr.trlr1st), xlim=c(-4,4), ylim=c(-4,4))
#
# I'll consider a 2nd-order logistic regression model.
trlr2nd <- glm(popm1 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), family=binomial)
summary(trlr2nd)
predtr.trlr2nd <- predict(trlr2nd, newdata=trdata, type="response")
tr.lr2nd.rate <- mean( round(predtr.trlr2nd) != popm1 )
tr.lr2nd.rate
plot(x1, x2, col=3-round(predtr.trlr2nd), xlim=c(-4,4), ylim=c(-4,4))
#
# I'll do a test to see if the set of three 2nd-order terms
# is statistically significant.
tocompare <- anova(trlr1st, trlr2nd)
tocompare
p.value <- 1 - pchisq( tocompare$Deviance[2], df=3 )
p.value
# The 2nd-order terms are highly significant.
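#
# (Aside: in this unequal-covariance setting the Bayes boundary
# is quadratic, so a simple comparison of x1 and x2 no longer
# gives the Bayes rule. One way to estimate the Bayes error rate
# here is to compare the true class densities directly, using
# dmvnorm() from the mvtnorm package loaded above:
bayes <- ifelse( dmvnorm(cbind(x1, x2), mean=c(1,0), sigma=diag(1,2)) >
                 dmvnorm(cbind(x1, x2), mean=c(0,1), sigma=diag(0.25,2)), 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate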
#
# Now I'll generate a large new set of data and estimate
# the generalization error of the LDA, QDA, RDA, and
# logistic regression classifiers.
samp1 <- rmvnorm(5000, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(5000, mean=c(0,1), sigma=diag(0.25,2))
cl1 <- rep(1, 5000)
cl2 <- rep(2, 5000)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
gendata <- data.frame(rbind(samp1, samp2))
names(gendata) <- c("x1", "x2", "pop")
detach(trdata)
attach(gendata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trqda <- predict(trqda, newdata=gendata)
gen.qda.rate <- mean( predgen.trqda$class != pop )
gen.qda.rate
plot(x1, x2, col=4-as.numeric(predgen.trqda$class), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trrda <- predict(trrda, newdata=gendata)
gen.rda.rate <- mean( predgen.trrda$class != pop )
gen.rda.rate
plot(x1, x2, col=4-as.numeric(predgen.trrda$class), xlim=c(-4,4), ylim=c(-4,4))
#
popm1 <- pop - 1
predgen.trlr1st <- predict(trlr1st, newdata=gendata, type="response")
gen.lr1st.rate <- mean( round(predgen.trlr1st) != popm1 )
gen.lr1st.rate
plot(x1, x2, col=3-round(predgen.trlr1st), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trlr2nd <- predict(trlr2nd, newdata=gendata, type="response")
gen.lr2nd.rate <- mean( round(predgen.trlr2nd) != popm1 )
gen.lr2nd.rate
plot(x1, x2, col=3-round(predgen.trlr2nd), xlim=c(-4,4), ylim=c(-4,4))
#
# Now I will see if performance improves
# with a larger training sample.
samp1 <- rmvnorm(500, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(500, mean=c(0,1), sigma=diag(0.25,2))
cl1 <- rep(1, 500)
cl2 <- rep(2, 500)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
trdata <- data.frame(rbind(samp1, samp2))
names(trdata) <- c("x1", "x2", "pop")
detach(gendata)
attach(trdata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
#
trlda <- lda(pop ~ x1 + x2)
predtr.trlda <- predict(trlda, newdata=trdata)
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
#
trqda <- qda(pop ~ x1 + x2)
predtr.trqda <- predict(trqda, newdata=trdata)
tr.qda.rate <- mean( predtr.trqda$class != pop )
tr.qda.rate
plot(x1, x2, col=4-as.numeric(predtr.trqda$class), xlim=c(-4,4), ylim=c(-4,4))
#
trrda <- rda(pop ~ x1 + x2, crossval=TRUE, fold=10, gamma=0)
# I set gamma equal to 0 to make it match the
# version I described in class.
trrda
# The estimated lambda value being close to 0 means that
# the fit should be very close to the QDA fit.
predtr.trrda <- predict(trrda, newdata=trdata)
tr.rda.rate <- mean( predtr.trrda$class != pop )
tr.rda.rate
plot(x1, x2, col=4-as.numeric(predtr.trrda$class), xlim=c(-4,4), ylim=c(-4,4))
#
# I'll now use the previously generated generalization
# sample to estimate the true error rates of the three
# classifiers.
detach(trdata)
attach(gendata)
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trqda <- predict(trqda, newdata=gendata)
gen.qda.rate <- mean( predgen.trqda$class != pop )
gen.qda.rate
plot(x1, x2, col=4-as.numeric(predgen.trqda$class), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trrda <- predict(trrda, newdata=gendata)
gen.rda.rate <- mean( predgen.trrda$class != pop )
gen.rda.rate
plot(x1, x2, col=4-as.numeric(predgen.trrda$class), xlim=c(-4,4), ylim=c(-4,4))
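#
# (Aside: since the same predict / compare / average pattern is
# repeated many times for the lda, qda, and rda fits, a small
# helper function could tidy this up --- err.rate below is just
# an illustrative name, relying on the fact that all three
# predict methods return a list with a class component:
err.rate <- function(fit, data, truth) {
  mean( predict(fit, newdata=data)$class != truth )
}
err.rate(trlda, gendata, pop)   # should reproduce gen.lda.rate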
#
# Finally, I'll investigate the performance of LDA
# and logistic regression on data from nonnormal
# distributions. I'll generate the x1 values in
# exactly the same way for both underlying bivariate
# distributions, using a t6 distribution. So x1
# will be a "noise variable" having no usefulness
# in the classification (and perhaps throwing things
# off a bit). I'll generate the x2 values using a
# lognormal distribution having a mean of 1 (since the
# mean of exp(Z) with Z ~ N(-0.5, 1) is exp(-0.5 + 0.5) = 1),
# and subtract 1 from the x2 values for the first group
# in order to separate the means. (The mean vectors
# for the two groups will be (0,0) and (0,1).)
x1 <- rt(100, 6)
summary(x1)
x2 <- exp( rnorm(100, mean = -0.5, sd = 1) )
summary(x2)
x2[1:50] <- x2[1:50] - 1
tranx2 <- log(x2 + 1)   # (Note: I'll use a transformation of x2 below.)
pop <- rep(0, 100)
pop[51:100] <- pop[51:100] + 1
trdata <- data.frame(cbind(x1, x2, tranx2, pop))
names(trdata) <- c("x1", "x2", "tranx2", "pop")
detach(gendata)
attach(trdata)
plot(x1, x2, col=3-pop)
#
# First I'll use LDA.
trlda <- lda(pop ~ x1 + x2)
trlda
predtr.trlda <- predict(trlda, newdata=trdata)
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class))
# (Note: the 0/1 values in predtr.trlda$class become 1/2
# values when as.numeric is applied, because the class
# labels are factor levels and as.numeric returns the
# level codes. (Without as.numeric, an error results.))
#
# Next I'll use logistic regression.
trlr <- glm(pop ~ x1 + x2, family=binomial)
summary(trlr)
predtr.trlr <- predict(trlr, newdata=trdata, type="response")
tr.lr.rate <- mean( round(predtr.trlr) != pop )
tr.lr.rate
plot(x1, x2, col=3-round(predtr.trlr))
#
# Lastly, I'll use LDA based on data transformed
# by taking the log of the x2 values with 1 added
# to them (to avoid taking the log of negative values).
# This makes the x2 values normal for the 1st dist'n
# but not the 2nd dist'n (due to the addition of 1).
trldaalt <- lda(pop ~ x1 + tranx2)
trldaalt
predtr.trldaalt <- predict(trldaalt, newdata=trdata)
tr.ldaalt.rate <- mean( predtr.trldaalt$class != pop )
tr.ldaalt.rate
plot(x1, x2, col=4-as.numeric(predtr.trldaalt$class))
#
# Now I'll generate a large new set of data and estimate
# the generalization errors for the three classifiers.
x1 <- rt(10000, 6)
summary(x1)
x2 <- exp( rnorm(10000, mean = -0.5, sd = 1) )
summary(x2)
x2[1:5000] <- x2[1:5000] - 1
tranx2 <- log(x2 + 1)
pop <- rep(0, 10000)
pop[5001:10000] <- pop[5001:10000] + 1
gendata <- data.frame(cbind(x1, x2, tranx2, pop))
names(gendata) <- c("x1", "x2", "tranx2", "pop")
detach(trdata)
attach(gendata)
plot(x1, x2, col=3-pop)
#
# First I'll use LDA.
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class))
#
# Next I'll use logistic regression.
predgen.trlr <- predict(trlr, newdata=gendata, type="response")
gen.lr.rate <- mean( round(predgen.trlr) != pop )
gen.lr.rate
plot(x1, x2, col=3-round(predgen.trlr))
#
# Lastly, I'll use the LDA classifier based on
# the transformed data.
predgen.trldaalt <- predict(trldaalt, newdata=gendata)
gen.ldaalt.rate <- mean( predgen.trldaalt$class != pop )
gen.ldaalt.rate
plot(x1, x2, col=4-as.numeric(predgen.trldaalt$class))
# It can be noted that the logistic regression
# classifier did better than the original LDA
# classifier, but using LDA with the transformed
# data did the best.
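#
# (For convenience, the three estimated error rates side by side:
round( c(lda = gen.lda.rate, logistic = gen.lr.rate,
         lda.tran = gen.ldaalt.rate), 4 )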
#
# Of course, one can try other classifiers and see if they
# perform better. For example, one could use the transformed
# x2 values with logistic regression, and one could try
# deleting x1, since the plot of the original data and the
# p-values from the logistic regression fit suggest that it
# may not be useful. If one considers classifiers using just
# x2, the suggestion in HTF could be used, and a search could
# be done for the best way to make a division using x2 to
# separate the classes for the observations in the training
# sample. (Sketches of these ideas are given below.)
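#
# A quick sketch of these ideas (the object names below are just
# illustrative). First, logistic regression using the transformed
# x2 values with x1 deleted; note that data=trdata is needed
# since gendata is currently attached:
trlralt <- glm(pop ~ tranx2, family=binomial, data=trdata)
summary(trlralt)
mean( round(predict(trlralt, newdata=trdata, type="response")) != trdata$pop )
#
# Second, a simple search over cutpoints of x2: classify an
# observation as class 1 when x2 >= c, and choose the c that
# minimizes the training sample error rate.
cuts <- sort(trdata$x2)
errs <- sapply(cuts, function(c) mean( (trdata$x2 >= c) != trdata$pop ))
best.cut <- cuts[which.min(errs)]
best.cut
min(errs)   # keeping in mind that the training error rate is an
            # optimistic estimate of the generalization error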