# First I'll bring in the necessary libraries.
# (Note: It may be that not all of these are needed.)
library(class)
library(MASS)
library(Hmisc)
library(classPP)
library(klaR)
library(e1071)
library(kknn)
library(rpart)
library(boost)
library(mvtnorm)
library(multinomRob)
#
# I like to set the seed when using randomly-generated
# data --- it's useful to be able to obtain the same
# samples when troubleshooting, and it allows others
# to reproduce the results.
set.seed(632)
#
samp1 <- rmvnorm(50, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(50, mean=c(0,1), sigma=diag(1,2))
cl1 <- rep(1, 50)
cl2 <- rep(2, 50)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
trdata <- data.frame(rbind(samp1, samp2))
names(trdata) <- c("x1", "x2", "pop")
attach(trdata)
trdata
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
# I will get an estimate of the Bayes error rate.
bayes <- ifelse( x1 > x2, 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate
#
trlda <- lda(pop ~ x1 + x2)
summary(trlda)
trlda
#
predtr.trlda <- predict(trlda, newdata=trdata)
predtr.trlda
#
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
popm1 <- pop - 1   # Response values should be 0 and 1 (not 1 and 2)
                   # for logistic regression.
trlr1st <- glm(popm1 ~ x1 + x2, family=binomial)
trlr1st
summary(trlr1st)
#
predtr.trlr1st <- predict(trlr1st, newdata=trdata, type="response")
predtr.trlr1st
tr.lr1st.rate <- mean( round(predtr.trlr1st) != popm1 )
tr.lr1st.rate
plot(x1, x2, col=3-round(predtr.trlr1st), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
# I'll consider a 2nd-order logistic regression model.
trlr2nd <- glm(popm1 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), family=binomial)
trlr2nd
summary(trlr2nd)
#
predtr.trlr2nd <- predict(trlr2nd, newdata=trdata, type="response")
tr.lr2nd.rate <- mean( round(predtr.trlr2nd) != popm1 )
tr.lr2nd.rate
plot(x1, x2, col=3-round(predtr.trlr2nd), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
# I'll do a test to see if the set of three 2nd-order terms
# is statistically significant.
tocompare <- anova(trlr1st, trlr2nd)
tocompare
# Unfortunately, this didn't give the p-value that I want
# (which is odd, since the anova function gives a p-value
# when used to compare two OLS regression models). So I
# will produce the p-value as shown below.
p.value <- 1 - pchisq( tocompare$Deviance[2], df=3 )
p.value
# Somewhat surprisingly, the 2nd-order terms are significant
# (even though the training sample error rate isn't better).
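#
# (Two asides on the above. First, anova() for glm fits will
# report the p-value directly if a test is requested:
anova(trlr1st, trlr2nd, test="Chisq")
# Second, since the class means are sqrt(2) apart, with identity
# covariances and equal priors, the exact Bayes error rate is
# available in closed form as a check on est.bayes.rate above:
pnorm( -sqrt(2)/2 )   # exact Bayes error rate, about 0.24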
#
# Now I'll generate a large new set of data and estimate
# the generalization error of the LDA classifier, and
# the classifier obtained from logistic regression.
samp1 <- rmvnorm(5000, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(5000, mean=c(0,1), sigma=diag(1,2))
cl1 <- rep(1, 5000)
cl2 <- rep(2, 5000)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
gendata <- data.frame(rbind(samp1, samp2))
names(gendata) <- c("x1", "x2", "pop")
detach(trdata)
attach(gendata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
bayes <- ifelse( x1 > x2, 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate
#
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
popm1 <- pop - 1
predgen.trlr1st <- predict(trlr1st, newdata=gendata, type="response")
gen.lr1st.rate <- mean( round(predgen.trlr1st) != popm1 )
gen.lr1st.rate
plot(x1, x2, col=3-round(predgen.trlr1st), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
predgen.trlr2nd <- predict(trlr2nd, newdata=gendata, type="response")
gen.lr2nd.rate <- mean( round(predgen.trlr2nd) != popm1 )
gen.lr2nd.rate
plot(x1, x2, col=3-round(predgen.trlr2nd), xlim=c(-4,4), ylim=c(-4,4))
abline(0, 1, col=8)
#
# Now I will generate a new training sample having twice as
# many 1s as 2s (and it will be assumed that there will be
# twice as many 1s as 2s to be classified in the future).
samp1 <- rmvnorm(100, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(50, mean=c(0,1), sigma=diag(1,2))
cl1 <- rep(1, 100)
cl2 <- rep(2, 50)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
trdata <- data.frame(rbind(samp1, samp2))
names(trdata) <- c("x1", "x2", "pop")
detach(gendata)
attach(trdata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
abline(log(2), 1, col=8)
#
bayes <- ifelse( x2 < x1 + log(2), 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate
#
trlda <- lda(pop ~ x1 + x2)
trlda
#
predtr.trlda <- predict(trlda, newdata=trdata)
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
abline(log(2), 1, col=8)
#
# Now I'll generate a large new set of data and estimate
# the generalization error of the LDA classifier.
samp1 <- rmvnorm(5000, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(2500, mean=c(0,1), sigma=diag(1,2))
cl1 <- rep(1, 5000)
cl2 <- rep(2, 2500)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
gendata <- data.frame(rbind(samp1, samp2))
names(gendata) <- c("x1", "x2", "pop")
detach(trdata)
attach(gendata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
abline(log(2), 1, col=8)
#
bayes <- ifelse( x2 < x1 + log(2), 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate
#
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
abline(log(2), 1, col=8)
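#
# (Aside: the exact Bayes error rate under these unequal priors
# can also be worked out as a check. The difference x2 - x1 is
# N(-1, 2) for class 1 and N(1, 2) for class 2, and the boundary
# is at x2 - x1 = log(2), so:
(2/3)*(1 - pnorm( (1 + log(2))/sqrt(2) )) +
  (1/3)*pnorm( (log(2) - 1)/sqrt(2) )   # about 0.215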
#
# Now I'll go back to using 50 in each class for the training
# sample, and 5000 in each class to obtain a Monte Carlo
# estimate of the generalization error. Only now, I will give
# class 2 a different covariance matrix, and I will also use
# QDA and RDA to obtain alternative classifiers.
samp1 <- rmvnorm(50, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(50, mean=c(0,1), sigma=diag(0.25,2))
cl1 <- rep(1, 50)
cl2 <- rep(2, 50)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
trdata <- data.frame(rbind(samp1, samp2))
names(trdata) <- c("x1", "x2", "pop")
detach(gendata)
attach(trdata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
#
trlda <- lda(pop ~ x1 + x2)
predtr.trlda <- predict(trlda, newdata=trdata)
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
#
trqda <- qda(pop ~ x1 + x2)
predtr.trqda <- predict(trqda, newdata=trdata)
tr.qda.rate <- mean( predtr.trqda$class != pop )
tr.qda.rate
plot(x1, x2, col=4-as.numeric(predtr.trqda$class), xlim=c(-4,4), ylim=c(-4,4))
#
?rda   # Just a reminder that you can use ? to get
       # information about a function.
trrda <- rda(pop ~ x1 + x2, crossval=TRUE, fold=10, gamma=0)
# I set gamma equal to 0 to make it match the
# version I described in class.
trrda
# The estimated lambda value being close to 0 means that
# the fit should be very close to the QDA fit.
predtr.trrda <- predict(trrda, newdata=trdata)
tr.rda.rate <- mean( predtr.trrda$class != pop )
tr.rda.rate
plot(x1, x2, col=4-as.numeric(predtr.trrda$class), xlim=c(-4,4), ylim=c(-4,4))
#
# I'll consider a 1st-order logistic regression model.
popm1 <- pop - 1
trlr1st <- glm(popm1 ~ x1 + x2, family=binomial)
summary(trlr1st)
predtr.trlr1st <- predict(trlr1st, newdata=trdata, type="response")
tr.lr1st.rate <- mean( round(predtr.trlr1st) != popm1 )
tr.lr1st.rate
plot(x1, x2, col=3-round(predtr.trlr1st), xlim=c(-4,4), ylim=c(-4,4))
#
# I'll consider a 2nd-order logistic regression model.
trlr2nd <- glm(popm1 ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1*x2), family=binomial)
summary(trlr2nd)
predtr.trlr2nd <- predict(trlr2nd, newdata=trdata, type="response")
tr.lr2nd.rate <- mean( round(predtr.trlr2nd) != popm1 )
tr.lr2nd.rate
plot(x1, x2, col=3-round(predtr.trlr2nd), xlim=c(-4,4), ylim=c(-4,4))
#
# I'll do a test to see if the set of three 2nd-order terms
# is statistically significant.
tocompare <- anova(trlr1st, trlr2nd)
tocompare
p.value <- 1 - pchisq( tocompare$Deviance[2], df=3 )
p.value
# The 2nd-order terms are highly significant.
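#
# (Aside: in this unequal-covariance setting the Bayes boundary
# is quadratic, so a simple comparison of x1 and x2 no longer
# gives the Bayes rule. One way to estimate the Bayes error rate
# here is to compare the true class densities directly, using
# dmvnorm() from the mvtnorm package loaded above:
bayes <- ifelse( dmvnorm(cbind(x1, x2), mean=c(1,0), sigma=diag(1,2)) >
                 dmvnorm(cbind(x1, x2), mean=c(0,1), sigma=diag(0.25,2)), 1, 2 )
est.bayes.rate <- mean( bayes != pop )
est.bayes.rate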
#
# Now I'll generate a large new set of data and estimate
# the generalization error of the LDA, QDA, RDA, and
# logistic regression classifiers.
samp1 <- rmvnorm(5000, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(5000, mean=c(0,1), sigma=diag(0.25,2))
cl1 <- rep(1, 5000)
cl2 <- rep(2, 5000)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
gendata <- data.frame(rbind(samp1, samp2))
names(gendata) <- c("x1", "x2", "pop")
detach(trdata)
attach(gendata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trqda <- predict(trqda, newdata=gendata)
gen.qda.rate <- mean( predgen.trqda$class != pop )
gen.qda.rate
plot(x1, x2, col=4-as.numeric(predgen.trqda$class), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trrda <- predict(trrda, newdata=gendata)
gen.rda.rate <- mean( predgen.trrda$class != pop )
gen.rda.rate
plot(x1, x2, col=4-as.numeric(predgen.trrda$class), xlim=c(-4,4), ylim=c(-4,4))
#
popm1 <- pop - 1
predgen.trlr1st <- predict(trlr1st, newdata=gendata, type="response")
gen.lr1st.rate <- mean( round(predgen.trlr1st) != popm1 )
gen.lr1st.rate
plot(x1, x2, col=3-round(predgen.trlr1st), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trlr2nd <- predict(trlr2nd, newdata=gendata, type="response")
gen.lr2nd.rate <- mean( round(predgen.trlr2nd) != popm1 )
gen.lr2nd.rate
plot(x1, x2, col=3-round(predgen.trlr2nd), xlim=c(-4,4), ylim=c(-4,4))
#
# Now I will see if performance improves
# with a larger training sample.
samp1 <- rmvnorm(500, mean=c(1,0), sigma=diag(1,2))
samp2 <- rmvnorm(500, mean=c(0,1), sigma=diag(0.25,2))
cl1 <- rep(1, 500)
cl2 <- rep(2, 500)
samp1 <- cbind(samp1, cl1)
samp2 <- cbind(samp2, cl2)
trdata <- data.frame(rbind(samp1, samp2))
names(trdata) <- c("x1", "x2", "pop")
detach(gendata)
attach(trdata)
plot(x1, x2, col=4-pop, xlim=c(-4,4), ylim=c(-4,4))
#
trlda <- lda(pop ~ x1 + x2)
predtr.trlda <- predict(trlda, newdata=trdata)
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
#
trqda <- qda(pop ~ x1 + x2)
predtr.trqda <- predict(trqda, newdata=trdata)
tr.qda.rate <- mean( predtr.trqda$class != pop )
tr.qda.rate
plot(x1, x2, col=4-as.numeric(predtr.trqda$class), xlim=c(-4,4), ylim=c(-4,4))
#
trrda <- rda(pop ~ x1 + x2, crossval=TRUE, fold=10, gamma=0)
# I set gamma equal to 0 to make it match the
# version I described in class.
trrda
# The estimated lambda value being close to 0 means that
# the fit should be very close to the QDA fit.
predtr.trrda <- predict(trrda, newdata=trdata)
tr.rda.rate <- mean( predtr.trrda$class != pop )
tr.rda.rate
plot(x1, x2, col=4-as.numeric(predtr.trrda$class), xlim=c(-4,4), ylim=c(-4,4))
#
# I'll now use the previously generated generalization
# sample to estimate the true error rates of the three
# classifiers.
detach(trdata)
attach(gendata)
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trqda <- predict(trqda, newdata=gendata)
gen.qda.rate <- mean( predgen.trqda$class != pop )
gen.qda.rate
plot(x1, x2, col=4-as.numeric(predgen.trqda$class), xlim=c(-4,4), ylim=c(-4,4))
#
predgen.trrda <- predict(trrda, newdata=gendata)
gen.rda.rate <- mean( predgen.trrda$class != pop )
gen.rda.rate
plot(x1, x2, col=4-as.numeric(predgen.trrda$class), xlim=c(-4,4), ylim=c(-4,4))
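#
# (Aside: since the same predict / compare / average pattern is
# repeated many times for the lda, qda, and rda fits, a small
# helper function could tidy this up --- err.rate below is just
# an illustrative name, relying on the fact that all three
# predict methods return a list with a class component:
err.rate <- function(fit, data, truth) {
  mean( predict(fit, newdata=data)$class != truth )
}
err.rate(trlda, gendata, pop)   # should reproduce gen.lda.rate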
#
# Finally, I'll investigate the performance of LDA
# and logistic regression on data from nonnormal
# distributions. I'll generate the x1 values in
# exactly the same way for both underlying bivariate
# distributions, using a t6 distribution. So x1
# will be a "noise variable" having no usefulness
# in the classification (and perhaps throwing things
# off a bit). I'll generate the x2 values using a
# lognormal distribution having a mean of 1 (since the
# mean of exp(Z) with Z ~ N(-0.5, 1) is exp(-0.5 + 0.5) = 1),
# and subtract 1 from the x2 values for the first group
# in order to separate the means. (The mean vectors
# for the two groups will be (0,0) and (0,1).)
x1 <- rt(100, 6)
summary(x1)
x2 <- exp( rnorm(100, mean = -0.5, sd = 1) )
summary(x2)
x2[1:50] <- x2[1:50] - 1
tranx2 <- log(x2 + 1)   # (Note: I'll use a transformation of x2 below.)
pop <- rep(0, 100)
pop[51:100] <- pop[51:100] + 1
trdata <- data.frame(cbind(x1, x2, tranx2, pop))
names(trdata) <- c("x1", "x2", "tranx2", "pop")
detach(gendata)
attach(trdata)
plot(x1, x2, col=3-pop)
#
# First I'll use LDA.
trlda <- lda(pop ~ x1 + x2)
trlda
predtr.trlda <- predict(trlda, newdata=trdata)
tr.lda.rate <- mean( predtr.trlda$class != pop )
tr.lda.rate
plot(x1, x2, col=4-as.numeric(predtr.trlda$class))
# (Note: the 0/1 values in predtr.trlda$class become 1/2
# values when as.numeric is applied, because the class
# labels are factor levels and as.numeric returns the
# level codes. (Without as.numeric, an error results.))
#
# Next I'll use logistic regression.
trlr <- glm(pop ~ x1 + x2, family=binomial)
summary(trlr)
predtr.trlr <- predict(trlr, newdata=trdata, type="response")
tr.lr.rate <- mean( round(predtr.trlr) != pop )
tr.lr.rate
plot(x1, x2, col=3-round(predtr.trlr))
#
# Lastly, I'll use LDA based on data transformed
# by taking the log of the x2 values with 1 added
# to them (to avoid taking the log of negative values).
# This makes the x2 values normal for the 1st dist'n
# but not the 2nd dist'n (due to the addition of 1).
trldaalt <- lda(pop ~ x1 + tranx2)
trldaalt
predtr.trldaalt <- predict(trldaalt, newdata=trdata)
tr.ldaalt.rate <- mean( predtr.trldaalt$class != pop )
tr.ldaalt.rate
plot(x1, x2, col=4-as.numeric(predtr.trldaalt$class))
#
# Now I'll generate a large new set of data and estimate
# the generalization errors for the three classifiers.
x1 <- rt(10000, 6)
summary(x1)
x2 <- exp( rnorm(10000, mean = -0.5, sd = 1) )
summary(x2)
x2[1:5000] <- x2[1:5000] - 1
tranx2 <- log(x2 + 1)
pop <- rep(0, 10000)
pop[5001:10000] <- pop[5001:10000] + 1
gendata <- data.frame(cbind(x1, x2, tranx2, pop))
names(gendata) <- c("x1", "x2", "tranx2", "pop")
detach(trdata)
attach(gendata)
plot(x1, x2, col=3-pop)
#
# First I'll use LDA.
predgen.trlda <- predict(trlda, newdata=gendata)
gen.lda.rate <- mean( predgen.trlda$class != pop )
gen.lda.rate
plot(x1, x2, col=4-as.numeric(predgen.trlda$class))
#
# Next I'll use logistic regression.
predgen.trlr <- predict(trlr, newdata=gendata, type="response")
gen.lr.rate <- mean( round(predgen.trlr) != pop )
gen.lr.rate
plot(x1, x2, col=3-round(predgen.trlr))
#
# Lastly, I'll use the LDA classifier based on
# the transformed data.
predgen.trldaalt <- predict(trldaalt, newdata=gendata)
gen.ldaalt.rate <- mean( predgen.trldaalt$class != pop )
gen.ldaalt.rate
plot(x1, x2, col=4-as.numeric(predgen.trldaalt$class))
# It can be noted that the logistic regression
# classifier did better than the original LDA
# classifier, but using LDA with the transformed
# data did the best.
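#
# (For convenience, the three estimated error rates side by side:
round( c(lda = gen.lda.rate, logistic = gen.lr.rate,
         lda.tran = gen.ldaalt.rate), 4 )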
#
# Of course, one can try other classifiers and see if they
# perform better. For example, one could use the transformed
# x2 values with logistic regression, and one could try
# deleting x1, since the plot of the original data and the
# p-values from the logistic regression fit suggest that it
# may not be useful. If one considers classifiers using just
# x2, the suggestion in HTF could be used, and a search could
# be done for the best way to make a division using x2 to
# separate the classes for the observations in the training
# sample. (Sketches of these ideas are given below.)
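#
# A quick sketch of these ideas (the object names below are just
# illustrative). First, logistic regression using the transformed
# x2 values with x1 deleted; note that data=trdata is needed
# since gendata is currently attached:
trlralt <- glm(pop ~ tranx2, family=binomial, data=trdata)
summary(trlralt)
mean( round(predict(trlralt, newdata=trdata, type="response")) != trdata$pop )
#
# Second, a simple search over cutpoints of x2: classify an
# observation as class 1 when x2 >= c, and choose the c that
# minimizes the training sample error rate.
cuts <- sort(trdata$x2)
errs <- sapply(cuts, function(c) mean( (trdata$x2 >= c) != trdata$pop ))
best.cut <- cuts[which.min(errs)]
best.cut
min(errs)   # keeping in mind that the training error rate is an
            # optimistic estimate of the generalization error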