# Generate the data described on p. 17 of HTF.
# Step 1: for each class, generate 10 observations from the appropriate
# bivariate normal distribution to serve as means for the components of the
# mixture distribution.
# Once installed, mvtnorm and multinomRob need not be reinstalled during
# subsequent sessions.
library(MASS)
library(mvtnorm)      # for generating multivariate normal numbers
library(multinomRob)  # for generating multinomial numbers
library(class)        # for the knn function

set.seed(3)
gmean <- rmvnorm(10, mean=c(1,0), sigma=diag(1,2))  # "g" means "green" group
rmean <- rmvnorm(10, mean=c(0,1), sigma=diag(1,2))  # "r" means "red" group

# Plot the component means.
plot(gmean[,1], gmean[,2], col=3, main='mean',
     xlim=c(-3.5,4.5), ylim=c(-3.5,4.5), xlab='x1', ylab='x2')
points(rmean[,1], rmean[,2], col=2)

# Step 2: generate 10 counts summing to 100 for each class from a
# multinomial(100, 0.1, 0.1, ..., 0.1) distribution.
gcounts <- rmultinomial(100, rep(1/10,10))
rcounts <- rmultinomial(100, rep(1/10,10))

# Step 3: using the component means generated in step 1 and the counts
# generated in step 2 as sample sizes (for the various components of the
# mixture distributions), generate 100 training observations for each class.
gclass <- matrix(0,100,2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + gcounts[i]
  gclass[start:finish,] <- rmvnorm(gcounts[i], mean=gmean[i,], sigma=diag(0.2,2))
}
rclass <- matrix(0,100,2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + rcounts[i]
  rclass[start:finish,] <- rmvnorm(rcounts[i], mean=rmean[i,], sigma=diag(0.2,2))
}

# Step 4: code the response as 0 for the green class and 1 for the red class,
# then combine the two classes into the training data.
gclass <- cbind(gclass, 0)
rclass <- cbind(rclass, 1)
trndata <- rbind(gclass, rclass)
trndata <- data.frame(trndata)
names(trndata) <- c("x1","x2","y")

# Plot the training data.
plot(trndata[,1], trndata[,2], col=3-trndata[,3],
     main='training data from p. 17', xlab='x1', ylab='x2')

# Estimate the Bayes error rate on the training data.
greenpdf <- rep(0,200)
redpdf   <- rep(0,200)
for (i in 1:10) {
  greenpdf <- greenpdf + dmvnorm(trndata[,1:2], gmean[i,], diag(0.2,2))
  redpdf   <- redpdf   + dmvnorm(trndata[,1:2], rmean[i,], diag(0.2,2))
}
bayes.class <- ifelse(greenpdf >= redpdf, 0, 1)
est.bayes.rate <- mean(bayes.class != trndata[,3])
est.bayes.rate

# Step 5: fit a least squares regression model to the training data, then get
# the predicted value for each observation using (2.7) on p. 13.
# Because trndata is a data frame with named columns, predict() can match the
# variable names when we do prediction.
lmfit1 <- lm(y ~ ., data=trndata)
predict.lmfit1 <- predict(lmfit1, newdata=trndata[,-3])
train.error.lmfit1 <- mean(ifelse(predict.lmfit1 <= 0.5, 0, 1) != trndata[,3])
train.error.lmfit1
plot(trndata[,1], trndata[,2], col=3-trndata[,3],
     main='Least Squares Fit', xlab='x1', ylab='x2')
abline((0.5 - lmfit1$coef[1])/lmfit1$coef[3], -lmfit1$coef[2]/lmfit1$coef[3])

# Now use a quadratic fit.  Syntactic column names (x1sq, x2sq, x1x2) are used
# so that the formula y ~ . and predict() handle the derived terms cleanly.
trndata2 <- cbind(trndata[,1], trndata[,2], trndata[,1]^2, trndata[,2]^2,
                  trndata[,1]*trndata[,2], trndata[,3])
trndata2 <- data.frame(trndata2)
names(trndata2) <- c("x1","x2","x1sq","x2sq","x1x2","y")
lmfit2 <- lm(y ~ ., data=trndata2)
predict.lmfit2 <- ifelse(predict(lmfit2) <= 0.5, 0, 1)
train.error.lmfit2 <- mean(predict.lmfit2 != trndata2[,6])
train.error.lmfit2
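# Optional sketch (not part of the original steps): visualize the quadratic
# fit's decision boundary by evaluating the fitted surface on a grid and
# drawing its 0.5 contour over the training data.  The plotting range and the
# 100 x 100 grid resolution are illustrative assumptions, not prescribed by
# the text.
x1.grid <- seq(-3.5, 4.5, length.out=100)
x2.grid <- seq(-3.5, 4.5, length.out=100)
grid  <- expand.grid(x1=x1.grid, x2=x2.grid)
grid2 <- data.frame(grid$x1, grid$x2, grid$x1^2, grid$x2^2, grid$x1*grid$x2)
names(grid2) <- names(trndata2)[1:5]
surf <- matrix(predict(lmfit2, newdata=grid2), 100, 100)
plot(trndata[,1], trndata[,2], col=3-trndata[,3],
     main='Quadratic Fit', xlab='x1', ylab='x2')
contour(x1.grid, x2.grid, surf, levels=0.5, add=TRUE, drawlabels=FALSE)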
# Step 6: use kNN to predict the class of each observation of the training
# data.  (Don't include the class label when giving the two data sets to the
# knn function.)
K <- 30
train.error.knn <- numeric(K)
for (i in 1:K) {
  knn.fit <- knn(trndata[,-3], trndata[,-3], cl=trndata[,3], k=2*i-1)
  train.error.knn[i] <- mean((as.numeric(knn.fit)-1) != trndata[,3])
}
plot(2*(1:K)-1, train.error.knn, type="b", xlab='k value', ylab='error',
     main='kNN training data error', col=3, ylim=c(0,0.4))

# Step 7: repeat steps 2 to 4 to get test data of size 10,000
# (5,000 from each class).
gcounts <- rmultinomial(5000, rep(1/10,10))
rcounts <- rmultinomial(5000, rep(1/10,10))
gclass <- matrix(0,5000,2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + gcounts[i]
  gclass[start:finish,] <- rmvnorm(gcounts[i], mean=gmean[i,], sigma=diag(0.2,2))
}
rclass <- matrix(0,5000,2)
finish <- 0
for (i in 1:10) {
  start <- finish + 1
  finish <- finish + rcounts[i]
  rclass[start:finish,] <- rmvnorm(rcounts[i], mean=rmean[i,], sigma=diag(0.2,2))
}
gclass <- cbind(gclass, 0)
rclass <- cbind(rclass, 1)
testdata <- rbind(gclass, rclass)
testdata <- data.frame(testdata)
names(testdata) <- names(trndata)

# Step 8: use the least squares regression rule from step 5 to classify the
# test data.  (The model is NOT refit here; we reuse lmfit1, which was fit to
# the training data.)
predict.lmfit1 <- predict(lmfit1, newdata=testdata)
test.error.lmfit1 <- mean(ifelse(predict.lmfit1 <= 0.5, 0, 1) != testdata[,3])
test.error.lmfit1

# Estimate the test error of the quadratic fit.
testdata2 <- cbind(testdata[,1], testdata[,2], testdata[,1]^2, testdata[,2]^2,
                   testdata[,1]*testdata[,2], testdata[,3])
testdata2 <- data.frame(testdata2)
names(testdata2) <- names(trndata2)
predict.lmfit2 <- ifelse(predict(lmfit2, newdata=testdata2) <= 0.5, 0, 1)
test.error.lmfit2 <- mean(predict.lmfit2 != testdata2[,6])
test.error.lmfit2

# Estimate the Bayes error rate using the test data.
greenpdf <- rep(0,10000)
redpdf   <- rep(0,10000)
for (i in 1:10) {
  greenpdf <- greenpdf + dmvnorm(testdata[,1:2], gmean[i,], diag(0.2,2))
  redpdf   <- redpdf   + dmvnorm(testdata[,1:2], rmean[i,], diag(0.2,2))
}
bayestest.class <- ifelse(greenpdf >= redpdf, 0, 1)
est.bayes.rate <- mean(bayestest.class != testdata[,3])
est.bayes.rate

# Step 9: use kNN with the training data to classify the test data.
test.error.knn <- numeric(K)
for (i in 1:K) {
  knn.fit <- knn(trndata[,-3], testdata[,-3], cl=trndata[,3], k=2*i-1)
  test.error.knn[i] <- mean((as.numeric(knn.fit)-1) != testdata[,3])
}
# Plot the kNN training error (green) and test error (red) against k, with
# the estimated Bayes rate from the test data as a horizontal reference line.
plot(2*(1:K)-1, train.error.knn, type="b", xlab='k value', ylab='error',
     main='kNN training and test error', col=3, ylim=c(0,0.4))
lines(2*(1:K)-1, test.error.knn, type="b", col=2)
abline(h=est.bayes.rate, col=7)  # estimated Bayes rate
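# Optional sketch (beyond the original steps): draw the kNN decision boundary
# for a single neighborhood size over the training data, in the spirit of HTF
# Figure 2.2.  The choice k=15 and the 100 x 100 grid are illustrative
# assumptions, not prescribed by the text.
k.show <- 15
bx1 <- seq(-3.5, 4.5, length.out=100)
bx2 <- seq(-3.5, 4.5, length.out=100)
bgrid <- expand.grid(x1=bx1, x2=bx2)
knn.grid <- knn(trndata[,-3], bgrid, cl=trndata[,3], k=k.show)
plot(trndata[,1], trndata[,2], col=3-trndata[,3],
     main=paste('kNN decision boundary, k =', k.show), xlab='x1', ylab='x2')
contour(bx1, bx2, matrix(as.numeric(knn.grid)-1, 100, 100),
        levels=0.5, add=TRUE, drawlabels=FALSE)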