# Classification with vowel data set (11 classes, 10 predictors) referred to in Ch. 4 of HTF2. # Note: Developed with vesion 2.9.0. # Note: I designed this so that it'll work well to paste the commands into R in chunks. # Each time, go down to the next line with just ##### on it and paste all of the # commands in at one time. Then after the commands are executed, look at the # comments and output and look at the graph before pasting in the next chunk of # commands. (If you paste everything in all at once you won't have much time to # look at the graphics.) # I'll load some libraries and set the random number seed. library(class) library(MASS) library(e1071) library(kknn) library(klaR) library(nnet) set.seed(7) ####################################################### # Reading in Data and Creation of Canonical Variables # ####################################################### # I'll read in the training data and delete the unnecessary case id column. fulltrndat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTrain.txt",sep=',',header=T) trndat <- fulltrndat[,-1] # Next I'll scale the data. sctrn.vars <- scale(trndat[,-1], center=TRUE, scale=TRUE) sctrndat <- data.frame( cbind( trndat[,1], sctrn.vars ) ) names(sctrndat) <- names(trndat) # Now I will compute the canonical variables. # (Note: Some classification will be done using original variables, # and some will be done using canonical variables.) scldafit <- lda(y~., sctrndat) canvar.trn <- as.matrix(sctrndat[,-1]) %*% scldafit$scaling canvartrn <- data.frame( cbind( trndat[,1], canvar.trn ) ) names(canvartrn) <- c("y","cv1","cv2","cv3","cv4","cv5","cv6","cv7","cv8","cv9","cv10") # Now read in test data and create canonical variables. fulltstdat <- read.table("http://mason.gmu.edu/~csutton/HTF2VowelTest.txt",sep=',',header=T) tstdat <- fulltstdat[,-1] sctst.vars <- scale(tstdat[,-1],center=attr(sctrn.vars,"scaled:center"), scale=attr(sctrn.vars,"scaled:scale")) sctstdat <- data.frame( cbind(tstdat[,1], sctst.vars ) ) names(sctstdat) <- names(sctrndat) canvar.tst <- as.matrix(sctstdat[,-1]) %*% scldafit$scaling canvartst <- data.frame( cbind( tstdat[,1], canvar.tst ) ) names(canvartst) <- names(canvartrn) ############################################ # Classification Using Logistic Regression # ############################################ # P. 173 of the 2002 Sage book # An R and S-Plus Companion to Applied Regression # by John Fox states # "The multinomial logit model cannot be fit by glm, # but the multinom function in the nnet library ... # will do the trick". # The help page for multinom indicates that the predictor # variables should be roughly scaled to [0,1] or the fit # will be slow or may not converge at all. When I used # multinom with 10 predictors, whether they were from the # original data, the scaled data, or the canonical variables, # it ran for 100 iterations and did not converge (although # the resulting predictions were not horrible). The scaled # predictors have standard deviation 1, but of course the # values do not all belong to [0,1] (since the sample mean # is 0 for each of the scaled predictors). I'll see what # happens if I do a linear transformation on each of the # original predictors to make them have range [0,1]. (Note: # Any linear transformation of the original predictors, # whether this one, or the one corresponding to scaling the # data, ought to result in the same fitted model ... if # convergence occurs.) transform.trn <- matrix(0, nrow=nrow(trndat), ncol=(ncol(trndat)-1)) transform.tst <- matrix(0, nrow=nrow(tstdat), ncol=(ncol(tstdat)-1)) for (i in 2:11) { transform.trn[,i-1] <- (trndat[,i]-min(trndat[,i]))/(max(trndat[,i])-min(trndat[,i])) transform.tst[,i-1] <- (tstdat[,i]-min(trndat[,i]))/(max(trndat[,i])-min(trndat[,i])) } # (Note: Above the test data was transformed the same way as the training data ... # so used min and max values from training data to transform test data.) transtrn <- data.frame( cbind( trndat[,1], transform.trn ) ) names(transtrn) <- names(trndat) transtst <- data.frame( cbind( tstdat[,1], transform.tst ) ) names(transtst) <- names(tstdat) logreg.fit1 <- multinom(as.factor(y) ~ ., data=transtrn) # We still don't achieve convergence. (Often when a log reg fit # fails to converge, some of the coef values and/or estimated # standard error values are rather extreme, but this doesn't happen # in a big way here (see output below).) (Note: Wald = T below results # in Wald test statistics being given (so one can see if there are any # unimportant variables).) summary(logreg.fit1, Wald=T) # One way to choose a decent logistic regression model from amongst # a collection of fitted models based on different predictors is to # choose the one yielding the smallest value of the AIC. We can # note that here we got a value of about 903. # Let's see how well the fitted model predicts the classes for the # test set. (Note: In practice one may not have a test set to use # to get estimates of the generalizatiion error rates, and so it's # good to have some other way to select a model.) pred.logreg <- predict(logreg.fit1, newdata=transtst, type="class") logreg.rate <- mean( pred.logreg != transtst[,1] ) logreg.rate ##### # Maybe there just isn't enough data to fit 110 parameters well. # So I'll now try using the first three canonical variables. logreg.fit2 <- multinom(as.factor(y) ~ ., data=canvartrn[1:4]) summary(logreg.fit2, Wald=T) pred.logreg <- predict(logreg.fit2, newdata=canvartst[1:4], type="class") logreg.rate <- mean( pred.logreg != canvartst[,1] ) logreg.rate # This time we got convergence, although the AIC (1085) is larger # (as is the error rate of the test sample predictions). ##### # Now I'll just use the first two canonical variables. logreg.fit3 <- multinom(as.factor(y) ~ ., data=canvartrn[1:3]) summary(logreg.fit3, Wald=T) pred.logreg <- predict(logreg.fit3, newdata=canvartst[1:3], type="class") logreg.rate <- mean( pred.logreg != canvartst[,1] ) logreg.rate # Even though the 2nd model (using one fewer canonical variable) did a # better job predicting the classes of the test set, in practice we may # want to be able to choose a model w/o having test set performance # results. If we base model selection on minimizing the AIC, we'd # prefer the model based on the larger number of variables. (Note: # (7.27) on p. 230 of HTF2 indicates that when the sample size is # somewhat large, a small difference in the number of variables may # have little effect on the AIC.) # We can use the anove f'n to do a test to see if there is significant # evidence that the coefficient of cv3 is nonzero. compare <- anova(logreg.fit3, logreg.fit2) compare # There is strong evidence that at least one of the coefficients # of cv3 is nonzero. (Recall there are 10 coefficients of cv3 in # an 11-class model.) So two things (AIC and this test result) # suggest it's better to include cv3 than to not include it (even # though test set performance indicates otherwise). ##### # Since logistic regression results in piecewise linear decision # boundaries, and previous exploration of and experimentation with # the data suggests that perhaps having curved boundaries might be # better, let's see what happens if 2nd-order terms are included. logreg.fit4 <- multinom(as.factor(y) ~ cv1 + cv2 + I(cv1^2) + I(cv2^2) + I(cv1*cv2), data=canvartrn[1:3]) # It can be noted that although convergence didn't occur, based on the # values given above, one might think it was close to convergence when it # stopped. So maybe we need not worry so much about the lack of convergence. # (Notes: (1) From the R documentation, I don't see how one can increase # the number of iterations. (2) The value given with the iterations is # the absolute value of the log-likelihood (evaluated at the mles).) summary(logreg.fit4, Wald=T) compare <- anova(logreg.fit3, logreg.fit4) compare # The 2nd-order model based on cv1 and cv2 has a smaller AIC than the # 1-st order model based on these two variables. Also, a chi-square # test indicated that the 2nd-order terms are collectively significant. pred.logreg <- predict(logreg.fit4, newdata=canvartst[1:3], type="class") logreg.rate <- mean( pred.logreg != canvartst[,1] ) logreg.rate # Even though convergence didn't occur, this produced the lowest AIC so # far (and also the lowest error rate from the test sample). # Let's try a 2nd-order model using first three canonical variables. logreg.fit5 <- multinom(as.factor(y) ~ cv1 + cv2 + cv3 + I(cv1^2) + + I(cv2^2) + I(cv3^2) + + I(cv1*cv2) + I(cv1*cv3) + + I(cv2*cv3), data=canvartrn[1:4]) summary(logreg.fit5, Wald=T) compare <- anova(logreg.fit2, logreg.fit5) compare # The 2nd-order model based on cv1, cv2, and cv3 has a smaller AIC than # the 1-st order model based on these three variables. (It's the smallest # AIC we've obtained so far from models involving the canonical variables.) # Also, a chi-square test indicated that the 2nd-order terms are collectively # significant. # So now it seems sensible to compare the two 2nd-order models. compare <- anova(logreg.fit4, logreg.fit5) compare # The larger 2nd-order model (involving cv3) has a smaller AIC # (832 vs. 922), and also the chi-square test indicates the terms # involving cv3 are collectively significant. # Let's see how well the 2nd-order model based on the first three # canonical variables predicts the test set. pred.logreg <- predict(logreg.fit5, newdata=canvartst[1:4], type="class") logreg.rate <- mean( pred.logreg != canvartst[,1] ) logreg.rate # Lower AIC, but worse performance on test set (50.2% errors vs. 47.4% # errors from 2nd-order model based on just two canonical variables). ##### # Since previous experience with this data indicated that # x.1 and x.2 were the best predictors, and that using x.8 # might also be useful (but not necessarily), let's investigate # some models using a subset of the best original predictors # (transformed to make all of the value lie in [0,1]). (Note: # 2nd-order terms based on the transformed variables will also # have all values in [0,1].) logreg.fit6 <- multinom(as.factor(y) ~ x.1 + x.2 + x.8, data=transtrn) summary(logreg.fit6, Wald=T) # A horribly large AIC! Let's check out the pertinent chi-square test. compare <- anova(logreg.fit6, logreg.fit1) compare # There's strong evidence that at least one of the coefficients of # the variables not included in the smaller model is nonzero. # Let's see how well the fitted model predicts the classes for the # test set. pred.logreg <- predict(logreg.fit6, newdata=transtst, type="class") logreg.rate <- mean( pred.logreg != transtst[,1] ) logreg.rate # Even though the smaller model has a much larger value for the AIC, # and the chi-square test pertaining to the model differences is highly # significant (indicating too many variables were deleted from the # smaller model), the smaller actually predicts the test set classes # a tad better (51.3% errors vs. 51.9% errors from the larger model). # One might wonder why I didn't use a stepwise approach to select # variables for a reduced model. Well, neither the stepclass f'n # or the stepAIC f'n seems to work with the multinom f'n. # Here's a plot summarizing the results so far. aic <- c(903,1085,1148,922,832,1440) test.error <- c(51.9,54.8,49.8,47.4,50.2,51.2) plot(aic,test.error,col="cornflowerblue",pch=19, main="Prediction Error vs. AIC",xlab="AIC",ylab="Prediction Error") ##### # Now I want to focus on classifiers for the two-class problem of # classifying just the Class 2 and Class 3 cases. # Just in case I do something involving random numbers, I'll reset # the random number seed (so that if things are added above, they # won't affect what's below). set.seed(12) # First I'll create some pertinent data sets from the original # variables (linearly transformed to belong to [0,1]) and from # the canonical varibles. ind2 <- 2 + 11*c(0:47) ind3 <- 3 + 11*c(0:47) index <- c(ind2,ind3) transtrn.small <- transtrn[index,] canvartrn.small <- canvartrn[index,] ind2 <- 2 + 11*c(0:41) ind3 <- 3 + 11*c(0:41) index <- c(ind2,ind3) transtst.small <- transtst[index,] canvartst.small <- canvartst[index,] # I'm going to use the glm function to do logistic regression. fit1 <- glm(as.factor(y) ~ ., data=transtrn.small, family=binomial) summary(fit1) # I'll use backwards elimination to search for a good model. # I find it interesting that x.2, which was found to be an # important predictor previously (in stepwise LDA and QDA # when all 11 classes were considered) yields the largest # p-value, and so is the first variable eliminated. fit2 <- glm(as.factor(y) ~ ., data=transtrn.small[,-3], family=binomial) summary(fit2) # The AIC went down. Now x.7 should be eliminated. But # before doing that, let's check to see if a chi-square test # (an asymptotic likelihood ratio test) yields a p-value # consistent with the one from the Wald test about the # coefficient of x.2 from the previous logistic regression # output. Unfortunately, the anova f'n doesn't supply us # with the pertinent p-value (like it does in a lot of cases), # and so we have to produce one using a value from the outputted # object, as shown below. compare <- anova(fit2, fit1) compare p.value <- 1 - pchisq( compare$Dev[2], df=1 ) p.value # We got 0.8796 from the Wald test, and here we get 0.8784. # So pretty close. (It's generally thought to be better to # use chi-square tests, but it's a lot of trouble. For step # 1 in the backwards elimination strategy, I'd have to fit a # total of 11 logistic regression models to get the 10 p-values # I want, whereas I get all 10 Wald test p-values from a single # logistic regression fit. So I'll just rely on Wald test p-values # for now.) fit3 <- glm(as.factor(y) ~ ., data=transtrn.small[,-c(3,8)], family=binomial) summary(fit3) # Now I'll remove x.5. fit4 <- glm(as.factor(y) ~ ., data=transtrn.small[,-c(3,6,8)], family=binomial) summary(fit4) # Now I'll remove x.1. (I'm surprised that x.1 seems expendable since it has # showed up as a strong predictor previously.) fit5 <- glm(as.factor(y) ~ ., data=transtrn.small[,-c(2,3,6,8)], family=binomial) summary(fit5) # Now I'll remove x.9. fit6 <- glm(as.factor(y) ~ ., data=transtrn.small[,-c(2,3,6,8,10)], family=binomial) summary(fit6) # Let's see if the sequence of models found by the backwards elimination ploy # results in a sequence of decreasing misclassification rates on the test data. # (Note: type="class" didn't work when I gave the predict f'n the glm object. # So I use type="response" which returns probabilities (for class 3). I can # round the probabilities to get a vector of 0s and 1s which can be compared # to a vector of the true classes with 2 subtracted (to convert the 2s and 3s # to 0s and 1s). Since AIC decreased as the models were made smaller, I'll # plot the test error rates against AIC. If smaller AIC corresponds to better # prediction accuracy (and so larger AIC would correspond to worse accuracy) # we should see an increasing trend on the plot (the larger the AIC, the larger # the error rate). error <- rep(0,6) pred <- predict(fit1, newdata=transtst.small, type="response") error[1] <- mean( round(pred) != (transtst.small[,1]-2) ) pred <- predict(fit2, newdata=transtst.small, type="response") error[2] <- mean( round(pred) != (transtst.small[,1]-2) ) pred <- predict(fit3, newdata=transtst.small, type="response") error[3] <- mean( round(pred) != (transtst.small[,1]-2) ) pred <- predict(fit4, newdata=transtst.small, type="response") error[4] <- mean( round(pred) != (transtst.small[,1]-2) ) pred <- predict(fit5, newdata=transtst.small, type="response") error[5] <- mean( round(pred) != (transtst.small[,1]-2) ) pred <- predict(fit6, newdata=transtst.small, type="response") error[6] <- mean( round(pred) != (transtst.small[,1]-2) ) AIC <- c(fit1$aic, fit2$aic, fit3$aic, fit4$aic, fit5$aic, fit6$aic) plot(AIC, error, type="l", col="limegreen", main="Generalization Error vs. AIC", sub="Classification of 2nd and 3rd Classes Using Original Variables", xlab="AIC", ylab="Estimated Error Rate") results <- cbind(AIC,error) results # In this case, once again, AIC doesn't do a good # job of identifying the best performing models. ##### # We can use the stepAIC f'n with a glm object # (from the fit based on all of the variables) # to do stepwise logistic regression, deleting # and possibly adding back in variables in a # step-by-step manner in an attempt to minimize # the AIC. fit.sw <- stepAIC(fit1, scope = list(upper = ~., lower = ~1), trace = FALSE) fit.sw$anova # This stepwise tactic results in the same final model # I obtained above using backwards elimination based # on Wald test p-values, but the sequence of models is # slightly different. Here the variables were removed # in the order # x.2, x.7, x.1, x.5, x.9 # and before they were removed in the order # x.2, x.7, x.5, x.1, x.9. ##### # Now let's try using the canonical variables. cvfit1 <- glm(as.factor(y) ~ ., data=canvartrn.small, family=binomial) summary(cvfit1) # I'll remove cv5. cvfit2 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-6], family=binomial) summary(cvfit2) # I'll remove cv3. cvfit3 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(4,6)], family=binomial) summary(cvfit3) # I'll remove cv9. cvfit4 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(4,6,10)], family=binomial) summary(cvfit4) # Even though there are no more large p-values, instead of # stopping I'll keep going in order to more thoroughly explore # the relationship between AIC and prediction accurcay. I'll # remove cv7. cvfit5 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(4,6,8,10)], family=binomial) summary(cvfit5) # I'll remove cv1. cvfit6 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,6,8,10)], family=binomial) summary(cvfit6) # I'll remove cv10. cvfit7 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,6,8,10,11)], family=binomial) summary(cvfit7) # I'll remove cv4. cvfit8 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,5,6,8,10,11)], family=binomial) summary(cvfit8) # I'll remove cv6. cvfit9 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,5,6,7,8,10,11)], family=binomial) summary(cvfit9) # I'll remove cv8. cvfit10 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(2,4,5,6,7,8,9,10,11)], family=binomial) summary(cvfit10) # Now let's look at a plot of test set error against AIC. error <- rep(0,10) pred1 <- predict(cvfit1, newdata=canvartst.small, type="response") error[1] <- mean( round(pred1) != (canvartst.small[,1]-2) ) pred2 <- predict(cvfit2, newdata=canvartst.small, type="response") error[2] <- mean( round(pred2) != (canvartst.small[,1]-2) ) pred3 <- predict(cvfit3, newdata=canvartst.small, type="response") error[3] <- mean( round(pred3) != (canvartst.small[,1]-2) ) pred4 <- predict(cvfit4, newdata=canvartst.small, type="response") error[4] <- mean( round(pred4) != (canvartst.small[,1]-2) ) pred5 <- predict(cvfit5, newdata=canvartst.small, type="response") error[5] <- mean( round(pred5) != (canvartst.small[,1]-2) ) pred6 <- predict(cvfit6, newdata=canvartst.small, type="response") error[6] <- mean( round(pred6) != (canvartst.small[,1]-2) ) pred7 <- predict(cvfit7, newdata=canvartst.small, type="response") error[7] <- mean( round(pred7) != (canvartst.small[,1]-2) ) pred8 <- predict(cvfit8, newdata=canvartst.small, type="response") error[8] <- mean( round(pred8) != (canvartst.small[,1]-2) ) pred9 <- predict(cvfit9, newdata=canvartst.small, type="response") error[9] <- mean( round(pred9) != (canvartst.small[,1]-2) ) pred10 <- predict(cvfit10, newdata=canvartst.small, type="response") error[10] <- mean( round(pred10) != (canvartst.small[,1]-2) ) AIC <- c(cvfit1$aic,cvfit2$aic,cvfit3$aic,cvfit4$aic,cvfit5$aic,cvfit6$aic, cvfit7$aic,cvfit8$aic,cvfit9$aic,cvfit10$aic) plot(AIC, error, type="l", col="darkorange", main="Generalization Error vs. AIC", sub="Classification of 2nd and 3rd Classes Using Canonical Variables (from all 11 classes)", xlab="AIC", ylab="Estimated Error Rate",xlim=c(45,124),ylim=c(0.075,0.345)) results <- cbind(AIC,error) results # As the AIC initially decreased the test error did not. # The p-values indicated I should have stopped after the # fourth model (with three variables removed), but when # I kept going, even though the AIC got larger, I obtained # models which resulted in appreciably lower test error rates. ##### # Now I want to look at what happens if we start with 10 # canonical variables and produce a sequence of simpler # models by removing one variable at a time, starting with # cv10, then cv9, then cv8, etc. cvfit11 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-11], family=binomial) cvfit12 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(10,11)], family=binomial) cvfit13 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(9,10,11)], family=binomial) cvfit14 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(8,9,10,11)], family=binomial) cvfit15 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(7,8,9,10,11)], family=binomial) cvfit16 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(6,7,8,9,10,11)], family=binomial) cvfit17 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(5,6,7,8,9,10,11)], family=binomial) cvfit18 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(4,5,6,7,8,9,10,11)], family=binomial) cvfit19 <- glm(as.factor(y) ~ ., data=canvartrn.small[,-c(3,4,5,6,7,8,9,10,11)], family=binomial) newerror <- rep(0,10) newerror[1] <- error[1] pred11 <- predict(cvfit11, newdata=canvartst.small, type="response") newerror[2] <- mean( round(pred11) != (canvartst.small[,1]-2) ) pred12 <- predict(cvfit12, newdata=canvartst.small, type="response") newerror[3] <- mean( round(pred12) != (canvartst.small[,1]-2) ) pred13 <- predict(cvfit13, newdata=canvartst.small, type="response") newerror[4] <- mean( round(pred13) != (canvartst.small[,1]-2) ) pred14 <- predict(cvfit14, newdata=canvartst.small, type="response") newerror[5] <- mean( round(pred14) != (canvartst.small[,1]-2) ) pred15 <- predict(cvfit15, newdata=canvartst.small, type="response") newerror[6] <- mean( round(pred15) != (canvartst.small[,1]-2) ) pred16 <- predict(cvfit16, newdata=canvartst.small, type="response") newerror[7] <- mean( round(pred16) != (canvartst.small[,1]-2) ) pred17 <- predict(cvfit17, newdata=canvartst.small, type="response") newerror[8] <- mean( round(pred17) != (canvartst.small[,1]-2) ) pred18 <- predict(cvfit18, newdata=canvartst.small, type="response") newerror[9] <- mean( round(pred18) != (canvartst.small[,1]-2) ) pred19 <- predict(cvfit19, newdata=canvartst.small, type="response") newerror[10] <- mean( round(pred19) != (canvartst.small[,1]-2) ) newAIC <- c(cvfit1$aic,cvfit11$aic,cvfit12$aic,cvfit13$aic,cvfit14$aic,cvfit15$aic,cvfit16$aic, cvfit17$aic,cvfit18$aic,cvfit19$aic) points(newAIC, newerror, type="l", col="limegreen") newresults <- cbind(newAIC,newerror) newresults # The best performing model is the one using cv1 through cv4, # but it's not clear how one would be led to that model from # the training data. (Note: The best performing model has an # AIC about 50% larger than the smallest AIC, but makes less # than 1/3 the errors that the model with the smallest AIC made.) ##### # Finally I'll use the stepAIC function to perform stepwise # logistic regression using the canonical variables. (It # will delete and possibly add back in deleted variables # as it decreases the AIC.) cvfit.sw <- stepAIC(cvfit1, scope = list(upper = ~., lower = ~1), trace = FALSE) cvfit.sw$anova # The sequence of four models indicated above are the same as # the first four in the sequence obtained using backwards # elimination based on p-values. I'll add this "path" to # the plot. (To better see the small stepwise path, I suggest # looking at a full screen version of the plot.) points(AIC[1:4], error[1:4], type="l", lty="dotted", col="darkslateblue") legend(45,0.35,legend=c("back. elim. (p-values)","canonical variable order","stepwise (AIC)"), fill=c("darkorange","limegreen","darkslateblue")) # It should be noted that the canonical variables used above were # based on the data from all 11 classes (and by design attempt to # best separate all 11 classes). They aren't ideal for classifying # just the two classes at hand. (Note: For just the two classes, # just a single discriminant function from LDA takes the place of the # canonical variables (which are for when you want a dimension less # than the number of classes minus 1 (see top of p. 114 of HTF2)).) # The main purpose of the exercise above was to obtain more sequences # of models in order to explore whether or not minimizing the AIC is a # good way to identify a good predicting model. With this data, using # AIC doesn't seem to work well, but we should try other data sets before # jumping to a firm conclusion. ##### End #####