# Chemical Engineering Example
#
# ... to show
# (1) how to read data into R from a (nice) web page,
#     name the variables, and make the data nice to use,
# (2) how to determine if a full 2nd-order model is
#     (statistically) significantly better than a
#     1st-order model,
# (3) how to determine if a full 2nd-order model is
#     (statistically) significantly better than a
#     2nd-order additive model,
# (4) how to compare 3rd- and 4th-order models,
#     if needed,
# (5) how to use c+r plots to search for alternative
#     models,
# and as a bonus, I'll finish by showing more about
# reading in data sets.
#
#
# Task 1
# (reading in the data from one of my web pages,
# and making it nice to use)
#
# I have a web page,
# http://mason.gmu.edu/~csutton/MandP220.DAT,
# that shows the data and gives a very brief description.
# I have another web page,
# http://mason.gmu.edu/~csutton/MandP220.dat,
# which just contains the actual data values (in 3 columns),
# that I created (some time ago) by typing the small data set
# into Minitab, and then writing the data into a file.
# Below, I will read the data into R from this second file,
# and then show the data.
# (Note: I didn't have to put header=FALSE since that is the default.)
chemical <- read.table("http://mason.gmu.edu/~csutton/MandP220.dat", header=FALSE)
chemical
#
# It can be noted that the columns/variables do not have nice names
# (the V1, V2, and V3 names are due to R).
# I can name the columns/variables using R's names function.
names(chemical) <- c("yield", "time", "temp")
chemical
#
# As it is now, to use the variable yield, one would have to type
# chemical$yield --- but if we use the attach function this can be
# avoided.
attach(chemical)
yield
#
#
# Task 2
# (compare full 2nd-order model to 1st-order model)
#
# First I will fit a 1st-order model and look at the residuals.
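#
# (Aside: an alternative to attach, which avoids putting the data
# frame on the search path, is R's with function --- just a sketch,
# e.g., the line below computes the mean yield without attaching.)
with(chemical, mean(yield))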
lm1st <- lm(yield ~ time + temp)
summary(lm1st)
plot(lm1st$fit, lm1st$res)
abline(h = 0, col = 8)
#
# It can be seen that both predictors are significant,
# but the residual plot clearly suggests appreciable
# nonlinearity ---
# and so we will now examine a (full) 2nd-order model.
# (First I create the variables for the 2nd-order terms.)
time2 <- time*time
temp2 <- temp*temp
timetemp <- time*temp
lm2ndfull <- lm(yield ~ time + temp + time2 + temp2 + timetemp)
summary(lm2ndfull)
plot(lm2ndfull$fit, lm2ndfull$res)
abline(h = 0, col = 8)
#
# Here is another way to fit the 2nd-order model that
# doesn't require creating the 2nd-order variables.
lm2ndalt <- lm(yield ~ time + temp + I(time^2) + I(temp^2) + I(time*temp))
summary(lm2ndalt)
#
# It can be seen that all five predictors are significant
# (and so really I can address Task 3 at this point ---
# the t test p-value of about 0.027 for the interaction
# variable suggests that we should not simplify to a
# 2nd-order additive model).
# It can also be seen that the residuals do not indicate
# appreciable nonlinearity, nor do they indicate
# appreciable heteroscedasticity.
#
# Although, at this point, there is no real need to do a
# test to determine if there is statistically significant
# evidence that the 2nd-order model is better (since all
# of the 2nd-order terms are significant, and since the
# residuals indicated that the 1st-order model was horrible),
# I will nevertheless demonstrate how to do the appropriate
# F test.
# (To do the F test, we need to use sum of squares values
# that can be found in the ANOVA tables associated with
# the regression fits. Below I will
# show the ANOVA table for the 2nd-order model,
# & show how you can find out the name for the needed SS value.
# Then I will compute the value of the F statistic
# and obtain the p-value.
# (The numerator of the F statistic is
# the difference of the values of the sum of the squared residuals
# for the two models, divided by the difference in the number of
# predictors for the two models, and the denominator of the F
# statistic is the MSE value for the larger model.))
aov2nd <- anova(lm2ndfull)
aov2nd
names(aov2nd)
aov2nd$Sum
aov1st <- anova(lm1st)
Fstat <- ((aov1st$Sum[3] - aov2nd$Sum[6])/3)/aov2nd$Mean[6]
p.value <- 1 - pf(Fstat, 3, aov2nd$Df[6])
p.value
#
# Whoa! After doing some reading, I discovered a much easier way to
# perform the F test --- one can simply use the anova function as I
# use it below.
# (Note: When comparing two models in this way, the larger model must
# use all of the predictors used in the smaller model (and of course
# the larger model uses some additional predictors). Also, I'll point
# out that I didn't delete the more complicated way of getting the
# desired p-value that is shown above, since it does illustrate how
# one can determine what the components of an object are, and it shows
# how specific values from the object can be used in further computations.)
anova(lm1st, lm2ndfull)
#
#
# Task 4
# (consider 3rd- and 4th-order models)
#
# Now I'll compare a full 3rd-order model to the full 2nd-order model.
time3 <- time2*time
temp3 <- temp2*temp
time2temp <- time2*temp
timetemp2 <- time*temp2
lm3rdfull <- lm(yield ~ time+temp+time2+temp2+timetemp+time3+temp3+time2temp+timetemp2)
summary(lm3rdfull)
plot(lm3rdfull$fit, lm3rdfull$res)
abline(h = 0, col = 8)
anova(lm2ndfull, lm3rdfull)
#
# The F test result indicates that the 3rd-order model is superior,
# but it is interesting to note that not many of the predictors are
# highly significant (based on the t test results shown in the
# summary output). So it might be good to investigate a 3rd-order
# additive model.
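#
# (Aside: another way to get the smaller model, without retyping its
# formula, is R's update function, which can drop terms from an
# existing fit --- just a sketch, but the anova call below should
# give the same F test result as the one above.)
lm1stalt <- update(lm2ndfull, . ~ . - time2 - temp2 - timetemp)
anova(lm1stalt, lm2ndfull)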
lm3rdadd <- lm(yield ~ time+time2+time3 + temp+temp2+temp3)
summary(lm3rdadd)
plot(lm3rdadd$fit, lm3rdadd$res)
abline(h = 0, col = 8)
anova(lm3rdadd, lm3rdfull)
#
# The F test result indicates that the full model is better than
# the additive model.
#
# Below I show another way to fit an additive model.
# It can be seen from the R**2 value that this fit seems to be
# equivalent (and it *is* equivalent), but the coefficients are
# different. This is due to the fact that when poly is used,
# orthogonal polynomials are created, and the individual
# predictors used are not the simple power-transformed variables.
# I think that the first way I did it is better with regard to
# producing more easily interpretable results (and I'm not going
# to further deal with orthogonal polynomials).
alt3rdadd <- lm(yield ~ poly(time,3) + poly(temp,3))
summary(alt3rdadd)
#
# Now let's see what happens when a 4th-order model is considered.
time4 <- time3*time
temp4 <- temp3*temp
time3temp <- time3*temp
timetemp3 <- time*temp3
time2temp2 <- time2*temp2
lm4thfull <- lm(yield ~ time+temp+time2+temp2+timetemp+time3+temp3+time2temp+timetemp2+time4+temp4+time3temp+timetemp3+time2temp2)
summary(lm4thfull)
plot(lm4thfull$fit, lm4thfull$res)
abline(h = 0, col = 8)
anova(lm3rdfull, lm4thfull)
#
# Finally, we have a nonsignificant F test result ---
# we do not have strong evidence that adding the five
# 4th-order terms really gives a better model
# (even though the R**2 value is larger).
#
# Later, we will consider other ways to compare models
# (e.g., Mallows' C_p and AIC). As a teaser, I'll
# use R's step function, which selects a model based on
# AIC values. (I'll delay giving details about AIC,
# but in class I will explain what R's step function is
# doing. Basically, it searches for the model which
# has the smallest AIC value.)
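#
# (Aside: AIC values for several fitted models can also be obtained
# directly with R's AIC function --- a quick sketch of the kind of
# comparison that the step function automates; smaller values are
# better. step reports AIC on a slightly different scale, but for
# models fit to the same data the ranking is the same.)
AIC(lm1st, lm2ndfull, lm3rdfull, lm4thfull)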
step(lm4thfull)
#
# Since the step results suggest that some of the
# interaction terms in the full 3rd-order model
# aren't needed, and that perhaps some 4th-order terms
# might be useful, rather than stop at this point,
# I'll explore using c+r plots to find transformations
# of time and temp to use as predictors. Due to the
# strong nonlinearity exhibited in the fit of the
# 1st-order model, I'll start by using time2 and temp2
# as initial predictors.
crtime <- time2
crtemp <- temp2
crfit <- lm(yield ~ crtime + crtemp)
#
# The point of the cr-prefixed objects above is to make
# it simpler to copy and paste previously used code as
# I investigate various transformations.
#
# First I will look at the initial c+r plot for time.
plot(crtime, crtime*crfit$coef[2] + crfit$res)
#
# Next I will look at the initial c+r plot for temp.
plot(crtemp, crtemp*crfit$coef[3] + crfit$res)
#
# I'll start by altering the transformation of time,
# going from a power of 2 to a power of 2.5.
crtime <- time**2.5
crfit <- lm(yield ~ crtime + crtemp)
plot(crtime, crtime*crfit$coef[2] + crfit$res)
#
# It seems like a stronger transformation is needed,
# so I'll try a power of 3.5.
crtime <- time**3.5
crfit <- lm(yield ~ crtime + crtemp)
plot(crtime, crtime*crfit$coef[2] + crfit$res)
#
# It seems like a stronger transformation is needed,
# so I'll try a power of 5.
crtime <- time**5
crfit <- lm(yield ~ crtime + crtemp)
plot(crtime, crtime*crfit$coef[2] + crfit$res)
#
# Now I will look at the c+r plot for temp (with
# the current transformed temp being temp**2),
# based on the fit using time**5.
plot(crtemp, crtemp*crfit$coef[3] + crfit$res)
#
# I'll now try using just temp, instead of temp**2.
crtemp <- temp
crfit <- lm(yield ~ crtime + crtemp)
plot(crtemp, crtemp*crfit$coef[3] + crfit$res)
#
# Perhaps this is better. So I will now look at
# the R**2 value which results from a fit using
# just time**5 and temp (which was just created).
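#
# (Aside: the c+r plots in this section are built by hand; the car
# package --- assuming it is installed --- provides a crPlots
# function that draws component-plus-residual plots for all of the
# predictors of a fit in one call. A sketch:)
if (requireNamespace("car", quietly = TRUE)) {
  car::crPlots(crfit)
}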
# (Note: With both time and temp, simple power
# transformations don't have a great effect since
# the spread of the predictor values is small
# relative to their distances from 0.)
summary(crfit)
#
# Although the c+r plots don't suggest that further
# transformations are called for, the R**2 value
# and the residual plot which results from the fit
# indicate that an additive model based on the
# transformed predictors isn't very good compared
# to the full 3rd-order model (and also compared to
# the full 2nd-order model). This suggests that an
# interaction of time and temp needs to be adjusted
# for.
#
#
#
#
# As an added bonus, I'll show how to use one of the data sets
# that come with R. One such data set is the cherry tree data
# that I used in class previously. It's already in R, and so
# we don't have to read it in. But to make it convenient to use,
# it can be attached. But before attaching trees, I will detach
# chemical.
# (Note: I'm not going to bother to clear my work space now, or
# start a new R session, but you might want to do so if you are
# going to do analysis with a new data set.)
detach(chemical)
attach(trees)
trees
#
# Note that the variables have different names than I gave them
# in my previous analysis.
#
# The data can also be read from a web site where the data set
# is stored in a form with variable names given above the columns.
cherry <- read.table("http://www.statsci.org/data/general/cherry.txt", header = TRUE)
cherry
#
# With both this version of the data, and the version that is
# part of R, the variables already have names. But these
# names can be changed, as shown below.
names(cherry) <- c("dia", "ht", "vol")
cherry
#
# Finally, I'll add that if you have data in a plain text file
# (i.e., a "flat file" in ASCII format, as might be created using
# NotePad), then you can (perhaps) read it into R using read.table.
#
# For example, I have a file named pitching.txt, with the contents
# shown below, in a folder I created named Rdata.
#
#  IP   ER
#  45   17
#  101  29
#  23   14
#  31    9
#
# I can read the data into R, along with the variable names, as
# shown below.
# (Note: Even though Windows uses backslashes, for this use
# forward slashes.)
pitch <- read.table("C:/Documents and Settings/user/My Documents/Rdata/pitching.txt", header=T)
pitch
#
# I have another file, named batting.txt, with the contents
# shown below, in the same folder.
#
# 10,34
# 8,32
# 4,21
# 9,33
#
# Since it doesn't have variable names, and since the numbers
# are separated by commas instead of blanks, I read it in a
# slightly different way, as shown below.
bat <- read.table("C:/Documents and Settings/user/My Documents/Rdata/batting.txt", sep=",", header=F)
bat
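#
# (Aside: since comma-separated files are so common, R also provides
# read.csv, which is just read.table with sep="," and header=TRUE as
# the defaults --- so the batting file could also be read as shown
# below, overriding just the header default.)
bat2 <- read.csv("C:/Documents and Settings/user/My Documents/Rdata/batting.txt", header=FALSE)
bat2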