# demo to show some regression basics with R

# The data set entered below has
# the diameters (inches) at 54 inches above the ground,
# heights (feet),
# and volumes (cubic feet)
# of 31 black cherry trees from the Allegheny National Forest in Pennsylvania.
# The measurements were taken so that a formula could be found which would give
# an estimate of the volume, useful for timber yield, given the diameter and height.
dia <- c(83,86,88,105,107,108,110,110,111,112,113,114,114,117,120,129,129,133,137,138,140,142,145,160,163,173,175,179,180,180,206)
ht <- c(70,65,63,72,81,83,66,75,80,75,79,76,76,69,75,74,85,86,71,64,78,80,74,72,77,81,82,80,80,80,87)
vol <- c(103,103,102,164,188,197,156,182,226,199,242,210,214,213,191,222,338,274,257,249,345,317,363,383,426,554,557,583,515,510,770)
dia <- dia/10
vol <- vol/10
# Later I will show how data can be read in from a file.
meas <- cbind(dia, ht, vol)
meas

# First let's see if just the diameter can be used to predict the volume.
plot(dia, vol, xlab='diameter', ylab='volume')
# It appears (look carefully) that the mean volume is not a linear function of diameter.
# Despite the apparent nonlinear relationship,
# I will use OLS to fit a simple regression model.
diafit <- lm(vol ~ dia)
diafit
# Using the summary function will provide more information.
summary(diafit)
# A plot of the residuals may make it easier to see the nonlinearity.
plot(dia, diafit$res)
abline(h=0, col=8)

# I'll repeat what I did above,
# except now I'll use height as the sole predictor.
plot(ht, vol, xlab='height', ylab='volume')
# It's perhaps harder to tell if mean volume is (at least approximately)
# a linear function of height, because the fit isn't tight.
# I will use OLS to fit a simple regression model.
htfit <- lm(vol ~ ht)
summary(htfit)
plot(ht, htfit$res)
abline(h=0, col=8)
# The residual plot suggests that the variance of vol increases with increasing ht.

# Before doing a multiple regression,
# let's look at the relationship between diameter and height.
plot(ht, dia)
# Although there is a positive correlation,
# the relationship isn't very tight (not close to a one-to-one relationship),
# and so it may be useful to have both diameter and height
# in a prediction formula for volume.

# Now I will fit a multiple regression model (using dia and ht as predictors).
multfit1 <- lm(vol ~ dia + ht)
summary(multfit1)
# It can be noted that both predictors are statistically significant (at level 0.015),
# and the intercept is also highly significant.
# Before going further, we should look at the residuals.
# (I will plot them against the fitted values.)
plot(multfit1$fit, multfit1$res)
abline(h=0, col=8)
# There is nonlinearity
# (but at least the odd point seen in the first residual plot has disappeared).
# There also seems to be heteroscedasticity (a bit hard to see due to the nonlinearity),
# and that needs to be addressed prior to trying to correct the nonlinearity.
# Rather than do a weighted regression, I will try to transform the dependent variable, vol.

# Suppose we view the tree trunk as a cone.
# The formula giving the volume, v, of a cone in
# terms of its base diameter, d, and height, h, is
#   v = (pi/12) h d**2.
# So we have
#   log(v) = log(pi/12) + log(h) + 2 log(d),
# and so it may be that a nice linear fit can be done if we take the natural log
# (or you could use log base 10) of all three variables.
# (Note: The R function for the natural log is log.)
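# (Aside, a minimal sanity check not in the original demo: compare the
# cone-formula volumes with the observed volumes. Note that dia is in inches
# while ht and vol use feet, so dia is divided by 12 to keep the units consistent.)
conevol <- (pi/12) * ht * (dia/12)^2
plot(vol, conevol, xlab='observed volume', ylab='cone-formula volume')
abline(0, 1, col=8)
# Most points should fall below the line, since a trunk holds somewhat more
# wood than a cone with the same base diameter and height.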
ldia <- log(dia)
lht <- log(ht)
lvol <- log(vol)
multfit2 <- lm(lvol ~ ldia + lht)
summary(multfit2)
plot(multfit2$fit, multfit2$res)
abline(h=0, col=8)
# It can be noted that the fitted coefficients for lht and ldia are rather close
# to 1 and 2, but that doesn't mean that the model of a tree trunk as a cone is a
# good one, because we would also expect to get coefficients near 1 and 2 if tree
# trunks are modeled well by cylinders. But the residuals suggest a good fit, and
# it doesn't really matter whether a cone, a cylinder, or some other geometric model is best.

# But suppose that we didn't have a good way to help us guess how to transform predictors
# (since whether one assumes a cone or a cylinder, the h*(d**2) product would hold)
# --- what could we do then?
# Well, we could use trial and error, but if there are a lot of predictors,
# that can be a very tedious procedure. Looking at scatter plots of
# the response variable against each predictor may be useful.
# For example, plotting lvol against dia suggests that we need to "pull in" large values
# of dia, and so one might start by trying log and square root transformations of dia.
plot(dia, lvol)
# But in a multiple regression setting, the scatter plot method may not work so well, because
# one isn't adjusting the dependent variable for the effects of the other predictor variables.
# Component plus residual plots (aka partial residual plots) show the relationship between a predictor
# variable and the response variable, adjusted for the effects of the other predictor variables.
# They tend to work great if only one predictor needs to be transformed.
# If more than one predictor needs to be transformed, then
# it's generally good to start by looking at a c+r plot for each of the predictors
# and initially focus on the one which seems the most nonlinear.
# If you find a transformation for the offending predictor that results in a plot
# which looks better than the plot for some other variable,
# then turn your attention to the new worst offender.

# Let's first look at the c+r plot for dia, taking lvol as the dependent variable.
multfit3 <- lm(lvol ~ dia + ht)
plot(dia, dia*multfit3$coef[2] + multfit3$res)
# We need to "pull in" large dia values.
# But before fiddling with dia, let's look at the c+r plot for ht.
plot(ht, ht*multfit3$coef[3] + multfit3$res)
# Because things are less clear with ht
# (partially because the pattern isn't as tight),
# I'll start by taking the square root of dia.
# (I won't change the name of the object created by the lm function,
# so that I will have less to change when I copy and paste.
# Also, I will let tdia be the transformed dia variable
# until I settle on my final transformations.)
tdia <- sqrt(dia)
multfit3 <- lm(lvol ~ tdia + ht)
plot(tdia, tdia*multfit3$coef[2] + multfit3$res)
# Before switching my attention to ht, I will try a stronger transformation of dia,
# since it appears that large values still need to be pulled in.
tdia <- log(dia)
multfit3 <- lm(lvol ~ tdia + ht)
plot(tdia, tdia*multfit3$coef[2] + multfit3$res)
# Since the log transformation works well with dia,
# I will now take the residuals from the last fit
# and create a new c+r plot for ht
# (since I want a transformation of ht to use with the transformed dia variable).
plot(ht, ht*multfit3$coef[3] + multfit3$res)
# It's hard to tell what, if anything, needs to be done.
# Sometimes if it's hard to tell, one might as well do nothing.
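# (Aside, not part of the original demo: the hand-made c+r plots above can also
# be produced with the crPlots function from the car package, assuming that
# package is installed. One component+residual plot is drawn per predictor.)
library(car)
crPlots(multfit3)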
# But in this case, if one has used the log transformation with the other two variables,
# there is some appeal in using it with the remaining variable.
# It should also be noted that because the spread of the ht values
# is small relative to their distance from 0,
# neither a square root nor a log transformation will appreciably change
# the relative spacings between the predictor values.
# Before fitting with the transformed ht variable, I will look at the
# R**2 value from the last fit so that I can compare it to the new R**2 value.
summary(multfit3)
tht <- log(ht)
multfit3 <- lm(lvol ~ tdia + tht)
plot(tht, tht*multfit3$coef[3] + multfit3$res)
summary(multfit3)
# Upon comparing the two R**2 values, it can be seen
# that the transformation apparently had little effect.
# Since using a log transformation for all of the variables is appealing,
# I'll stop here.
# Normally, upon transforming a second predictor,
# one would go back and check whether the
# initially transformed predictor is still good,
# but since the R**2 barely changed, I won't bother this time.
# Also, one would normally look at the residual plot for the final(?) model,
# but I won't do so now because I examined this fit previously.
# However, I have not yet looked at a normal probability (probit) plot of the residuals,
# so let's do that now.
# (Doing it as I have below, one gets other diagnostic plots as well ---
# four plots will result from the next command, with the normal Q-Q plot
# (the probit-type plot) being the 2nd one.
# Note that you have to press Return in the console (or click, depending
# on the interface) to move from one plot to the next.)
plot(multfit3)
# There is no indication of horrible nonnormality
# (but there is a suggestion of mild negative skewness).

# Now let's suppose that it has been decided to use ldia and lht as predictors,
# and let's see if the Box-Cox transformation procedure identifies
# a log transformation for vol.
multfit4 <- lm(vol ~ ldia + lht)
library(MASS)
boxcox(multfit4)
# The Box-Cox method worked fine when used with decently transformed predictors,
# but how does it do with the untransformed predictors? Let's see.
boxcox(multfit1)
# Although it didn't identify the log transformation as being great,
# it could be that if one started by using a fourth root or cube root
# transformation of vol, and then appropriately transformed the predictors,
# one would then be led to a log transformation of vol. (Try it.)
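# (Aside, not part of the original demo: boxcox returns the grid of lambda values
# and the corresponding profile log-likelihoods, so the approximate MLE of lambda
# can be extracted directly instead of being read off the plot.)
bc4 <- boxcox(multfit4, plotit=FALSE)
bc4$x[which.max(bc4$y)]   # should be near 0, i.e. the log transformation
bc1 <- boxcox(multfit1, plotit=FALSE)
bc1$x[which.max(bc1$y)]   # per the remarks above, expect something nearer 1/4 or 1/3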