# First I'll bring in the necessary libraries. # (Note: It may be that not all of these are # needed.) library(MASS) library(Hmisc) library(Design) library(stats) # # Let's use the data set hills, which is available # when the MASS library is loaded. data(hills) # # Let's find out a little about the data. ?hills # # Let's look at the data. hills # # Let's change the questionable observation. hills[18,3] <- 18.65 # # Lat's attach the data frame and check to see # if the change was made correctly. (When I # attached the data frame before making the # change, when I typed hills and looked at the # data, the value of interest was 18.65, but # when I typed time, it was 78.65.) attach(hills) hills time # The desired change seems to have been accepted. # # Let's fit a basic multiple regression model # in order to better see what we're dealing # with. hillfit1 <- lm(time ~ dist + climb) summary(hillfit1) plot(hillfit1) # # Since one can see both heteroscedasticity and # nonlinearity, it makes sense to address the # heteroscedasticity first. I'll choose to trans- # form the response variable and guess that a # square root transformation might be a good # start (having seen some results from a log # transformation, and noting that it seemed like # too strong of a transformation). # # It seems like a nonlinear relationship should # not be unexpected --- even for good runners, # the average time per mile tends to decrease as # the length of the run increases. hillfit2 <- lm(sqrt(time) ~ dist + climb) summary(hillfit2) plot(hillfit2) # # The variance doesn't seem like a problem now. # (I can better assess this once I get rid of some # of the appreciable nonlinearity.) # # Now, instead of seaching for good transformations # the explanatory variables, or trying a polynomial # model, I'll start by using natural cubic splines # to represent both dist and climb. Because there # is not a lot of data, I'll only use 3 knots for # each explanatory variable. hillfit3 <- lm(sqrt(time) ~ rcs(dist, 3) + rcs(climb, 3)) summary(hillfit3) plot(hillfit3) # # There is still a bit of nonlinearity. I'll increase # to 4 knots for dist, since the results of the last fit # indicate that it is the more important of the two # explanatory variables. If that gives me a good fit, # then due to worries about overfitting I'll see if it # looks like one of the explanatory variables can go into # the model linearly, or if some relatively simple trans- # formation might suffice. hillfit4 <- lm(sqrt(time) ~ rcs(dist, 4) + rcs(climb, 3)) summary(hillfit4) plot(hillfit4) # # The extra knot didn't seem to help the nonlinearity a lot, # so before going farther, I'll examine the 3 knot fit some # more, using some of Harrell's functions. # # I'll redo the fit using Harrell's ols function instead of lm. # This will allow me to look at different types of plots. # (The type of plot of chief interest right now is one which # shows the effect of each explanatory variable, as fitted in # model currently (e.g., with natural splines having 3 knots), # on the response variable. I can get such plots when I plot # an object created using Harrell's ols function. But first I # have to use his datadist function to in a sense set things up # for plot to work as desired.) ddist <- datadist(dist, climb) options(datadist='ddist') # Having first created an object using the dataditst function, # I now use the options function as above to specify that the # created object, ddist, be used whenever a function needs to # have something specified for datadist. hillfit5 <- ols(sqrt(time) ~ rcs(dist, 3) + rcs(climb, 3), x=TRUE) hillfit5 par(mfrow=c(2,1)) plot(hillfit5) # # Based on the plots created, I will now put climb into the # model linearly, and once again try using 4 knots with dist. # (Note: A straight line can be drawn between the bands of the # plot for climb, but not for the plot for dist.) hillfit6 <- ols(sqrt(time) ~ rcs(dist, 4) + climb, x=TRUE) hillfit6 plot(hillfit6) # # Now climb produces a very small p-value. The fact that # it's plot looks linear doesn't mean anything --- with it # only put into the model linearly, it cannot not give a # curved plot. The plot for dist doesn't look very curvy, # and so it doesn't seem to need the freedom given to it by # a 4 knot spline. The large p-values for dist's spline # terms don't contradict this conclusion. So let's look at # a residual plot once more. plot(hillfit6$fitted.values, hillfit6$residuals) abline(h=0, col=8) # # It's not that horrible, yet not great either. # Before trying anything else, I'll add a simple # product interaction term to see if it's significant. # I'll also only use 3 knots with dist. hillfit7 <- ols(sqrt(time) ~ rcs(dist, 3) + climb + dist %ia% climb, x=TRUE) # Note the syntax used to include an interaction term when # using Harrell's ols function. hillfit7 plot(hillfit7$fitted.values, hillfit7$residuals) abline(h=0, col=8) # # It doesn't look like an interaction needs to be included. # So perhaps a good model would be to just put climb in # linearly, and use a 3 knot natural spline for dist. hillfit8 <- ols(sqrt(time) ~ rcs(dist, 3) + climb, x=TRUE) hillfit8 plot(hillfit8$fitted.values, hillfit8$residuals) abline(h=0, col=8) # # With this model there are four coefficients to estimate. # An alternative model with four coefficients would be to # put climb in linearly, and use a linear and 2nd-order # term for dist. SO I'll try this, and compare the results. hillfit9 <- ols(sqrt(time) ~ pol(dist, 2) + climb, x=TRUE) # Note the syntax used to include 1st- and 2nd-order terms # for dist when using Harrell's ols function. hillfit9 plot(hillfit9$fitted.values, hillfit9$residuals) abline(h=0, col=8) # # I'll stop at this point. It could be that a different # transformation of the response would work better, or # it could be that other variables not given with the data # are needed to better explain the times. The main point # of this example was to show how natural cubic splines can # be put into a multiple regression model, and investigated, # using some of Harrell's functions. (Note: One needs to # load two of Harrell's libraries; Hmisc and Design.) One # could search for transformations using c+r plots, but that # can be a tedious procedure. Splines give us somewhat of a # quick fix! If you have enough data to avoid gross over- # fitting by doing so, it's sometimes very enlightening to # represent all of the explanatory variables in the model # using splines, and then look at plots to see which ones # have strong nonlinear relationships with the response # after adjusting for the effects of other variables (which # is what Harrell's plots do). After a first quick look, # you may decide to eliminate some explanatory variables from # immediate consideration, and put others into the model only # linearly. At this point the model can be fine-tuned by # seeing if there is strong evidence of the need for one or # more interaction terms (perhaps starting the investigation # by considering just simple product interaction terms using # the untransformed explanatory variables since several products # involving one or more spline terms can add a lot more paramters # which need to be estimated). Finally, once you think that # you've got a decent model, you might see if anything shows up # as being significant if deleted variables are added back into # the model (testing them one at a time). #