# First I'll bring in the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(MASS)
library(Hmisc)
library(Design)
# (The Design package has since been replaced by the
# rms package, which provides the same lsp and rcs
# functions used below.)
library(stats)
#
# Now I will read in the data.
xyvalues <- data.frame(read.table("http://mason.gmu.edu/~csutton/spline.DAT", header=FALSE))
names(xyvalues) <- c("x","y")
attach(xyvalues)
plot(x, y, xlab="x", ylab="y", main="knots at 30, 60, & 90 (except for smoothing spline)")
#
# Now I will create the vectors needed to represent
# the basis functions other than h0(x) = 1 and
# h1(x) = x for a linear spline.
h2 <- ifelse(x - 30 > 0, x - 30, 0)
h3 <- ifelse(x - 60 > 0, x - 60, 0)
h4 <- ifelse(x - 90 > 0, x - 90, 0)
#
# Now I will fit the linear spline model.
linspl5 <- lm(y ~ x + h2 + h3 + h4)
summary(linspl5)
#
# Here is an alternate way to fit a linear spline
# with knots at 30, 60, and 90.
altlinspl <- lm(y ~ lsp(x, c(30,60,90)))
summary(altlinspl)
# (Note: This way of doing it produces exactly the
# same fit as the first way.)
#
# Now I will create the vectors needed to represent
# the basis functions other than h0(x) = 1,
# h1(x) = x, h2(x) = x^2, and h3(x) = x^3 for a
# cubic spline.
hc4 <- h2^3
hc5 <- h3^3
hc6 <- h4^3
#
# Now I will fit the cubic spline model.
cubspl7 <- lm(y ~ x + I(x^2) + I(x^3) + hc4 + hc5 + hc6)
summary(cubspl7)
#
# I will now fit a cubic spline having only 5 df
# (one knot, at 60).
cubspl5 <- lm(y ~ x + I(x^2) + I(x^3) + hc5)
summary(cubspl5)
#
# I will use Harrell's rcs function to fit a
# restricted (aka natural) cubic spline, having
# knots at 30, 60, and 90.
natspl3 <- lm(y ~ rcs(x, c(30,60,90)))
summary(natspl3)
# The residuals for this fit look rather bad
# (plot not shown, but you can see the poor
# fit by looking at the first plot shown below).
#
# It's not really fair to compare the natural cubic
# spline with 3 knots to the cubic spline with 3
# knots, since the cubic spline uses 4 more
# parameters than the natural spline does. A fairer
# comparison would use a natural cubic spline with 7
# knots, but since 7 may be more than is really
# needed, I'll just increase the number of knots to
# 5, which gives the natural cubic spline the same
# number of parameters (5 df) as the linear spline
# fit above.
natspl5 <- lm(y ~ rcs(x, c(10,30,60,90,110)))
summary(natspl5)
#
# I will fit a smoothing spline having 5 df (same
# as for the linear spline and the second natural
# cubic spline).
smospl5 <- smooth.spline(x, y, df=5, all.knots=TRUE)
#
# Now I will use cross-validation to determine the
# proper amount of smoothing.
smosplcv <- smooth.spline(x, y, cv=TRUE, all.knots=TRUE)
smosplcv$df
# The value above is the equivalent df chosen by
# cross-validation.
#
# Finally, I will fit a global 4th-degree polynomial
# (not a spline), which has 5 df.
poly5 <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))
summary(poly5)
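#
# (Aside: the same kinds of fits can be obtained without
# Hmisc or Design by using the bs and ns functions from
# the splines package that ships with R. The calls below
# are just a sketch of the equivalents, not part of the
# analysis above. Note that ns puts its boundary knots
# at the range of x by default, whereas rcs treats the
# outermost of its specified knots as the boundary
# knots, so the ns fit need not match the rcs fit
# exactly.)
library(splines)
# Cubic spline with knots at 30, 60, and 90; this basis
# spans the same space as the truncated-power basis used
# for cubspl7, so it gives the same fitted values.
bscub <- lm(y ~ bs(x, knots=c(30,60,90), degree=3))
# Linear spline with the same knots (same fitted values
# as linspl5).
bslin <- lm(y ~ bs(x, knots=c(30,60,90), degree=1))
# Natural cubic spline with interior knots at 30, 60,
# and 90 (boundary knots at the range of x).
nscub <- lm(y ~ ns(x, knots=c(30,60,90)))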
#
# Now I will create the new x values for which
# I want predictions (so that I can get a nice
# plot of the estimated mean function). Then I
# will create the data frame that will be used
# to obtain the estimates of the mean.
xforplot <- 1:119
hh2 <- ifelse(xforplot > 30, xforplot - 30, 0)
hh3 <- ifelse(xforplot > 60, xforplot - 60, 0)
hh4 <- ifelse(xforplot > 90, xforplot - 90, 0)
hhc4 <- hh2^3
hhc5 <- hh3^3
hhc6 <- hh4^3
dataforplot <- data.frame(cbind(xforplot,hh2,hh3,hh4,hhc4,hhc5,hhc6))
names(dataforplot) <- c("x","h2","h3","h4","hc4","hc5","hc6")
detach(xyvalues)
attach(dataforplot)
est.mean.1 <- predict(linspl5, newdata=dataforplot)
points(xforplot, est.mean.1, type="l", col=2)
est.mean.2 <- predict(cubspl7, newdata=dataforplot)
points(xforplot, est.mean.2, type="l", col=3)
est.mean.4 <- predict(natspl3, newdata=dataforplot)
points(xforplot, est.mean.4, type="l", col=4)
est.mean.7 <- predict(smosplcv, x=xforplot)
points(xforplot, est.mean.7$y, type="l", col=5)
legend(5, 2.5, legend=c("linear", "cubic", "natural", "smoothing"), fill=c(2,3,4,5))
#
# Now I will plot just the fits of models having 5 df.
detach(dataforplot)
attach(xyvalues)
plot(x, y, xlab="x", ylab="y", main="5 df")
detach(xyvalues)
attach(dataforplot)
est.mean.8 <- predict(poly5, newdata=dataforplot)
points(xforplot, est.mean.8, type="l", col=8)
est.mean.1 <- predict(linspl5, newdata=dataforplot)
points(xforplot, est.mean.1, type="l", col=2)
est.mean.3 <- predict(cubspl5, newdata=dataforplot)
points(xforplot, est.mean.3, type="l", col=3)
est.mean.5 <- predict(natspl5, newdata=dataforplot)
points(xforplot, est.mean.5, type="l", col=4)
est.mean.6 <- predict(smospl5, x=xforplot)
points(xforplot, est.mean.6$y, type="l", col=5)
legend(5, 2.5, legend=c("4th deg poly","linear spl","cubic spl","natural spl","smoothing spl"), fill=c(8,2,3,4,5))
#
# I didn't bother to generate a large new set of data,
# using the same method that produced the sample of
# size 60 (used to obtain the various fits), in order
# to see which method yields the smallest
# generalization error. With such a study the winner
# depends heavily on how the training data were
# generated, so just one example wouldn't mean much;
# one would have to do a larger study to reach
# meaningful conclusions.
#
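#
# (If one did want to compare generalization error, the
# computation itself is simple. The sketch below assumes
# a hypothetical new sample, xnew and ynew, drawn by
# whatever mechanism produced the original data; no such
# sample is actually created here.)
test.mse <- function(fit, xnew, ynew) {
  # Rebuild the truncated-power basis columns that the
  # lm fits above expect in newdata. (xnew and ynew are
  # hypothetical; they are not defined in this script.)
  newdat <- data.frame(x = xnew,
                       h2 = pmax(xnew - 30, 0),
                       h3 = pmax(xnew - 60, 0),
                       h4 = pmax(xnew - 90, 0))
  newdat$hc4 <- newdat$h2^3
  newdat$hc5 <- newdat$h3^3
  newdat$hc6 <- newdat$h4^3
  # Mean squared prediction error on the new sample.
  mean((ynew - predict(fit, newdata=newdat))^2)
}
# E.g., one would compare test.mse(linspl5, xnew, ynew)
# with test.mse(natspl5, xnew, ynew); for the smoothing
# splines one would instead use
# mean((ynew - predict(smospl5, x=xnew)$y)^2).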