# First I'll bring in the necessary libraries.
# (Note: It may be that not all of these are
# needed.)
library(MASS)
library(Hmisc)
library(Design)
# (The Design package has since been replaced by the
# rms package, which provides the same lsp and rcs
# functions used below.)
library(stats)
#
# Now I will read in the data.
xyvalues <- data.frame(read.table("http://mason.gmu.edu/~csutton/spline.DAT", header=FALSE))
names(xyvalues) <- c("x","y")
attach(xyvalues)
plot(x, y, xlab="x", ylab="y", main="knots at 30, 60, & 90 (except for smoothing spline)")
#
# Now I will create the vectors needed to represent
# the basis functions other than h0(x) = 1 and
# h1(x) = x for a linear spline.
h2 <- ifelse(x - 30 > 0, x - 30, 0)
h3 <- ifelse(x - 60 > 0, x - 60, 0)
h4 <- ifelse(x - 90 > 0, x - 90, 0)
#
# Now I will fit the linear spline model.
linspl5 <- lm(y ~ x + h2 + h3 + h4)
summary(linspl5)
#
# Here is an alternate way to fit a linear spline
# with knots at 30, 60, and 90.
altlinspl <- lm(y ~ lsp(x, c(30,60,90)))
summary(altlinspl)
# (Note: This way of doing it produces exactly the
# same fit as the first way.)
#
# Now I will create the vectors needed to represent
# the basis functions other than h0(x) = 1,
# h1(x) = x, h2(x) = x^2, and h3(x) = x^3 for a
# cubic spline.
hc4 <- h2^3
hc5 <- h3^3
hc6 <- h4^3
#
# Now I will fit the cubic spline model.
cubspl7 <- lm(y ~ x + I(x^2) + I(x^3) + hc4 + hc5 + hc6)
summary(cubspl7)
#
# I will now fit a cubic spline having only 5 df
# (one knot, at 60).
cubspl5 <- lm(y ~ x + I(x^2) + I(x^3) + hc5)
summary(cubspl5)
#
# I will use Harrell's rcs function to fit a
# restricted (aka natural) cubic spline, having
# knots at 30, 60, and 90.
natspl3 <- lm(y ~ rcs(x, c(30,60,90)))
summary(natspl3)
# The residuals for this fit look rather bad
# (plot not shown, but you can see the poor
# fit by looking at the first plot shown below).
#
# It's not really fair to compare the natural cubic
# spline with 3 knots to the cubic spline with 3
# knots, since the cubic spline uses 4 more
# parameters than the natural spline does. A fairer
# comparison would use a natural cubic spline with 7
# knots, but since 7 may be more than is really
# needed, I'll just increase the number of knots to
# 5, which gives the natural cubic spline the same
# number of parameters (5 df) as the linear spline
# fit above.
natspl5 <- lm(y ~ rcs(x, c(10,30,60,90,110)))
summary(natspl5)
#
# I will fit a smoothing spline having 5 df (same
# as for the linear spline and the second natural
# cubic spline).
smospl5 <- smooth.spline(x, y, df=5, all.knots=TRUE)
#
# Now I will use cross-validation to determine the
# proper amount of smoothing.
smosplcv <- smooth.spline(x, y, cv=TRUE, all.knots=TRUE)
smosplcv$df
# The value above is the equivalent df chosen by
# cross-validation.
#
# Finally, I will fit a global 4th-degree polynomial
# (not a spline), which has 5 df.
poly5 <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))
summary(poly5)
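#
# (Aside: the same kinds of fits can be obtained without
# Hmisc or Design by using the bs and ns functions from
# the splines package that ships with R. The calls below
# are just a sketch of the equivalents, not part of the
# analysis above. Note that ns puts its boundary knots
# at the range of x by default, whereas rcs treats the
# outermost of its specified knots as the boundary
# knots, so the ns fit need not match the rcs fit
# exactly.)
library(splines)
# Cubic spline with knots at 30, 60, and 90; this basis
# spans the same space as the truncated-power basis used
# for cubspl7, so it gives the same fitted values.
bscub <- lm(y ~ bs(x, knots=c(30,60,90), degree=3))
# Linear spline with the same knots (same fitted values
# as linspl5).
bslin <- lm(y ~ bs(x, knots=c(30,60,90), degree=1))
# Natural cubic spline with interior knots at 30, 60,
# and 90 (boundary knots at the range of x).
nscub <- lm(y ~ ns(x, knots=c(30,60,90)))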
#
# Now I will create the new x values for which
# I want predictions (so that I can get a nice
# plot of the estimated mean function). Then I
# will create the data frame that will be used
# to obtain the estimates of the mean.
xforplot <- 1:119
hh2 <- ifelse(xforplot > 30, xforplot - 30, 0)
hh3 <- ifelse(xforplot > 60, xforplot - 60, 0)
hh4 <- ifelse(xforplot > 90, xforplot - 90, 0)
hhc4 <- hh2^3
hhc5 <- hh3^3
hhc6 <- hh4^3
dataforplot <- data.frame(cbind(xforplot,hh2,hh3,hh4,hhc4,hhc5,hhc6))
names(dataforplot) <- c("x","h2","h3","h4","hc4","hc5","hc6")
detach(xyvalues)
attach(dataforplot)
est.mean.1 <- predict(linspl5, newdata=dataforplot)
points(xforplot, est.mean.1, type="l", col=2)
est.mean.2 <- predict(cubspl7, newdata=dataforplot)
points(xforplot, est.mean.2, type="l", col=3)
est.mean.4 <- predict(natspl3, newdata=dataforplot)
points(xforplot, est.mean.4, type="l", col=4)
est.mean.7 <- predict(smosplcv, x=xforplot)
points(xforplot, est.mean.7$y, type="l", col=5)
legend(5, 2.5, legend=c("linear", "cubic", "natural", "smoothing"), fill=c(2,3,4,5))
#
# Now I will plot just the fits of models having 5 df.
detach(dataforplot)
attach(xyvalues)
plot(x, y, xlab="x", ylab="y", main="5 df")
detach(xyvalues)
attach(dataforplot)
est.mean.8 <- predict(poly5, newdata=dataforplot)
points(xforplot, est.mean.8, type="l", col=8)
est.mean.1 <- predict(linspl5, newdata=dataforplot)
points(xforplot, est.mean.1, type="l", col=2)
est.mean.3 <- predict(cubspl5, newdata=dataforplot)
points(xforplot, est.mean.3, type="l", col=3)
est.mean.5 <- predict(natspl5, newdata=dataforplot)
points(xforplot, est.mean.5, type="l", col=4)
est.mean.6 <- predict(smospl5, x=xforplot)
points(xforplot, est.mean.6$y, type="l", col=5)
legend(5, 2.5, legend=c("4th deg poly","linear spl","cubic spl","natural spl","smoothing spl"), fill=c(8,2,3,4,5))
#
# I didn't bother to generate a large new set of data,
# using the same method that produced the sample of
# size 60 (used to obtain the various fits), in order
# to see which method yields the smallest
# generalization error. With such a study the winner
# depends heavily on how the training data were
# generated, so just one example wouldn't mean much;
# one would have to do a larger study to reach
# meaningful conclusions.
#
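#
# (If one did want to compare generalization error, the
# computation itself is simple. The sketch below assumes
# a hypothetical new sample, xnew and ynew, drawn by
# whatever mechanism produced the original data; no such
# sample is actually created here.)
test.mse <- function(fit, xnew, ynew) {
  # Rebuild the truncated-power basis columns that the
  # lm fits above expect in newdata. (xnew and ynew are
  # hypothetical; they are not defined in this script.)
  newdat <- data.frame(x = xnew,
                       h2 = pmax(xnew - 30, 0),
                       h3 = pmax(xnew - 60, 0),
                       h4 = pmax(xnew - 90, 0))
  newdat$hc4 <- newdat$h2^3
  newdat$hc5 <- newdat$h3^3
  newdat$hc6 <- newdat$h4^3
  # Mean squared prediction error on the new sample.
  mean((ynew - predict(fit, newdata=newdat))^2)
}
# E.g., one would compare test.mse(linspl5, xnew, ynew)
# with test.mse(natspl5, xnew, ynew); for the smoothing
# splines one would instead use
# mean((ynew - predict(smospl5, x=xnew)$y)^2).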