################################################################################
### Basis expansions, splines and smoothing in R
### Version 2. Some comments added and ggplot2 is now explicitly loaded.
################################################################################

### Expansions in polynomial bases can be done with the formula
### interface "by hand", remembering to encapsulate all polynomial
### basis terms in an 'I()' or they will be interpreted wrongly. The
### 'poly' function is convenient for producing expansions using
### orthogonal polynomials, see

?poly

### and 'cut'

?cut

### can be used to convert a univariate continuous variable into a
### factor, which effectively results in an expansion in terms of
### indicators of disjoint intervals (a univariate box-type basis).
### The use of 'cut' is historically important for various reasons,
### and you often see in medical journals that variables that are
### believed to have non-linear effects are represented using an
### equivalent of 'cut'. I believe that in such cases other basis
### expansions or smoothing techniques are superior and should be
### considered. One reason is that the use of 'cut' introduces
### somewhat arbitrary divisions of the scale for the regressor that
### are easy to misinterpret.

### Various functions for using regression splines are implemented in
### the splines package (which ships with R).

require(splines)

### For information on the computation with natural cubic splines see

?ns

### and for general B-splines see

?bs

### Interested readers might also find it relevant to study

?splineDesign

### A reanalysis of the South African heart data using regression splines.
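### (A small aside before the data analysis, added here for
### illustration and not part of the reanalysis: the expansions
### discussed above can be compared directly on simulated data. The
### variable names below are made up for the sketch.)

set.seed(1)
x <- runif(100)
y <- sin(2 * pi * x) + rnorm(100, sd = 0.3)
fitRaw  <- lm(y ~ x + I(x^2) + I(x^3))  ## raw powers, each wrapped in I()
fitPoly <- lm(y ~ poly(x, 3))           ## orthogonal polynomial basis
fitCut  <- lm(y ~ cut(x, breaks = 4))   ## piecewise constant fit via 'cut'

### The two cubic fits span the same model space - only the
### parametrization of the basis differs - so their fitted values
### agree, while the 'cut' fit is a step function on four intervals.

all.equal(fitted(fitRaw), fitted(fitPoly))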
SAheart <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data",
                      sep = ",", head = TRUE, row.names = 1)
varNames <- colnames(SAheart[, -c(4, 5, 6, 8, 10)])
form <- paste("chd ~ ",
              paste(paste("ns(", varNames, ", df = 4)", sep = ""),
                    "+", collapse = " "),
              "famhist")
form <- formula(form)

### The code above is used for automatic formula generation by pasting
### together strings in R and turning the result into a formula. Of
### course, in this case you could just write down the formula, but it
### is a useful technique to know, and it generally pays off to write
### automatic procedures for tedious tasks like writing formulas. You
### can easily change the number of basis functions (df) for all
### terms, say, or include or exclude terms by changing the 'varNames'
### vector. It is, however, less obvious how to change one particular
### term. The advantage is mostly seen when the formula contains a lot
### of repetitive elements that you might otherwise produce by hand
### with cut-and-paste.

### The model is then fitted just as an ordinary logistic regression
### model using 'glm' - the difference being that now the linear
### predictor is given in terms of the derived variables, which
### corresponds to an expansion of the non-linear effects in the
### natural spline basis.

SAheartGlm <- glm(form, data = SAheart, family = binomial)
summary(SAheartGlm)

### The summary is not of much value, as the individual parameter
### estimates are difficult or impossible to interpret, and the test
### statistics are tests of basically irrelevant hypotheses. Table 5.1
### in the book, reproduced below, is more informative. Note that the
### model was selected by step-wise backward elimination using AIC,
### and that the resulting model cannot be simplified further. That
### is, AIC is smallest when no terms are dropped, and dropping any
### term increases the AIC.
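### The claim that the model cannot be simplified further can be
### checked with 'step' (a sketch added here, not part of the book's
### analysis); if AIC really is minimal, the fitted model should be
### returned unchanged.

stepModel <- step(SAheartGlm, trace = 0)
formula(stepModel)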
### The formal likelihood ratio tests tell a slightly different story,
### but recall that the formal assumptions for these tests are
### violated by the fact that the model is selected based on the data.

drop1(SAheartGlm, scope = form, test = "Chisq")

### We can also reproduce Figure 5.4 with the ugly colors. I think I
### would choose a black line and a blue confidence band instead.

newData <- lapply(SAheart, function(x) {
  r <- range(as.numeric(x))
  y <- seq(r[1], r[2], length.out = 400)
  if (is.factor(x))
    y <- levels(x)[as.integer(y)]
  return(y)
})
newData <- as.data.frame(newData)
termPred <- predict(SAheartGlm, newData, type = "terms", se.fit = TRUE)

### The thing to notice is that we produce predictions of the 'terms'
### from the formula above, which are exactly the estimated non-linear
### functions.

famhistInd <- which(colnames(termPred$fit) == "famhist")
require(ggplot2)
require(reshape2)  ## 'melt' lives in reshape2; current ggplot2 does not attach it
plotData <- cbind(melt(newData[, varNames]),
                  se = melt(termPred$se.fit[, -famhistInd])[, "value"],
                  y = melt(termPred$fit[, -famhistInd])[, "value"])
qplot(value, y, data = plotData, geom = "line", colour = I("green"),
      size = I(1), xlab = "", ylab = "") +
  geom_ribbon(aes(ymin = y - 2 * se, ymax = y + 2 * se),
              alpha = I(0.5), fill = I("yellow")) +
  geom_rug(data = melt(SAheart[, varNames]),
           aes(x = value, y = NULL),
           colour = I("red"), alpha = I(0.3)) +
  facet_wrap(~ variable, nrow = 3, scale = "free")

### Then we turn to smoothing splines. The first function considered
### is the 'smooth.spline' function from the stats package. Below we
### fit a simple spline curve to the bone mineral data, see Figure 5.6
### in the book. First we do it "manually" using the 'smooth.spline'
### function and then we do it using ggplot2.
bone <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data",
                   header = TRUE)
Males <- bone$gender == "male"
Females <- bone$gender == "female"
boneMaleSmooth <- smooth.spline(bone[Males, "age"], bone[Males, "spnbmd"], df = 12)
boneFemaleSmooth <- smooth.spline(bone[Females, "age"], bone[Females, "spnbmd"], df = 12)
plot(boneMaleSmooth, ylim = c(-0.05, 0.20), col = "blue", type = "l",
     xlab = "Age", ylab = "spnbmd")
points(bone[Males, c(2, 4)], col = "blue", pch = 20)
lines(boneFemaleSmooth, col = "red")
points(bone[Females, c(2, 4)], col = "red", pch = 20)

### And then using ggplot2, which does almost all the computations
### automagically ...

qplot(age, spnbmd, data = bone, colour = gender, geom = c("point", "smooth"))

### The only 'but' is that the smoothed line is not a spline, or it is
### at least not clear what it is and how you control the degrees of
### freedom, say. There is a way to control that precisely by
### specifying the smoother. This is done below using the 'gam'
### package for generalized additive models, because ggplot2 does not
### have an interface for using 'smooth.spline' directly. There is no
### support for standard errors when using 'gam'.

require(gam)
qplot(age, spnbmd, data = bone, colour = gender, geom = "point") +
  stat_smooth(method = gam, formula = y ~ s(x, df = 12), se = FALSE)

### We will return to 'gam' later in the course, but it is fascinating
### that you can produce a figure like Figure 5.6 in just two lines of
### code.

### An interesting comparison is with the 'loess' smoother, which is
### another example of a linear smoother, but instead of being based
### on a basis function expansion, it is based on a local least
### squares fit of a small degree polynomial (degree 2 by default in
### R). The locality is determined by a kernel, and the method is
### treated in Section 6.1 in the book, which is not covered in the
### course.
### The 'loess' method does allow for the computation of standard
### errors.

qplot(age, spnbmd, data = bone, colour = gender, geom = "point") +
  stat_smooth(method = loess, enp.target = 12)  ## effective df = 12
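### A final remark (added as a sketch, not part of the original
### script): 'smooth.spline' can also choose the amount of smoothing
### automatically. With the default cv = FALSE the smoothing parameter
### is selected by generalized cross-validation, so fixing df = 12 as
### above is a choice, not a necessity.

boneMaleGCV <- smooth.spline(bone[Males, "age"], bone[Males, "spnbmd"])
boneMaleGCV$df  ## effective degrees of freedom chosen by GCV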