## Dependencies ---------------------------------------------------------------
## Use library() rather than require(): require() returns FALSE on failure
## instead of stopping, so a missing package would fail much later and less
## clearly.
library(leaps)    ## Branch-and-bound algorithm for best-subset selection
library(MASS)     ## lm.ridge for ridge regression
library(glmnet)   ## Lasso
library(ggplot2)
library(reshape2) ## melt(), used below to reshape data for plotting
                  ## (was missing from the original dependency block)

## There are 330 observations in this dataset.
## NOTE(review): this is the historical ElemStatLearn URL; confirm it still
## resolves, otherwise point at a local copy of LAozone.data.
LAozone <- read.table(
  "http://www-stat-class.stanford.edu/%7Etibs/ElemStatLearn/datasets/LAozone.data",
  sep = ",", header = TRUE
)

################################################################################
### Q5.1
################################################################################

nrTrain <- 220 ## Try changing this to 20. Then you can see a much clearer
               ## effect on the resulting plots below.
nrTest <- 330 - nrTrain
set.seed(222)
train <- sample(1:330, nrTrain)
LAozoneTrain <- LAozone[train, ]
LAozoneTest <- LAozone[!(1:330 %in% train), ]

### Ordinary linear regression of the cube root of ozone on all predictors,
### plus the standard diagnostic plots.
LAozoneLm <- lm(ozone^(1/3) ~ ., data = LAozoneTrain)
summary(LAozoneLm)
par(mfcol = c(2, 2))
plot(LAozoneLm)
graphics.off()

################################################################################
### Q5.2
################################################################################

### This question can be solved using either regsubsets or leaps.
### First using regsubsets: calling summary() gives the information on which
### model is best for each dimension.
Xtrain <- as.matrix(LAozoneTrain[, 2:10])
yTrain <- as.matrix(LAozoneTrain[, 1])
LAozoneRegsubsets <- regsubsets(Xtrain, yTrain^(1/3), nbest = 1)
LAozoneWhich <- summary(LAozoneRegsubsets, matrix.logical = TRUE)$which[, -1]

### Alternatively, the function leaps() computes the "which" matrix directly.
### Bug fix: the response must be the cube root, yTrain^(1/3), to match the
### regsubsets call above (the original passed the untransformed yTrain).
LAozoneSubsets <- leaps(Xtrain, yTrain^(1/3), nbest = 1)
LAozoneWhich <- LAozoneSubsets$which[-9, ]
### The training error (residual sum of squares) can be found by fitting the
### optimal model of each dimension. The intercept-only model (k = 0) has RSS
### equal to the total sum of squares of the cube-root response.
optModel <- vector("list", 8)  ## Preallocate instead of growing a list()
TrainErrors <- numeric(9)
TrainErrors[1] <- sum(scale(yTrain^(1/3), scale = FALSE)^2) ## Intercept-only RSS
for (i in seq_len(8)) {
  optModel[[i]] <- lm(ozone^(1/3) ~ .,
                      data = LAozoneTrain[, c(TRUE, LAozoneWhich[i, ])])
  TrainErrors[i + 1] <- sum(optModel[[i]]$residuals^2)
}

### In the alternative, we can extract this information from
### LAozoneRegsubsets$ress. Below, we will need the fitted optimal models for
### each dimension for predictions on the test data.
### NOTE(review): 'ress' is an internal component of the regsubsets object;
### confirm it is present in the installed version of the leaps package.
### For comparison (before normalization):
LAozoneRegsubsets$ress - TrainErrors

### For comparison with the test error we normalize by the number of samples.
TrainErrors <- TrainErrors / nrTrain

### The test error is obtained by predicting on the test dataset with the
### estimated optimal models from above.
TestErrors <- numeric(9)
TestErrors[1] <- sum((LAozoneTest[, 1]^(1/3) - mean(yTrain^(1/3)))^2)
for (i in seq_len(8)) {
  pred <- predict(optModel[[i]], LAozoneTest)
  TestErrors[i + 1] <- sum((pred - LAozoneTest[, 1]^(1/3))^2)
}
TestErrors <- TestErrors / nrTest

## Train vs. test error as a function of the number of predictors k.
qplot(x = k, y = value, colour = variable,
      data = melt(data.frame(k = 0:8,
                             TestErrors = TestErrors,
                             TrainErrors = TrainErrors),
                  id.vars = "k")) + geom_line()

################################################################################
### Q5.3
################################################################################

### First the lasso.
LAozoneGlmnet <- glmnet(x = as.matrix(LAozoneTrain[, -1]),
                        y = as.matrix(LAozoneTrain[, 1])^(1/3))
predTrainGlmnet <- predict(LAozoneGlmnet, as.matrix(LAozoneTrain[, -1]))
TrainErrorsGlmnet <- colMeans((predTrainGlmnet - LAozoneTrain[, 1]^(1/3))^2)
predTestGlmnet <- predict(LAozoneGlmnet, as.matrix(LAozoneTest[, -1]))
TestErrorsGlmnet <- colMeans((predTestGlmnet - LAozoneTest[, 1]^(1/3))^2)

## Train vs. test error along the lasso regularization path.
qplot(log(lambda), value, colour = variable,
      data = melt(data.frame(lambda = LAozoneGlmnet$lambda,
                             TestErrors = TestErrorsGlmnet,
                             TrainErrors = TrainErrorsGlmnet),
                  id.vars = "lambda")) + geom_line()
### Then ridge regression. Here there is no 'predict' method, so the
### predictions are computed by hand from the coefficient matrix: one column
### of fitted coefficients per value of lambda.
lambdaSeq <- exp(seq(0, log(30), length.out = 100))
LAozoneRidge <- lm.ridge(ozone^(1/3) ~ ., data = LAozoneTrain,
                         lambda = lambdaSeq)
RidgeCoef <- coefficients(LAozoneRidge)

## Manual prediction: [1, X] %*% t(coef) gives an n x length(lambdaSeq) matrix.
ridgePredict <- function(dat) {
  cbind(1, as.matrix(dat[, -1])) %*% t(RidgeCoef)
}

predTrainRidge <- ridgePredict(LAozoneTrain)
TrainErrorsRidge <- colMeans((predTrainRidge - LAozoneTrain[, 1]^(1/3))^2)
predTestRidge <- ridgePredict(LAozoneTest)
TestErrorsRidge <- colMeans((predTestRidge - LAozoneTest[, 1]^(1/3))^2)

## Train vs. test error as a function of the ridge penalty.
qplot(log(lambda), value, colour = variable,
      data = melt(data.frame(lambda = lambdaSeq,
                             TestErrors = TestErrorsRidge,
                             TrainErrors = TrainErrorsRidge),
                  id = "lambda")) + geom_line()

################################################################################
### Q5.4 -- See Alexander's pdf-solution
################################################################################