require(leaps)    # Implements a branch-and-bound algorithm for finding the best subset
require(MASS)     # For ridge regression (lm.ridge)
require(lars)
require(ggplot2)
require(reshape2) # For melt, used below to reshape the error curves for plotting

## There are 330 observations in this dataset
LAozone <- read.table("http://www-stat-class.stanford.edu/%7Etibs/ElemStatLearn/datasets/LAozone.data",
                      sep=",", header=TRUE)

### Q4.1
nrTrain <- 220 ## Try changing this to 20. Then you can see a much clearer effect in the resulting plots below
nrTest <- 330 - nrTrain
train <- sample(1:330, nrTrain)
LAozoneTrain <- LAozone[train,]
LAozoneTest <- LAozone[!(1:330 %in% train),]

### Ordinary linear regression of the cube root of ozone, and various standard diagnostic plots
LAozoneLm <- lm(ozone^(1/3) ~ ., data=LAozoneTrain)
summary(LAozoneLm)
plot(LAozoneLm)

### Q4.2. This question can be solved using either regsubsets or leaps.
### First, using regsubsets, we call the summary function to get the information
### on which model is best for each dimension.
LAozoneRegsubsets <- regsubsets(as.matrix(LAozoneTrain[,2:10]), as.matrix(LAozoneTrain[,1])^(1/3), nbest=1)
LAozoneWhich <- summary(LAozoneRegsubsets, matrix.logical=TRUE)$which[,-1]

### Alternatively, we can use the function leaps, which returns the "which" matrix directly
LAozoneSubsets <- leaps(as.matrix(LAozoneTrain[,2:10]), as.matrix(LAozoneTrain[,1])^(1/3), nbest=1)
LAozoneWhich <- LAozoneSubsets$which[-9,] # Drop the full 9-variable model to match regsubsets (default nvmax=8)

### The training error (residual sum of squares) can be found by refitting
### the optimal models, e.g. as
optModel <- list()
TrainErrors <- sum(scale(LAozoneTrain[,1]^(1/3), scale=FALSE)^2) # The RSS of the intercept-only model
for(i in 1:8){
  optModel[[i]] <- lm(ozone^(1/3) ~ ., data=LAozoneTrain[,c(TRUE, LAozoneWhich[i,])])
  TrainErrors[i+1] <- sum(optModel[[i]]$residuals^2)
}

### Alternatively, we can extract this information from LAozoneRegsubsets$ress
### For comparison
LAozoneRegsubsets$ress - TrainErrors

### For comparison with the test error we normalize by the number of samples
TrainErrors <- TrainErrors/nrTrain

### The test error can be found by predictions on the test dataset using the estimated
### optimal models from above.
TestErrors <- sum((LAozoneTest[,1]^(1/3) - mean(LAozoneTrain[,1]^(1/3)))^2)
for(i in 1:8){
  pred <- predict(optModel[[i]], LAozoneTest)
  TestErrors[i+1] <- sum((pred - (LAozoneTest[,1])^(1/3))^2)
}
TestErrors <- TestErrors/nrTest
qplot(x=k, y=value, colour=variable,
      data=melt(data.frame(k=0:8, TestErrors=TestErrors, TrainErrors=TrainErrors), id="k"))
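### An optional aside (not part of the original question): summary.regsubsets also
### returns in-sample selection criteria such as Cp and BIC, which can be compared
### with the test-error curve above. A minimal sketch, reusing the fit from Q4.2:
LAozoneSummary <- summary(LAozoneRegsubsets)
qplot(x=1:8, y=LAozoneSummary$bic, xlab="subset size k", ylab="BIC")
### How closely the BIC minimum agrees with the test-error minimum will depend
### on the random train/test split.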
### Q4.3.
### First LARS
LAozoneLars <- lars(as.matrix(LAozoneTrain[,-1]), as.matrix(LAozoneTrain[,1])^(1/3))
plot(LAozoneLars)
### s indexes positions along the LARS path (predict.lars defaults to mode="step",
### interpolating linearly between steps for non-integer s)
s <- seq(1, 9, by=0.01)
predTrainLars <- predict(LAozoneLars, as.matrix(LAozoneTrain[,-1]), s=s)
residualsTrainLars <- as.data.frame(apply(predTrainLars$fit, 2, "-", y=(LAozoneTrain[,1,drop=FALSE])^(1/3)))
TrainErrorsLars <- apply(residualsTrainLars^2, 2, mean)
predTestLars <- predict(LAozoneLars, as.matrix(LAozoneTest[,-1]), s=s)
residualsTestLars <- as.data.frame(apply(predTestLars$fit, 2, "-", y=(LAozoneTest[,1,drop=FALSE])^(1/3)))
TestErrorsLars <- apply(residualsTestLars^2, 2, mean)
qplot(x=s, y=value, colour=variable,
      data=melt(data.frame(s=s, TestErrors=TestErrorsLars, TrainErrors=TrainErrorsLars), id="s"))

### Then ridge regression
lambda <- seq(0, nrTrain*3, by=0.1)
LAozoneRidge <- lm.ridge(ozone^(1/3) ~ ., data=LAozoneTrain, lambda=lambda)
RidgeCoef <- LAozoneRidge$coef/LAozoneRidge$scales # Back-transform the coefficients to the original predictor scale
RidgeIntercept <- LAozoneRidge$ym - apply(LAozoneRidge$xm*RidgeCoef, 2, sum)
predTrainRidge <- t(t(as.matrix(LAozoneTrain[,-1]) %*% RidgeCoef) + RidgeIntercept)
residualsTrainRidge <- as.data.frame(apply(predTrainRidge, 2, "-", y=(LAozoneTrain[,1,drop=FALSE])^(1/3)))
TrainErrorsRidge <- apply(residualsTrainRidge^2, 2, mean)
predTestRidge <- t(t(as.matrix(LAozoneTest[,-1]) %*% RidgeCoef) + RidgeIntercept)
residualsTestRidge <- as.data.frame(apply(predTestRidge, 2, "-", y=(LAozoneTest[,1,drop=FALSE])^(1/3)))
TestErrorsRidge <- apply(residualsTestRidge^2, 2, mean)
qplot(x=L2, y=value, colour=variable,
      data=melt(data.frame(L2=apply(LAozoneRidge$coef^2, 2, sum),
                           TestErrors=TestErrorsRidge, TrainErrors=TrainErrorsRidge), id="L2"))
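### A possible follow-up (not required by the exercise): lm.ridge also computes
### generalized cross-validation scores for each lambda, so the penalty can be
### chosen in a data-driven way instead of being read off the curves above.
### A minimal sketch, reusing the LAozoneRidge fit:
lambdaGCV <- lambda[which.min(LAozoneRidge$GCV)] # lambda minimizing the GCV criterion
lambdaGCV
select(LAozoneRidge) # MASS::select.ridgelm prints the HKB, L-W and GCV-based choices of lambda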