## Dependencies ---------------------------------------------------------------
## Use library() rather than require(): require() returns FALSE on failure
## instead of stopping, so a missing package would fail much later and less
## clearly.
library(leaps)    ## Branch-and-bound algorithm for best-subset selection
library(MASS)     ## lm.ridge for ridge regression
library(glmnet)   ## Lasso
library(ggplot2)
library(reshape2) ## melt(), used below to reshape data for plotting
                  ## (was missing from the original dependency block)

## There are 330 observations in this dataset.
## NOTE(review): this is the historical ElemStatLearn URL; confirm it still
## resolves, otherwise point at a local copy of LAozone.data.
LAozone <- read.table(
  "http://www-stat-class.stanford.edu/%7Etibs/ElemStatLearn/datasets/LAozone.data",
  sep = ",", header = TRUE
)

################################################################################
### Q5.1
################################################################################

nrTrain <- 220 ## Try changing this to 20. Then you can see a much clearer
               ## effect on the resulting plots below.
nrTest <- 330 - nrTrain
set.seed(222)
train <- sample(1:330, nrTrain)
LAozoneTrain <- LAozone[train, ]
LAozoneTest <- LAozone[!(1:330 %in% train), ]

### Ordinary linear regression of the cube root of ozone on all predictors,
### plus the standard diagnostic plots.
LAozoneLm <- lm(ozone^(1/3) ~ ., data = LAozoneTrain)
summary(LAozoneLm)
par(mfcol = c(2, 2))
plot(LAozoneLm)
graphics.off()

################################################################################
### Q5.2
################################################################################

### This question can be solved using either regsubsets or leaps.
### First using regsubsets: calling summary() gives the information on which
### model is best for each dimension.
Xtrain <- as.matrix(LAozoneTrain[, 2:10])
yTrain <- as.matrix(LAozoneTrain[, 1])
LAozoneRegsubsets <- regsubsets(Xtrain, yTrain^(1/3), nbest = 1)
LAozoneWhich <- summary(LAozoneRegsubsets, matrix.logical = TRUE)$which[, -1]

### Alternatively, the function leaps() computes the "which" matrix directly.
### Bug fix: the response must be the cube root, yTrain^(1/3), to match the
### regsubsets call above (the original passed the untransformed yTrain).
LAozoneSubsets <- leaps(Xtrain, yTrain^(1/3), nbest = 1)
LAozoneWhich <- LAozoneSubsets$which[-9, ]
### The training error (residual sum of squares) can be found by fitting the
### optimal model of each dimension. The intercept-only model (k = 0) has RSS
### equal to the total sum of squares of the cube-root response.
optModel <- vector("list", 8)  ## Preallocate instead of growing a list()
TrainErrors <- numeric(9)
TrainErrors[1] <- sum(scale(yTrain^(1/3), scale = FALSE)^2) ## Intercept-only RSS
for (i in seq_len(8)) {
  optModel[[i]] <- lm(ozone^(1/3) ~ .,
                      data = LAozoneTrain[, c(TRUE, LAozoneWhich[i, ])])
  TrainErrors[i + 1] <- sum(optModel[[i]]$residuals^2)
}

### In the alternative, we can extract this information from
### LAozoneRegsubsets$ress. Below, we will need the fitted optimal models for
### each dimension for predictions on the test data.
### NOTE(review): 'ress' is an internal component of the regsubsets object;
### confirm it is present in the installed version of the leaps package.
### For comparison (before normalization):
LAozoneRegsubsets$ress - TrainErrors

### For comparison with the test error we normalize by the number of samples.
TrainErrors <- TrainErrors / nrTrain

### The test error is obtained by predicting on the test dataset with the
### estimated optimal models from above.
TestErrors <- numeric(9)
TestErrors[1] <- sum((LAozoneTest[, 1]^(1/3) - mean(yTrain^(1/3)))^2)
for (i in seq_len(8)) {
  pred <- predict(optModel[[i]], LAozoneTest)
  TestErrors[i + 1] <- sum((pred - LAozoneTest[, 1]^(1/3))^2)
}
TestErrors <- TestErrors / nrTest

## Train vs. test error as a function of the number of predictors k.
qplot(x = k, y = value, colour = variable,
      data = melt(data.frame(k = 0:8,
                             TestErrors = TestErrors,
                             TrainErrors = TrainErrors),
                  id.vars = "k")) + geom_line()

################################################################################
### Q5.3
################################################################################

### First the lasso.
LAozoneGlmnet <- glmnet(x = as.matrix(LAozoneTrain[, -1]),
                        y = as.matrix(LAozoneTrain[, 1])^(1/3))
predTrainGlmnet <- predict(LAozoneGlmnet, as.matrix(LAozoneTrain[, -1]))
TrainErrorsGlmnet <- colMeans((predTrainGlmnet - LAozoneTrain[, 1]^(1/3))^2)
predTestGlmnet <- predict(LAozoneGlmnet, as.matrix(LAozoneTest[, -1]))
TestErrorsGlmnet <- colMeans((predTestGlmnet - LAozoneTest[, 1]^(1/3))^2)

## Train vs. test error along the lasso regularization path.
qplot(log(lambda), value, colour = variable,
      data = melt(data.frame(lambda = LAozoneGlmnet$lambda,
                             TestErrors = TestErrorsGlmnet,
                             TrainErrors = TrainErrorsGlmnet),
                  id.vars = "lambda")) + geom_line()
### Then ridge regression. Here there is no 'predict' method, so the
### predictions are computed by hand from the coefficient matrix: one column
### of fitted coefficients per value of lambda.
lambdaSeq <- exp(seq(0, log(30), length.out = 100))
LAozoneRidge <- lm.ridge(ozone^(1/3) ~ ., data = LAozoneTrain,
                         lambda = lambdaSeq)
RidgeCoef <- coefficients(LAozoneRidge)

## Manual prediction: [1, X] %*% t(coef) gives an n x length(lambdaSeq) matrix.
ridgePredict <- function(dat) {
  cbind(1, as.matrix(dat[, -1])) %*% t(RidgeCoef)
}

predTrainRidge <- ridgePredict(LAozoneTrain)
TrainErrorsRidge <- colMeans((predTrainRidge - LAozoneTrain[, 1]^(1/3))^2)
predTestRidge <- ridgePredict(LAozoneTest)
TestErrorsRidge <- colMeans((predTestRidge - LAozoneTest[, 1]^(1/3))^2)

## Train vs. test error as a function of the ridge penalty.
qplot(log(lambda), value, colour = variable,
      data = melt(data.frame(lambda = lambdaSeq,
                             TestErrors = TestErrorsRidge,
                             TrainErrors = TrainErrorsRidge),
                  id = "lambda")) + geom_line()

################################################################################
### Q5.4 -- See Alexander's pdf-solution
################################################################################