require(leaps)    # Implements a branch-and-bound algorithm for finding the best subset
require(MASS)     # For ridge regression (lm.ridge)
require(lars)
require(ggplot2)
require(reshape2) # For melt, used below to reshape the error curves for plotting

## There are 330 observations in this dataset
LAozone <- read.table("http://www-stat-class.stanford.edu/%7Etibs/ElemStatLearn/datasets/LAozone.data",
                      sep=",", header=TRUE)

### Q4.1
nrTrain <- 220 ## Try changing this to 20. Then you can see a much clearer effect in the resulting plots below
nrTest <- 330 - nrTrain
train <- sample(1:330, nrTrain)
LAozoneTrain <- LAozone[train,]
LAozoneTest <- LAozone[!(1:330 %in% train),]

### Ordinary linear regression of the cube root of ozone, and various standard diagnostic plots
LAozoneLm <- lm(ozone^(1/3) ~ ., data=LAozoneTrain)
summary(LAozoneLm)
plot(LAozoneLm)

### Q4.2. This question can be solved using either regsubsets or leaps.
### First, using regsubsets, we call the summary function to get the information
### on which model is best for each dimension.
LAozoneRegsubsets <- regsubsets(as.matrix(LAozoneTrain[,2:10]), as.matrix(LAozoneTrain[,1])^(1/3), nbest=1)
LAozoneWhich <- summary(LAozoneRegsubsets, matrix.logical=TRUE)$which[,-1]

### Alternatively, we can use the function leaps, which returns the "which" matrix directly
LAozoneSubsets <- leaps(as.matrix(LAozoneTrain[,2:10]), as.matrix(LAozoneTrain[,1])^(1/3), nbest=1)
LAozoneWhich <- LAozoneSubsets$which[-9,] # Drop the full 9-variable model to match regsubsets (default nvmax=8)

### The training error (residual sum of squares) can be found by refitting
### the optimal models, e.g. as
optModel <- list()
TrainErrors <- sum(scale(LAozoneTrain[,1]^(1/3), scale=FALSE)^2) # The RSS of the intercept-only model
for(i in 1:8){
  optModel[[i]] <- lm(ozone^(1/3) ~ ., data=LAozoneTrain[,c(TRUE, LAozoneWhich[i,])])
  TrainErrors[i+1] <- sum(optModel[[i]]$residuals^2)
}

### Alternatively, we can extract this information from LAozoneRegsubsets$ress
### For comparison
LAozoneRegsubsets$ress - TrainErrors

### For comparison with the test error we normalize by the number of samples
TrainErrors <- TrainErrors/nrTrain

### The test error can be found by predictions on the test dataset using the estimated
### optimal models from above.
TestErrors <- sum((LAozoneTest[,1]^(1/3) - mean(LAozoneTrain[,1]^(1/3)))^2)
for(i in 1:8){
  pred <- predict(optModel[[i]], LAozoneTest)
  TestErrors[i+1] <- sum((pred - (LAozoneTest[,1])^(1/3))^2)
}
TestErrors <- TestErrors/nrTest
qplot(x=k, y=value, colour=variable,
      data=melt(data.frame(k=0:8, TestErrors=TestErrors, TrainErrors=TrainErrors), id="k"))
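### An optional aside (not part of the original question): summary.regsubsets also
### returns in-sample selection criteria such as Cp and BIC, which can be compared
### with the test-error curve above. A minimal sketch, reusing the fit from Q4.2:
LAozoneSummary <- summary(LAozoneRegsubsets)
qplot(x=1:8, y=LAozoneSummary$bic, xlab="subset size k", ylab="BIC")
### How closely the BIC minimum agrees with the test-error minimum will depend
### on the random train/test split.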
### Q4.3.
### First LARS
LAozoneLars <- lars(as.matrix(LAozoneTrain[,-1]), as.matrix(LAozoneTrain[,1])^(1/3))
plot(LAozoneLars)
### s indexes positions along the LARS path (predict.lars defaults to mode="step",
### interpolating linearly between steps for non-integer s)
s <- seq(1, 9, by=0.01)
predTrainLars <- predict(LAozoneLars, as.matrix(LAozoneTrain[,-1]), s=s)
residualsTrainLars <- as.data.frame(apply(predTrainLars$fit, 2, "-", y=(LAozoneTrain[,1,drop=FALSE])^(1/3)))
TrainErrorsLars <- apply(residualsTrainLars^2, 2, mean)
predTestLars <- predict(LAozoneLars, as.matrix(LAozoneTest[,-1]), s=s)
residualsTestLars <- as.data.frame(apply(predTestLars$fit, 2, "-", y=(LAozoneTest[,1,drop=FALSE])^(1/3)))
TestErrorsLars <- apply(residualsTestLars^2, 2, mean)
qplot(x=s, y=value, colour=variable,
      data=melt(data.frame(s=s, TestErrors=TestErrorsLars, TrainErrors=TrainErrorsLars), id="s"))

### Then ridge regression
lambda <- seq(0, nrTrain*3, by=0.1)
LAozoneRidge <- lm.ridge(ozone^(1/3) ~ ., data=LAozoneTrain, lambda=lambda)
RidgeCoef <- LAozoneRidge$coef/LAozoneRidge$scales # Back-transform the coefficients to the original predictor scale
RidgeIntercept <- LAozoneRidge$ym - apply(LAozoneRidge$xm*RidgeCoef, 2, sum)
predTrainRidge <- t(t(as.matrix(LAozoneTrain[,-1]) %*% RidgeCoef) + RidgeIntercept)
residualsTrainRidge <- as.data.frame(apply(predTrainRidge, 2, "-", y=(LAozoneTrain[,1,drop=FALSE])^(1/3)))
TrainErrorsRidge <- apply(residualsTrainRidge^2, 2, mean)
predTestRidge <- t(t(as.matrix(LAozoneTest[,-1]) %*% RidgeCoef) + RidgeIntercept)
residualsTestRidge <- as.data.frame(apply(predTestRidge, 2, "-", y=(LAozoneTest[,1,drop=FALSE])^(1/3)))
TestErrorsRidge <- apply(residualsTestRidge^2, 2, mean)
qplot(x=L2, y=value, colour=variable,
      data=melt(data.frame(L2=apply(LAozoneRidge$coef^2, 2, sum),
                           TestErrors=TestErrorsRidge, TrainErrors=TrainErrorsRidge), id="L2"))
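### A possible follow-up (not required by the exercise): lm.ridge also computes
### generalized cross-validation scores for each lambda, so the penalty can be
### chosen in a data-driven way instead of being read off the curves above.
### A minimal sketch, reusing the LAozoneRidge fit:
lambdaGCV <- lambda[which.min(LAozoneRidge$GCV)] # lambda minimizing the GCV criterion
lambdaGCV
select(LAozoneRidge) # MASS::select.ridgelm prints the HKB, L-W and GCV-based choices of lambda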