require(leaps)    #Implements a branch and bound algorithm for finding the best subset
require(MASS)     #Implements the lm.ridge function for ridge regression
require(lars)     #Does, among other things, lasso estimation
require(glmnet)   #Contains an implementation of the elastic net (and lasso ...)
require(ggplot2)
require(reshape2) #Provides the melt function used for the timing plot below

###Timings for different matrix algorithms are computed using this function.
###Note that it is mostly the user timings that are relevant. Note also that these
###timings can be a little tricky depending upon whether the underlying packages
###that do the linear algebra are capable of using multiple processor cores.
###My understanding is that, of the two alternatives, LAPACK is but LINPACK is not.
###Curiously, for the QR-decomposition, but not for the others, LINPACK is the default.
###For comparability we choose LAPACK=TRUE for the QR-decomposition below.
timings <- function(p){
  timings <- list()
  timings$svdUser <- numeric()
  timings$cholUser <- numeric()
  timings$qrUser <- numeric()
  timings$solveUser <- numeric()
  timings$svdSystem <- numeric()
  timings$cholSystem <- numeric()
  timings$qrSystem <- numeric()
  timings$solveSystem <- numeric()
  for(i in seq(along=p)){
    testData <- matrix(rnorm(p[i]*p[i]),ncol=p[i])
    a <- system.time(svd(testData))
    timings$svdUser[i] <- a[1]
    timings$svdSystem[i] <- a[2]
    a <- system.time(chol(t(testData) %*% testData))
    timings$cholUser[i] <- a[1]
    timings$cholSystem[i] <- a[2]
    a <- system.time(qr(testData,LAPACK=TRUE))
    timings$qrUser[i] <- a[1]
    timings$qrSystem[i] <- a[2]
    a <- system.time(solve(t(testData) %*% testData))
    timings$solveUser[i] <- a[1]
    timings$solveSystem[i] <- a[2]
  }
  return(cbind(p=p,as.data.frame(timings)))
}

###Timings are computed and plotted in a log-log plot. It looks as if
###the algorithmic complexity is polynomial and of the same order for
###all four problems.
testTimings <- timings(seq(100,1000,by=50))
ggplot(data=melt(testTimings[,1:5],id.var="p"),aes(x=p,y=value,colour=variable)) +
  geom_line() + coord_trans(x="log",y="log")

###The following regression shows that in this case the practical computational
###complexity is very close for all four algorithms, and is estimated to be
###roughly p^{2.5}. The theoretical asymptotic complexity is p^3, as reported
###in the book.
lm(log(value)~variable/log(p),data=melt(testTimings[,1:5],id.var="p"))
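###Not part of the original analysis: a minimal sketch of how the estimated
###exponents can be read off the fit above. The 'variable...:log(p)' coefficients
###are the slopes in the log-log model, i.e. the estimated exponents a in
###time ~ p^a for each of the four decompositions. The object name complexityFit
###is introduced here for illustration only.
complexityFit <- lm(log(value)~variable/log(p),data=melt(testTimings[,1:5],id.var="p"))
coef(complexityFit)[grep("log\\(p\\)",names(coef(complexityFit)))]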
################################################################################
###The following is the data analysis of the prostate data.
################################################################################
prostate <- read.table("http://www-stat-class.stanford.edu/%7Etibs/ElemStatLearn/datasets/prostate.data")

###Selection of the training data only.
prostateTest <- prostate[!prostate$train,]
prostate <- prostate[prostate$train,]

###For comparison with Table 3.2 in the book. Note that there the scaling is done
###prior to the selection of the training data, which gives some differences.
summary(lm(lpsa~.-1,data=cbind(as.data.frame(scale(prostate[,1:8])),as.data.frame(scale(prostate[,9,drop=FALSE],scale=FALSE)))))
summary(lm(lpsa~.,data=cbind(as.data.frame(scale(prostate[,1:8])),prostate[,9,drop=FALSE])))

###Centering of the response and scaling of the remaining variables is done prior to
###computing the best subsets using the regsubsets function. The residual sums of
###squares of the subsets are then plotted against the subset size, with the best
###subset of each size marked in red.
prostateSubset <- regsubsets(scale(as.matrix(prostate[,1:8])),scale(as.matrix(prostate[,9]),scale=FALSE),nbest=70,really.big=TRUE)
prostateRss <- prostateSubset$ress
prostateRss <- reshape(as.data.frame(prostateRss),varying=list(1:70),direction="long")[,c(3,2)]
names(prostateRss) <- c("k","RSS")
prostateRss <- prostateRss[prostateRss$RSS < 1e35,]
p <- qplot(prostateRss$k-1,prostateRss$RSS,shape=I(19),colour=I("gray50"),ylim=c(0,100),xlab="Subset size k",ylab="Residual sum-of-squares")
p + geom_line(aes(x=0:8,y=prostateSubset$ress[,1]),colour="red") +
  geom_point(aes(x=0:8,y=prostateSubset$ress[,1]),shape=19,colour="red")

###An alternative computation of the best subsets using the leaps function.
###prostateSubset <- leaps(scale(as.matrix(prostate[,1:8])),scale(as.matrix(prostate[,9]),scale=FALSE),nbest=40)
###rssSelect <- function(select) sum(lm.fit(scale(as.matrix(prostate[,select])),scale(as.matrix(prostate[,9]),scale=FALSE))$residuals^2)
###plot(prostateSubset$size-1,apply(prostateSubset$which,1, function(which) rssSelect(c(1:8)[which])),ylim=c(0,100))

###Lasso estimation using the lars package. Fitted values on the training data
###are computed for a grid of points along the path, together with the
###corresponding residual sums of squares.
prostateLars <- lars(as.matrix(prostate[,1:8]),as.matrix(prostate[,9]))
prostatePredict <- predict(prostateLars,as.matrix(prostate[,1:8]),s=seq(2,6,by=0.1))$fit
prostateRSS <- apply((prostatePredict - as.matrix(prostate[,9]) %*% rep(1,41))^2,2,sum)
summary(prostateLars)
plot(prostateLars)

###Elastic net estimation using glmnet (alpha=0.2 mixes the ridge and lasso penalties).
prostateGlmnet <- glmnet(as.matrix(prostate[,1:8]),as.matrix(prostate[,9]),alpha=0.2)
plot(prostateGlmnet)

###Ridge regression using lm.ridge. The effective degrees of freedom
###df(lambda) = sum_j d_j^2/(d_j^2 + lambda) are computed from the singular
###values d_j of the scaled design matrix, and the lambda values are replaced
###by df so that the coefficient paths are plotted against the degrees of freedom.
prostateRidge <- lm.ridge(lpsa~.,data=as.data.frame(scale(prostate[prostate$train,1:9])),lambda=seq(0,1000,0.1))
prostateSvd <- svd(scale(prostate[prostate$train,1:8]))
prostateDf <- sapply(prostateRidge$lambda,function(lambda) sum(prostateSvd$d^2/(prostateSvd$d^2 + lambda)))
prostateRidge$lambda <- prostateDf
plot(prostateRidge)
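###Not part of the original analysis: a minimal sketch of how the penalty for a
###lasso fit could be chosen by 10-fold cross-validation with cv.glmnet. The
###object name prostateCv, the choice alpha=1 (pure lasso) and the seed are
###assumptions made for illustration only; the glmnet fit above uses alpha=0.2.
set.seed(1)                       #Assumed seed, only to make the folds reproducible
prostateCv <- cv.glmnet(as.matrix(prostate[,1:8]),as.matrix(prostate[,9]),alpha=1)
plot(prostateCv)                  #Cross-validated error as a function of log(lambda)
prostateCv$lambda.min             #Penalty minimizing the cross-validated error
coef(prostateCv,s="lambda.min")   #Coefficients at that penalty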