require(leaps)    #Implements a branch and bound algorithm for finding the best subset
require(MASS)     #Implements the lm.ridge function for ridge regression
require(lars)     #Does, among other things, lasso estimation
require(glmnet)   #Contains an implementation of the elastic net (and lasso ...)
require(ggplot2)
require(reshape2) #Provides the melt function used for the timing plot below

###Timings for different matrix algorithms are computed using this function.
###Note that it is mostly the user timings that are relevant. Note also that these
###timings can be a little tricky depending upon whether the underlying packages
###that do the linear algebra are capable of using multiple processor cores.
###My understanding is that, of the two alternatives, LAPACK is but LINPACK is not.
###Curiously, for the QR-decomposition, but not for the others, LINPACK is the default.
###For comparability we choose LAPACK=TRUE for the QR-decomposition below.
timings <- function(p){
  timings <- list()
  timings$svdUser <- numeric()
  timings$cholUser <- numeric()
  timings$qrUser <- numeric()
  timings$solveUser <- numeric()
  timings$svdSystem <- numeric()
  timings$cholSystem <- numeric()
  timings$qrSystem <- numeric()
  timings$solveSystem <- numeric()
  for(i in seq(along=p)){
    testData <- matrix(rnorm(p[i]*p[i]),ncol=p[i])
    a <- system.time(svd(testData))
    timings$svdUser[i] <- a[1]
    timings$svdSystem[i] <- a[2]
    a <- system.time(chol(t(testData) %*% testData))
    timings$cholUser[i] <- a[1]
    timings$cholSystem[i] <- a[2]
    a <- system.time(qr(testData,LAPACK=TRUE))
    timings$qrUser[i] <- a[1]
    timings$qrSystem[i] <- a[2]
    a <- system.time(solve(t(testData) %*% testData))
    timings$solveUser[i] <- a[1]
    timings$solveSystem[i] <- a[2]
  }
  return(cbind(p=p,as.data.frame(timings)))
}

###Timings are computed and plotted in a log-log plot. It looks as if
###the algorithmic complexity is polynomial and of the same order for
###all four problems.
testTimings <- timings(seq(100,1000,by=50))
ggplot(data=melt(testTimings[,1:5],id.var="p"),aes(x=p,y=value,colour=variable)) +
  geom_line() + coord_trans(x="log",y="log")

###The following regression shows that in this case the practical computational
###complexity is very close for all four algorithms, and is estimated to be
###roughly p^{2.5}. The theoretical asymptotic complexity is p^3, as reported
###in the book.
lm(log(value)~variable/log(p),data=melt(testTimings[,1:5],id.var="p"))
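###Not part of the original analysis: a minimal sketch of how the estimated
###exponents can be read off the fit above. The 'variable...:log(p)' coefficients
###are the slopes in the log-log model, i.e. the estimated exponents a in
###time ~ p^a for each of the four decompositions. The object name complexityFit
###is introduced here for illustration only.
complexityFit <- lm(log(value)~variable/log(p),data=melt(testTimings[,1:5],id.var="p"))
coef(complexityFit)[grep("log\\(p\\)",names(coef(complexityFit)))]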
################################################################################
###The following is the data analysis of the prostate data.
################################################################################
prostate <- read.table("http://www-stat-class.stanford.edu/%7Etibs/ElemStatLearn/datasets/prostate.data")

###Selection of the training data only.
prostateTest <- prostate[!prostate$train,]
prostate <- prostate[prostate$train,]

###For comparison with Table 3.2 in the book. Note that there the scaling is done
###prior to the selection of the training data, which gives some differences.
summary(lm(lpsa~.-1,data=cbind(as.data.frame(scale(prostate[,1:8])),as.data.frame(scale(prostate[,9,drop=FALSE],scale=FALSE)))))
summary(lm(lpsa~.,data=cbind(as.data.frame(scale(prostate[,1:8])),prostate[,9,drop=FALSE])))

###Centering of the response and scaling of the remaining variables is done prior to
###computing the best subsets using the regsubsets function. The residual sums of
###squares of the subsets are then plotted against the subset size, with the best
###subset of each size marked in red.
prostateSubset <- regsubsets(scale(as.matrix(prostate[,1:8])),scale(as.matrix(prostate[,9]),scale=FALSE),nbest=70,really.big=TRUE)
prostateRss <- prostateSubset$ress
prostateRss <- reshape(as.data.frame(prostateRss),varying=list(1:70),direction="long")[,c(3,2)]
names(prostateRss) <- c("k","RSS")
prostateRss <- prostateRss[prostateRss$RSS < 1e35,]
p <- qplot(prostateRss$k-1,prostateRss$RSS,shape=I(19),colour=I("gray50"),ylim=c(0,100),xlab="Subset size k",ylab="Residual sum-of-squares")
p + geom_line(aes(x=0:8,y=prostateSubset$ress[,1]),colour="red") +
  geom_point(aes(x=0:8,y=prostateSubset$ress[,1]),shape=19,colour="red")

###An alternative computation of the best subsets using the leaps function.
###prostateSubset <- leaps(scale(as.matrix(prostate[,1:8])),scale(as.matrix(prostate[,9]),scale=FALSE),nbest=40)
###rssSelect <- function(select) sum(lm.fit(scale(as.matrix(prostate[,select])),scale(as.matrix(prostate[,9]),scale=FALSE))$residuals^2)
###plot(prostateSubset$size-1,apply(prostateSubset$which,1, function(which) rssSelect(c(1:8)[which])),ylim=c(0,100))

###Lasso estimation using the lars package. Fitted values on the training data
###are computed for a grid of points along the path, together with the
###corresponding residual sums of squares.
prostateLars <- lars(as.matrix(prostate[,1:8]),as.matrix(prostate[,9]))
prostatePredict <- predict(prostateLars,as.matrix(prostate[,1:8]),s=seq(2,6,by=0.1))$fit
prostateRSS <- apply((prostatePredict - as.matrix(prostate[,9]) %*% rep(1,41))^2,2,sum)
summary(prostateLars)
plot(prostateLars)

###Elastic net estimation using glmnet (alpha=0.2 mixes the ridge and lasso penalties).
prostateGlmnet <- glmnet(as.matrix(prostate[,1:8]),as.matrix(prostate[,9]),alpha=0.2)
plot(prostateGlmnet)

###Ridge regression using lm.ridge. The effective degrees of freedom
###df(lambda) = sum_j d_j^2/(d_j^2 + lambda) are computed from the singular
###values d_j of the scaled design matrix, and the lambda values are replaced
###by df so that the coefficient paths are plotted against the degrees of freedom.
prostateRidge <- lm.ridge(lpsa~.,data=as.data.frame(scale(prostate[prostate$train,1:9])),lambda=seq(0,1000,0.1))
prostateSvd <- svd(scale(prostate[prostate$train,1:8]))
prostateDf <- sapply(prostateRidge$lambda,function(lambda) sum(prostateSvd$d^2/(prostateSvd$d^2 + lambda)))
prostateRidge$lambda <- prostateDf
plot(prostateRidge)
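###Not part of the original analysis: a minimal sketch of how the penalty for a
###lasso fit could be chosen by 10-fold cross-validation with cv.glmnet. The
###object name prostateCv, the choice alpha=1 (pure lasso) and the seed are
###assumptions made for illustration only; the glmnet fit above uses alpha=0.2.
set.seed(1)                       #Assumed seed, only to make the folds reproducible
prostateCv <- cv.glmnet(as.matrix(prostate[,1:8]),as.matrix(prostate[,9]),alpha=1)
plot(prostateCv)                  #Cross-validated error as a function of log(lambda)
prostateCv$lambda.min             #Penalty minimizing the cross-validated error
coef(prostateCv,s="lambda.min")   #Coefficients at that penalty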