################################################################################
### Q4.1
################################################################################

### First we implement two specific g-functions for soft and hard
### thresholding.

gsoft <- function(d, Delta = 0) {
  shrunk <- abs(d) - Delta
  sign(d) * shrunk * (shrunk > 0)
}

ghard <- function(d, Delta = 0) {
  shrunk <- abs(d) - Delta
  d * (shrunk > 0)
}

### Then we implement the nearest shrunken centroid classifier via three
### separate functions. The 'nsc' function does the fundamental
### computations of group means and estimates of standard deviations,
### the 'shrink' function does the actual shrinking, and the 'predict'
### method does the final prediction on a new data set.

nsc <- function(x, y, g = gsoft, Delta = 0, s0 = median(sd), ...) {
  ## Computation of group means and residuals using linear algebra.
  N <- length(y)
  Nk <- tabulate(y)
  A <- model.matrix(~ y - 1)  ## This generates a 'design matrix'.
  centroids <- (diag(1/Nk) %*% t(A)) %*% x
  rownames(centroids) <- levels(y)
  residuals <- x - A %*% centroids
  totalMean <- colMeans(x)
  pi <- Nk/N
  mk <- sqrt(1/Nk - 1/N)
  ## Estimates of standard deviations.
  sd <- sqrt(colSums(residuals^2)/(N - length(Nk)))
  ## The following computes the d-quantity, which is subject to
  ## shrinkage. Note that the scaling is the same across groups except
  ## for the mk quantity, which is, on the other hand, the same across
  ## all coordinates. The constant 's0' is either specified as an
  ## argument or, by lazy evaluation of the default, computed at this
  ## point as the median of the estimated standard deviations.
  d <- scale(centroids, center = totalMean, scale = sd + s0)/mk
  ## Then we collect the computed quantities in a list, call the
  ## shrink function and return the resulting object.
  result <- list()
  result$d <- d
  result$pi <- pi
  result$totalMean <- totalMean
  result$mk <- mk
  result$s0 <- s0
  result$centroids <- centroids
  result$sd <- sd
  result$Delta <- Delta
  class(result) <- "nsc"  ## Setting a formal class label on the object returned.
  result <- shrink(result, g = g, Delta = Delta)
  return(result)
}

### The 'shrink' function takes an 'nsc' object and does the actual
### shrinking depending on the chosen g function and threshold.

shrink <- function(object, g = gsoft, Delta = 0, ...) {
  shrunken <- g(object$d, Delta)
  usedCentroids <- which(colSums(shrunken != 0) != 0)
  ## We use the 'sweep' function systematically to perform,
  ## e.g., element-by-element multiplications of each row with a row
  ## vector. Such operations can also be carried out by "vector
  ## recycling" in R if everything gets transposed. Some performance
  ## tests suggest that we pay a price in computing time for using
  ## 'sweep', but I have chosen to keep it in this solution to avoid
  ## confusing transpositions.
  if(length(usedCentroids) > 0) {
    shrunkenCentroids <- object$mk *
      sweep(shrunken[, usedCentroids, drop = FALSE], 2,
            object$sd[usedCentroids] + object$s0, "*")
    shrunkenCentroids <- sweep(shrunkenCentroids, 2,
                               object$totalMean[usedCentroids], "+")
  } else {
    shrunkenCentroids <- matrix(nrow = length(object$mk), ncol = 0)
  }
  object$shrunkenCentroids <- shrunkenCentroids
  object$usedCentroids <- usedCentroids
  return(object)
}
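### A small added illustration (not part of the original exercise) of how
### the two thresholding functions behave on a made-up vector of
### d-values: soft thresholding shrinks all surviving entries towards 0
### by Delta, while hard thresholding keeps the surviving entries
### unchanged and zeroes out the rest.

dToy <- c(-2.5, -0.3, 0.1, 1.2, 3.0)  ## hypothetical d-values
gsoft(dToy, Delta = 1)  ## -1.5  0.0  0.0  0.2  2.0
ghard(dToy, Delta = 1)  ## -2.5  0.0  0.0  1.2  3.0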
### The predict function is implemented to work on matrices of new
### observations. The 'sdOffset' argument implements a feature of the
### 'pamr.train' function from the pamr package, to be used below,
### which is not in the book. In the book, the 's0' quantity is only
### used for the shrinking and not for the prediction. In 'pamr.train'
### it is used for both. With 'sdOffset = TRUE' the present function
### mimics 'pamr.train' more closely.

predict.nsc <- function(object, xnew, g = gsoft, Delta = NULL,
                        sdOffset = FALSE, ...) {
  if(!is.null(Delta))  ## Should a new Delta be used?
    object <- shrink(object, g = g, Delta = Delta)
  sd <- object$sd[object$usedCentroids]
  if(sdOffset)
    sd <- sd + object$s0
  scaledXnew <- sweep(xnew[, object$usedCentroids, drop = FALSE], 2, sd, "/")
  K <- length(object$mk)  ## Number of groups.
  delta <- matrix(0, dim(xnew)[1], K)
  groupNames <- rownames(object$centroids)
  scaledCentroids <- sweep(object$shrunkenCentroids, 2, sd, "/")
  if(length(object$usedCentroids) > 0) {
    for(k in 1:K) {
      scaledRes <- sweep(scaledXnew, 2, scaledCentroids[k, ], "-")
      delta[, k] <- 2 * log(object$pi[k]) - rowSums(scaledRes^2)
    }
  } else {
    delta <- matrix(2 * log(object$pi), dim(xnew)[1], K, byrow = TRUE)
  }
  groupHat <- groupNames[apply(delta, 1, which.max)]
  return(groupHat)
}

### Note that by using the name 'predict.nsc' we are implementing what
### is called a _method_ for the generic 'predict' function so that it
### works automagically on objects of class 'nsc'.

################################################################################
### Q4.2
################################################################################

### Reading the data sets to be used.

GolubXTrain <- read.table("http://www.math.ku.dk/~richard/courses/statlearn2009/GolubXTrain.txt",
                          row.names = 1)
GolubYTrain <- read.table("http://www.math.ku.dk/~richard/courses/statlearn2009/GolubYTrain.txt",
                          row.names = 1)
GolubXTest <- read.table("http://www.math.ku.dk/~richard/courses/statlearn2009/GolubXTest.txt",
                         row.names = 1)
GolubYTest <- read.table("http://www.math.ku.dk/~richard/courses/statlearn2009/GolubYTest.txt",
                         row.names = 1)

### Transforming the data to the right class and shape. The group
### labels are explicitly turned into factors, which 'nsc' relies on.

Xtrain <- t(as.matrix(GolubXTrain))
yTrain <- factor(GolubYTrain[, 1])
Xtest <- t(as.matrix(GolubXTest))
yTest <- factor(GolubYTest[, 1])

### An initial nsc object is created, here with Delta chosen as the
### largest value for which the prediction error on the test data set
### below is minimal, that is, equal to 2 errors in this case.

GolubNsc <- nsc(Xtrain, yTrain, Delta = 5.2)

### The centroid component(s) not shrunk to 0 are

GolubNsc$shrunkenCentroids

### Helper functions for computing training and test errors for a
### sequence of Delta values.

trainErrors <- function(Delta, ...)
  sum(predict(GolubNsc, Xtrain, Delta = Delta, ...) != yTrain)
testErrors <- function(Delta, ...)
  sum(predict(GolubNsc, Xtest, Delta = Delta, ...) != yTest)

Delta <- seq(0, 6, by = 0.2)
trainE <- sapply(Delta, trainErrors)
testE <- sapply(Delta, testErrors)

### Error rates; there are 38 training and 34 test observations.
plot(Delta, trainE/38, col = "green", pch = 20, type = "b",
     ylim = c(0, 0.5), ylab = "Prediction error")
points(Delta, testE/34, col = "purple", pch = 20)
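### The value Delta = 5.2 above was read off the error curves. As an
### added sketch (not part of the original solution), the "largest Delta
### with minimal test error" can also be extracted programmatically from
### the 'Delta' and 'testE' vectors just computed; if the claim above
### holds, this should reproduce the value used for 'GolubNsc'.

DeltaHat <- max(Delta[testE == min(testE)])
DeltaHat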
### Try the following plots with different choices of Delta in the
### computation of 'GolubNsc' above.

par(mfcol = c(2, 1), mar = c(1, 3, 1, 1))
for(k in 1:2) {
  plot(GolubNsc$centroids[k, ] - GolubNsc$totalMean, type = "h",
       ylim = c(-1, 1), col = "gray", xlab = "Gene index",
       ylab = "Difference from total mean", axes = FALSE)
  points(GolubNsc$usedCentroids,
         GolubNsc$shrunkenCentroids[k, ] -
           GolubNsc$totalMean[GolubNsc$usedCentroids],
         type = "h", col = "blue", lwd = 2)
  axis(2)
}
graphics.off()

################################################################################
### ------------------ Digression - Digression ---------------------------
### Partial replication of the analysis from the book using the SRBCT data
################################################################################

SRBCTxtrain <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.xtrain")
SRBCTytrain <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.ytrain")
SRBCTxtest <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.xtest")
SRBCTytest <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.ytest")

Xtrain2 <- t(as.matrix(SRBCTxtrain))
yTrain2 <- as.factor(t(SRBCTytrain))
yTest2 <- as.factor(t(SRBCTytest))
Xtest2 <- t(as.matrix(SRBCTxtest))[!is.na(yTest2), ]
yTest2 <- yTest2[!is.na(yTest2)]

### An initial nsc object is created, here with Delta chosen as the
### largest value for which the prediction error on the test data set
### below is minimal, that is, equal to 0 errors in this case.

SRBCNsc <- nsc(Xtrain2, yTrain2, Delta = 5.1)

trainErrors <- function(Delta, ...)
  sum(predict(SRBCNsc, Xtrain2, Delta = Delta, ...) != yTrain2)
testErrors <- function(Delta, ...)
  sum(predict(SRBCNsc, Xtest2, Delta = Delta, ...) != yTest2)

### We use 'sdOffset = TRUE' here to get results as close as possible
### to those in Figure 18.4, which were most likely produced with the
### pamr package.

Delta <- seq(0, 8, length.out = 50)
trainE <- sapply(Delta, trainErrors, sdOffset = TRUE)
testE <- sapply(Delta, testErrors, sdOffset = TRUE)

### This figure resembles Figure 18.4 (Top).

plot(Delta, trainE/63, col = "green", pch = 20, type = "b",
     ylim = c(0, 0.8), ylab = "Prediction error")
points(Delta, testE/20, col = "purple", pch = 20, type = "b")

### And this figure resembles Figure 18.4 (Bottom), but is not
### identical. I have not been able to figure out precisely what is
### plotted there. The book claims it is the d-values, which are the
### differences below suitably normalized, but I could not reproduce
### this exactly.

par(mfcol = c(4, 1), mar = c(1, 3, 1, 1))
for(k in 1:4) {
  plot(SRBCNsc$centroids[k, ] - SRBCNsc$totalMean, type = "h",
       ylim = c(-1, 1), col = "gray", xlab = "Gene index",
       ylab = "Difference from total mean", axes = FALSE)
  points(SRBCNsc$usedCentroids,
         SRBCNsc$shrunkenCentroids[k, ] -
           SRBCNsc$totalMean[SRBCNsc$usedCentroids],
         type = "h", col = "blue", lwd = 2)
  axis(2)
}
graphics.off()
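### As a small added aside (not part of the original solution), the same
### machinery can be run with the hard thresholding function 'ghard'
### instead of 'gsoft'; the set of selected genes and the resulting
### errors will in general differ.

SRBCNscHard <- nsc(Xtrain2, yTrain2, g = ghard, Delta = 5.1)
length(SRBCNscHard$usedCentroids)  ## Genes surviving hard thresholding.
sum(predict(SRBCNscHard, Xtest2, sdOffset = TRUE) != yTest2)  ## Test errors.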
### A complete implementation of nsc targeted towards microarray
### classification is in the 'pamr' package.

require(pamr)
SRBCpamr <- pamr.train(list(x = t(Xtrain2), y = yTrain2), threshold = Delta)
all(trainE[1:48] == SRBCpamr$errors)  ### pamr computes the same training errors, it seems.
pamrTest <- sapply(Delta,
                   function(d) {
                     sum(pamr.predict(SRBCpamr, t(Xtest2), d) != yTest2)
                   })
all(testE == pamrTest)  ### And the same number of errors on the test data.

################################################################################
### Q4.3
################################################################################

require(glmnet)  ### Lasso computations.

zeroOneCost <- function(pi, r) mean(abs(r - pi) > 0.5)
likelihoodCost <- function(pi, r) -mean(log(r*pi + (1 - r)*(1 - pi)))

GolubNet <- glmnet(Xtrain, yTrain, family = "binomial")
plot(GolubNet)

### Predictions of class probabilities.

pHatTrain <- predict(GolubNet, Xtrain, type = "response")
pHatTest <- predict(GolubNet, Xtest, type = "response")

### With the zero-one loss.

yTrain01 <- as.numeric(yTrain) - 1
yTest01 <- as.numeric(yTest) - 1
TrainErrors <- apply(pHatTrain, 2, zeroOneCost, r = yTrain01)
TestErrors <- apply(pHatTest, 2, zeroOneCost, r = yTest01)
plot(log(GolubNet$lambda), TrainErrors, col = "green", pch = 20,
     type = "b", ylim = c(0, 0.4))
points(log(GolubNet$lambda), TestErrors, col = "purple", pch = 20, type = "b")

### And with the likelihood loss.

TrainErrors <- apply(pHatTrain, 2, likelihoodCost, r = yTrain01)
TestErrors <- apply(pHatTest, 2, likelihoodCost, r = yTest01)
plot(log(GolubNet$lambda), TrainErrors, col = "green", pch = 20,
     type = "b", ylim = c(0, 0.4), ylab = "minus-log-likelihood")
points(log(GolubNet$lambda), TestErrors, col = "purple", pch = 20, type = "b")

### Genes selected for a predictor based on choosing the least complex
### model (largest lambda) with a sufficiently small test error.

which(GolubNet$beta[, 13] != 0)

################################################################################
### Q4.5
################################################################################

require(MASS)

tTest <- function(x, y) {
  N <- length(y)
  Nk <- tabulate(y)
  A <- model.matrix(~ y - 1)
  centroids <- (diag(1/Nk) %*% t(A)) %*% x
  residuals <- x - A %*% centroids
  sd <- sqrt(colSums(residuals^2)/(N - length(Nk)))
  fac <- 1/sqrt(1/Nk[1] + 1/Nk[2])
  tTest <- fac*(centroids[1, ] - centroids[2, ])/sd
  return(list(tTest = tTest, centroids = centroids))
}

tTests <- tTest(Xtrain, yTrain)
geneOrder <- order(abs(tTests$tTest), decreasing = TRUE)

### A volcano plot can be interesting. It shows the relation between the
### size of the group differences and the absolute values of the t-test
### statistic. The most interesting genes are those to the north-east
### and north-west in the plot.

plot(tTests$centroids[1, ] - tTests$centroids[2, ], abs(tTests$tTest),
     pch = 20)

R <- 35  ## Max number of genes to include.
testError <- numeric(R)
trainError <- numeric(R)
for(i in 2:R) {
  ordLda <- lda(Xtrain[, geneOrder[1:i]], yTrain)
  trainError[i] <- sum(predict(ordLda)$class != yTrain)
  testError[i] <- sum(predict(ordLda, Xtest[, geneOrder[1:i]])$class != yTest)
}

plot(2:R, trainError[2:R]/38, col = "green", pch = 20, type = "b",
     ylim = c(0, 0.2))
points(2:R, testError[2:R]/34, col = "purple", pch = 20, type = "b")
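### As with the Delta selection earlier, an added sketch (not part of
### the original solution) of how the smallest number of top-ranked
### genes attaining the minimal test error can be read off the
### 'testError' vector just computed. Only entries 2:R are considered,
### since entry 1 was never filled in the loop above.

nGenesHat <- min(which(testError[2:R] == min(testError[2:R]))) + 1
nGenesHat  ## Smallest number of genes with minimal test error among 2:R.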