################################################################################
### Q4.1
################################################################################

### First we implement two specific g-functions for soft and hard
### thresholding.

gsoft <- function(d, Delta = 0) {
  shrunk <- abs(d) - Delta
  sign(d) * shrunk * (shrunk > 0)
}

ghard <- function(d, Delta = 0) {
  shrunk <- abs(d) - Delta
  d * (shrunk > 0)
}

### Then we implement the nearest shrunken centroid classifier via three
### separate functions. The 'nsc' function does the fundamental
### computations of group means and estimates of standard deviations,
### the 'shrink' function does the actual shrinking, and the 'predict'
### method does the final prediction on a new data set.

nsc <- function(x, y, g = gsoft, Delta = 0, s0 = median(sd), ...) {
  ## Computation of group means and residuals using linear algebra.
  N <- length(y)
  Nk <- tabulate(y)
  A <- model.matrix(~ y - 1)  ## This generates a 'design matrix'.
  centroids <- (diag(1/Nk) %*% t(A)) %*% x
  rownames(centroids) <- levels(y)
  residuals <- x - A %*% centroids
  totalMean <- colMeans(x)
  pi <- Nk/N
  mk <- sqrt(1/Nk - 1/N)
  ## Estimates of standard deviations.
  sd <- sqrt(colSums(residuals^2)/(N - length(Nk)))
  ## The following computes the d-quantity, which is subject to
  ## shrinkage. Note that the scaling is the same across groups except
  ## for the mk quantity, which is, on the other hand, the same across
  ## all coordinates. The constant 's0' is either specified as an
  ## argument or, by lazy evaluation of the default, computed at this
  ## point as the median of the estimated standard deviations.
  d <- scale(centroids, center = totalMean, scale = sd + s0)/mk
  ## Then we collect the computed quantities in a list, call the
  ## shrink function and return the resulting object.
  result <- list()
  result$d <- d
  result$pi <- pi
  result$totalMean <- totalMean
  result$mk <- mk
  result$s0 <- s0
  result$centroids <- centroids
  result$sd <- sd
  result$Delta <- Delta
  class(result) <- "nsc"  ## Setting a formal class label on the object returned.
  result <- shrink(result, g = g, Delta = Delta)
  return(result)
}

### The 'shrink' function takes an 'nsc' object and does the actual
### shrinking depending on the chosen g function and threshold.

shrink <- function(object, g = gsoft, Delta = 0, ...) {
  shrunken <- g(object$d, Delta)
  usedCentroids <- which(colSums(shrunken != 0) != 0)
  ## We use the 'sweep' function systematically to perform,
  ## e.g., element-by-element multiplications of each row with a row
  ## vector. Such operations can also be carried out by "vector
  ## recycling" in R if everything gets transposed. Some performance
  ## tests suggest that we pay a price in computing time for using
  ## 'sweep', but I have chosen to keep it in this solution to avoid
  ## confusing transpositions.
  if(length(usedCentroids) > 0) {
    shrunkenCentroids <- object$mk *
      sweep(shrunken[, usedCentroids, drop = FALSE], 2,
            object$sd[usedCentroids] + object$s0, "*")
    shrunkenCentroids <- sweep(shrunkenCentroids, 2,
                               object$totalMean[usedCentroids], "+")
  } else {
    shrunkenCentroids <- matrix(nrow = length(object$mk), ncol = 0)
  }
  object$shrunkenCentroids <- shrunkenCentroids
  object$usedCentroids <- usedCentroids
  return(object)
}
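### A small added illustration (not part of the original exercise) of how
### the two thresholding functions behave on a made-up vector of
### d-values: soft thresholding shrinks all surviving entries towards 0
### by Delta, while hard thresholding keeps the surviving entries
### unchanged and zeroes out the rest.

dToy <- c(-2.5, -0.3, 0.1, 1.2, 3.0)  ## hypothetical d-values
gsoft(dToy, Delta = 1)  ## -1.5  0.0  0.0  0.2  2.0
ghard(dToy, Delta = 1)  ## -2.5  0.0  0.0  1.2  3.0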
### The predict function is implemented to work on matrices of new
### observations. The 'sdOffset' argument implements a feature of the
### 'pamr.train' function from the pamr package, to be used below,
### which is not in the book. In the book, the 's0' quantity is only
### used for the shrinking and not for the prediction. In 'pamr.train'
### it is used for both. With 'sdOffset = TRUE' the present function
### mimics 'pamr.train' more closely.

predict.nsc <- function(object, xnew, g = gsoft, Delta = NULL,
                        sdOffset = FALSE, ...) {
  if(!is.null(Delta))  ## Should a new Delta be used?
    object <- shrink(object, g = g, Delta = Delta)
  sd <- object$sd[object$usedCentroids]
  if(sdOffset)
    sd <- sd + object$s0
  scaledXnew <- sweep(xnew[, object$usedCentroids, drop = FALSE], 2, sd, "/")
  K <- length(object$mk)  ## Number of groups.
  delta <- matrix(0, dim(xnew)[1], K)
  groupNames <- rownames(object$centroids)
  scaledCentroids <- sweep(object$shrunkenCentroids, 2, sd, "/")
  if(length(object$usedCentroids) > 0) {
    for(k in 1:K) {
      scaledRes <- sweep(scaledXnew, 2, scaledCentroids[k, ], "-")
      delta[, k] <- 2 * log(object$pi[k]) - rowSums(scaledRes^2)
    }
  } else {
    delta <- matrix(2 * log(object$pi), dim(xnew)[1], K, byrow = TRUE)
  }
  groupHat <- groupNames[apply(delta, 1, which.max)]
  return(groupHat)
}

### Note that by using the name 'predict.nsc' we are implementing what
### is called a _method_ for the generic 'predict' function so that it
### works automagically on objects of class 'nsc'.

################################################################################
### Q4.2
################################################################################

### Reading the data sets to be used.

GolubXTrain <- read.table("http://www.math.ku.dk/~richard/courses/statlearn2009/GolubXTrain.txt",
                          row.names = 1)
GolubYTrain <- read.table("http://www.math.ku.dk/~richard/courses/statlearn2009/GolubYTrain.txt",
                          row.names = 1)
GolubXTest <- read.table("http://www.math.ku.dk/~richard/courses/statlearn2009/GolubXTest.txt",
                         row.names = 1)
GolubYTest <- read.table("http://www.math.ku.dk/~richard/courses/statlearn2009/GolubYTest.txt",
                         row.names = 1)

### Transforming the data to the right class and shape. The group
### labels are explicitly turned into factors, which 'nsc' relies on.

Xtrain <- t(as.matrix(GolubXTrain))
yTrain <- factor(GolubYTrain[, 1])
Xtest <- t(as.matrix(GolubXTest))
yTest <- factor(GolubYTest[, 1])

### An initial nsc object is created, here with Delta chosen as the
### largest value for which the prediction error on the test data set
### below is minimal, that is, equal to 2 errors in this case.

GolubNsc <- nsc(Xtrain, yTrain, Delta = 5.2)

### The centroid component(s) not shrunk to 0 are

GolubNsc$shrunkenCentroids

### Helper functions for computing training and test errors for a
### sequence of Delta values.

trainErrors <- function(Delta, ...)
  sum(predict(GolubNsc, Xtrain, Delta = Delta, ...) != yTrain)
testErrors <- function(Delta, ...)
  sum(predict(GolubNsc, Xtest, Delta = Delta, ...) != yTest)

Delta <- seq(0, 6, by = 0.2)
trainE <- sapply(Delta, trainErrors)
testE <- sapply(Delta, testErrors)

### Error rates; there are 38 training and 34 test observations.
plot(Delta, trainE/38, col = "green", pch = 20, type = "b",
     ylim = c(0, 0.5), ylab = "Prediction error")
points(Delta, testE/34, col = "purple", pch = 20)
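### The value Delta = 5.2 above was read off the error curves. As an
### added sketch (not part of the original solution), the "largest Delta
### with minimal test error" can also be extracted programmatically from
### the 'Delta' and 'testE' vectors just computed; if the claim above
### holds, this should reproduce the value used for 'GolubNsc'.

DeltaHat <- max(Delta[testE == min(testE)])
DeltaHat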
### Try the following plots with different choices of Delta in the
### computation of 'GolubNsc' above.

par(mfcol = c(2, 1), mar = c(1, 3, 1, 1))
for(k in 1:2) {
  plot(GolubNsc$centroids[k, ] - GolubNsc$totalMean, type = "h",
       ylim = c(-1, 1), col = "gray", xlab = "Gene index",
       ylab = "Difference from total mean", axes = FALSE)
  points(GolubNsc$usedCentroids,
         GolubNsc$shrunkenCentroids[k, ] -
           GolubNsc$totalMean[GolubNsc$usedCentroids],
         type = "h", col = "blue", lwd = 2)
  axis(2)
}
graphics.off()

################################################################################
### ------------------ Digression - Digression ---------------------------
### Partial replication of the analysis from the book using the SRBCT data
################################################################################

SRBCTxtrain <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.xtrain")
SRBCTytrain <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.ytrain")
SRBCTxtest <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.xtest")
SRBCTytest <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.ytest")

Xtrain2 <- t(as.matrix(SRBCTxtrain))
yTrain2 <- as.factor(t(SRBCTytrain))
yTest2 <- as.factor(t(SRBCTytest))
Xtest2 <- t(as.matrix(SRBCTxtest))[!is.na(yTest2), ]
yTest2 <- yTest2[!is.na(yTest2)]

### An initial nsc object is created, here with Delta chosen as the
### largest value for which the prediction error on the test data set
### below is minimal, that is, equal to 0 errors in this case.

SRBCNsc <- nsc(Xtrain2, yTrain2, Delta = 5.1)

trainErrors <- function(Delta, ...)
  sum(predict(SRBCNsc, Xtrain2, Delta = Delta, ...) != yTrain2)
testErrors <- function(Delta, ...)
  sum(predict(SRBCNsc, Xtest2, Delta = Delta, ...) != yTest2)

### We use 'sdOffset = TRUE' here to get results as close as possible
### to those in Figure 18.4, which were most likely produced with the
### pamr package.

Delta <- seq(0, 8, length.out = 50)
trainE <- sapply(Delta, trainErrors, sdOffset = TRUE)
testE <- sapply(Delta, testErrors, sdOffset = TRUE)

### This figure resembles Figure 18.4 (Top).

plot(Delta, trainE/63, col = "green", pch = 20, type = "b",
     ylim = c(0, 0.8), ylab = "Prediction error")
points(Delta, testE/20, col = "purple", pch = 20, type = "b")

### And this figure resembles Figure 18.4 (Bottom), but is not
### identical. I have not been able to figure out precisely what is
### plotted there. The book claims it is the d-values, which are the
### differences below suitably normalized, but I could not reproduce
### this exactly.

par(mfcol = c(4, 1), mar = c(1, 3, 1, 1))
for(k in 1:4) {
  plot(SRBCNsc$centroids[k, ] - SRBCNsc$totalMean, type = "h",
       ylim = c(-1, 1), col = "gray", xlab = "Gene index",
       ylab = "Difference from total mean", axes = FALSE)
  points(SRBCNsc$usedCentroids,
         SRBCNsc$shrunkenCentroids[k, ] -
           SRBCNsc$totalMean[SRBCNsc$usedCentroids],
         type = "h", col = "blue", lwd = 2)
  axis(2)
}
graphics.off()
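### As a small added aside (not part of the original solution), the same
### machinery can be run with the hard thresholding function 'ghard'
### instead of 'gsoft'; the set of selected genes and the resulting
### errors will in general differ.

SRBCNscHard <- nsc(Xtrain2, yTrain2, g = ghard, Delta = 5.1)
length(SRBCNscHard$usedCentroids)  ## Genes surviving hard thresholding.
sum(predict(SRBCNscHard, Xtest2, sdOffset = TRUE) != yTest2)  ## Test errors.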
### A complete implementation of nsc targeted towards microarray
### classification is in the 'pamr' package.

require(pamr)
SRBCpamr <- pamr.train(list(x = t(Xtrain2), y = yTrain2), threshold = Delta)
all(trainE[1:48] == SRBCpamr$errors)  ### pamr computes the same training errors, it seems.
pamrTest <- sapply(Delta,
                   function(d) {
                     sum(pamr.predict(SRBCpamr, t(Xtest2), d) != yTest2)
                   })
all(testE == pamrTest)  ### And the same number of errors on the test data.

################################################################################
### Q4.3
################################################################################

require(glmnet)  ### Lasso computations.

zeroOneCost <- function(pi, r) mean(abs(r - pi) > 0.5)
likelihoodCost <- function(pi, r) -mean(log(r*pi + (1 - r)*(1 - pi)))

GolubNet <- glmnet(Xtrain, yTrain, family = "binomial")
plot(GolubNet)

### Predictions of class probabilities.

pHatTrain <- predict(GolubNet, Xtrain, type = "response")
pHatTest <- predict(GolubNet, Xtest, type = "response")

### With the zero-one loss.

yTrain01 <- as.numeric(yTrain) - 1
yTest01 <- as.numeric(yTest) - 1
TrainErrors <- apply(pHatTrain, 2, zeroOneCost, r = yTrain01)
TestErrors <- apply(pHatTest, 2, zeroOneCost, r = yTest01)
plot(log(GolubNet$lambda), TrainErrors, col = "green", pch = 20,
     type = "b", ylim = c(0, 0.4))
points(log(GolubNet$lambda), TestErrors, col = "purple", pch = 20, type = "b")

### And with the likelihood loss.

TrainErrors <- apply(pHatTrain, 2, likelihoodCost, r = yTrain01)
TestErrors <- apply(pHatTest, 2, likelihoodCost, r = yTest01)
plot(log(GolubNet$lambda), TrainErrors, col = "green", pch = 20,
     type = "b", ylim = c(0, 0.4), ylab = "minus-log-likelihood")
points(log(GolubNet$lambda), TestErrors, col = "purple", pch = 20, type = "b")

### Genes selected for a predictor based on choosing the least complex
### model (largest lambda) with a sufficiently small test error.

which(GolubNet$beta[, 13] != 0)

################################################################################
### Q4.5
################################################################################

require(MASS)

tTest <- function(x, y) {
  N <- length(y)
  Nk <- tabulate(y)
  A <- model.matrix(~ y - 1)
  centroids <- (diag(1/Nk) %*% t(A)) %*% x
  residuals <- x - A %*% centroids
  sd <- sqrt(colSums(residuals^2)/(N - length(Nk)))
  fac <- 1/sqrt(1/Nk[1] + 1/Nk[2])
  tTest <- fac*(centroids[1, ] - centroids[2, ])/sd
  return(list(tTest = tTest, centroids = centroids))
}

tTests <- tTest(Xtrain, yTrain)
geneOrder <- order(abs(tTests$tTest), decreasing = TRUE)

### A volcano plot can be interesting. It shows the relation between the
### size of the group differences and the absolute values of the t-test
### statistic. The most interesting genes are those to the north-east
### and north-west in the plot.

plot(tTests$centroids[1, ] - tTests$centroids[2, ], abs(tTests$tTest),
     pch = 20)

R <- 35  ## Max number of genes to include.
testError <- numeric(R)
trainError <- numeric(R)
for(i in 2:R) {
  ordLda <- lda(Xtrain[, geneOrder[1:i]], yTrain)
  trainError[i] <- sum(predict(ordLda)$class != yTrain)
  testError[i] <- sum(predict(ordLda, Xtest[, geneOrder[1:i]])$class != yTest)
}

plot(2:R, trainError[2:R]/38, col = "green", pch = 20, type = "b",
     ylim = c(0, 0.2))
points(2:R, testError[2:R]/34, col = "purple", pch = 20, type = "b")
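### As with the Delta selection earlier, an added sketch (not part of
### the original solution) of how the smallest number of top-ranked
### genes attaining the minimal test error can be read off the
### 'testError' vector just computed. Only entries 2:R are considered,
### since entry 1 was never filled in the loop above.

nGenesHat <- min(which(testError[2:R] == min(testError[2:R]))) + 1
nGenesHat  ## Smallest number of genes with minimal test error among 2:R.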