################################################################################
### Logistic regression and multinomial regression
################################################################################

### Standard estimation of a logistic regression model.
SAheart <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data",
                      sep = ",", header = TRUE, row.names = 1)
SAheart[1, ]
SAheartGlm <- glm(chd ~ ., data = SAheart, family = binomial)
summary(SAheartGlm)

### Below, we fit a multinomial model to the vowel data.
require(VGAM)     ### Contains an implementation of multinomial regression
require(ggplot2)
vowels <- read.table("http://www-stat-class.stanford.edu/%7Etibs/ElemStatLearn/datasets/vowel.train",
                     row.names = 1, header = TRUE, sep = ",")
vowelsVglm <- vglm(y ~ ., data = vowels, family = multinomial)
summary(vowelsVglm)

### The following plots make pair-wise comparisons of the linear functions used
### for discrimination in the multinomial logit model.
predictor <- predict(vowelsVglm)  ### The linear predictors on the training data
select <- function(i, j) vowels[, 1] %in% c(i, j)  ### Selects observations from groups i and j

### Scatter plot of linear predictors k and l for observations in groups i and j.
sepPlot <- function(i, j, k = i, l = j) {
  selected <- select(i, j)
  qplot(predictor[selected, k], predictor[selected, l],
        colour = as.factor(vowels[selected, 1]), shape = I(20)) +
    geom_abline(intercept = 0, slope = 1, colour = I("black")) +
    scale_colour_discrete("Group") + xlab("") + ylab("")
}
sepPlot(1, 4)
sepPlot(2, 4)

### Density of the difference between linear predictors k and l for groups i and j.
densPlot <- function(i, j, k = i, l = j) {
  selected <- select(i, j)
  qplot(predictor[selected, k] - predictor[selected, l], geom = "density",
        alpha = I(0.6), fill = as.factor(vowels[selected, 1])) +
    geom_vline(xintercept = 0, colour = "black") +
    scale_fill_discrete("Group") + xlab("") + ylab("")
}
densPlot(1, 4)
densPlot(2, 4)

################################################################################
### Penalized logistic regression and high-dimensional data
################################################################################

require(stepPlr)  ### Implements logistic regression with quadratic penalization
require(glmnet)   ### Elastic net penalty for glm models
require(ggplot2)
SRBCTxtrain <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.xtrain")
SRBCTytrain <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.ytrain")
SRBCTxtest <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.xtest")
SRBCTytest <- read.table("http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/datasets/khan.ytest")

### Elastic net -- with the default alpha = 1 this is the lasso -- using the
### multinomial model. The data files store samples as columns, hence the
### transposition.
SRBCTglmnet <- glmnet(t(SRBCTxtrain), as.factor(t(SRBCTytrain)), family = "multinomial")
par(mfcol = c(2, 2))
plot(SRBCTglmnet)  ### Coefficient profiles, one panel per class
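### The lasso path above is computed over a grid of lambda values, but nothing
### so far selects a penalty. As an illustrative sketch (not part of the
### original analysis), cross-validation with cv.glmnet can be used to choose
### lambda; the object name SRBCTcv and the choice of 5 folds are assumptions
### made here for the example.
SRBCTcv <- cv.glmnet(t(SRBCTxtrain), as.factor(t(SRBCTytrain)),
                     family = "multinomial", nfolds = 5)
plot(SRBCTcv)       ### Cross-validated deviance as a function of log(lambda)
SRBCTcv$lambda.min  ### The lambda value minimizing the cross-validation error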
### Computing the predictions on the training and test data and plotting the
### error curves as functions of lambda.
SRBCTpredictTrain <- predict(SRBCTglmnet, t(SRBCTxtrain), type = "class")
TrainErrors <- apply(apply(SRBCTpredictTrain, 2, "!=", SRBCTytrain), 2, sum, na.rm = TRUE)
qplot(SRBCTglmnet$lambda, TrainErrors, geom = "line")

SRBCTpredict <- predict(SRBCTglmnet, t(SRBCTxtest), type = "class")
TestErrors <- apply(apply(SRBCTpredict, 2, "!=", SRBCTytest), 2, sum, na.rm = TRUE)
qplot(SRBCTglmnet$lambda, TestErrors, geom = "line")

### Slightly nicer plot with both error curves in one figure.
require(reshape2)  ### Provides melt
Errors <- melt(data.frame(lambda = SRBCTglmnet$lambda,
                          TrainErrors = TrainErrors,
                          TestErrors = TestErrors),
               id = "lambda")
qplot(lambda, value, data = Errors, colour = variable, geom = "line")

### Extracting the non-null coefficients for s = 0.1.
SRBCTcoef <- coefficients(SRBCTglmnet, s = 0.1)
SRBCTcoef <- as.data.frame(lapply(SRBCTcoef, as.matrix))
SRBCTcoef[apply(abs(SRBCTcoef), 1, sum) > 0, ]
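### As a further illustrative sketch (not in the original script): the test-set
### confusion matrix for the penalty s = 0.1 used above. The object names
### SRBCTclass and SRBCTconfusion are assumptions made here for the example.
SRBCTclass <- predict(SRBCTglmnet, t(SRBCTxtest), type = "class", s = 0.1)
SRBCTconfusion <- table(predicted = SRBCTclass, observed = unlist(SRBCTytest))
SRBCTconfusion  ### Rows: predicted class; columns: observed class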