require(rpart)    ## For trees
require(gam)      ## Estimation of generalized additive models
require(ggplot2)
require(reshape2) ## For 'melt', used below to reshape data for ggplot2

### Illustration of the relation to naive Bayes on the genetic
### fingerprint data
load("prac7.RData")

### For comparison purposes we reorder the columns.
prac7Train <- prac7Train[, c(sort(names(prac7Train)[-16]), "population")]

### The logistic regression model
summary(glm(population ~ ., data = prac7Train, family = binomial))

### Using generalized additive models on the genetic fingerprint data
df <- 6
form <- paste("population ~",
              paste("s(", names(prac7Train)[-16], ",", df, ")",
                    sep = "", collapse = "+"))
form <- as.formula(form)
prac7Gam <- gam(form, data = prac7Train, family = binomial)
par(mfrow = c(4, 4), mar = c(4, 1, 1, 1))
plot(prac7Gam, se = TRUE)
summary(prac7Gam)

require(VGAM) ## Alternative to gam. More general; contains multinomial regression.
prac7Vgam <- vgam(form, data = prac7Train, family = binomialff)
par(mfrow = c(4, 4), mar = c(4, 1, 1, 1))
plot(prac7Vgam, se = TRUE)
summary(prac7Vgam)

### Extracting information to produce a similar plot with ggplot2
pred <- melt(predict(prac7Gam, type = "terms"))$value
termPredict <- cbind(data.frame(predict = pred),
                     melt(prac7Train[, -16]))
qplot(value, predict, data = termPredict, geom = "line",
      ylim = c(-10, 10)) +
  facet_wrap(~ variable, scales = "free_x")

### Automatic selection of tuning parameters via the package
### mgcv. There are conflicts with the gam and perhaps also the VGAM
### package, so we detach them. Alternatively, you can restart R to
### make a clean start.
detach(package:gam)
detach(package:VGAM)
require(mgcv)
form <- paste("population ~",
              paste("s(", names(prac7Train)[-16], ")",
                    sep = "", collapse = "+"))
form <- as.formula(form)

### The actual gam fit using the mgcv package. It uses more or less
### the same syntax as 'gam' from the gam package, but the formulae
### are a little different. For one, there are no degrees of freedom
### in the specification of the terms we smooth, because the tuning
### parameters are selected automatically. This is done by a
### simultaneous optimization of the likelihood over the parameter
### space and of a suitable error criterion over the tuning parameter
### space.
prac7Gam <- gam(form, data = prac7Train, family = binomial)
pred <- melt(predict(prac7Gam, type = "terms"))$value
termPredict <- cbind(data.frame(predict = pred),
                     melt(prac7Train[, -16]))
qplot(value, predict, data = termPredict, geom = "line",
      ylim = c(-10, 10)) +
  facet_wrap(~ variable, scales = "free_x")
summary(prac7Gam)

### Note that for those variables whose effective degrees of freedom
### equal 1.00 in the table above, the smoother has been reduced to a
### linear function.
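### A sketch (not in the original practical) of how one might assess
### the mgcv fit on the test data. It assumes that prac7Test, which
### is loaded from prac7.RData and used below, has the same columns
### as prac7Train and that 'population' is a two-level factor. Then
### predict(..., type = "response") returns fitted probabilities of
### the second factor level, which we threshold at 0.5.
pHat <- predict(prac7Gam, newdata = prac7Test, type = "response")
yHatGam <- factor(levels(prac7Train$population)[1 + (pHat > 0.5)],
                  levels = levels(prac7Train$population))
table(yHatGam, prac7Test$population)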
################################################################################
### Trees using 'rpart' - the R implementation of CART
################################################################################

### Simple example with the iris data set.
data(iris)
irisTree <- rpart(Species ~ ., data = iris)
plot(irisTree, margin = 0.1)
text(irisTree, use.n = TRUE)
printcp(irisTree)

### Then returning to the data from prac 7
prac7Tree <- rpart(population ~ ., data = prac7Train)
plot(prac7Tree, uniform = TRUE, margin = 0.1)
text(prac7Tree)

### Different choice of complexity parameter
prac7Tree <- rpart(population ~ ., data = prac7Train,
                   control = list(cp = 0.0001))
plot(prac7Tree, uniform = TRUE, margin = 0.1)
text(prac7Tree)
prac7Tree <- prune(prac7Tree, cp = 0.01)
plot(prac7Tree, uniform = TRUE, margin = 0.1)
text(prac7Tree)

### Computing the training error
yHat <- predict(prac7Tree, type = "class")
y <- prac7Train$population
pred <- table(yHat, y)
print(pred)
print(pred / sum(pred), digits = 3)

### Computing the test error
yHat <- predict(prac7Tree, prac7Test, type = "class")
yTest <- prac7Test$population
pred <- table(yHat, yTest)
print(pred)
print(pred / sum(pred), digits = 3)
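### A sketch (not part of the original script) of selecting the
### complexity parameter from the cross-validation results reported
### by printcp/plotcp, instead of fixing cp = 0.01 by hand. We regrow
### a large tree and prune back to the cp value minimizing the
### cross-validated error 'xerror'. Since 'xerror' is based on a
### random cross-validation split, the selected cp can vary between
### runs.
prac7TreeFull <- rpart(population ~ ., data = prac7Train,
                       control = list(cp = 0.0001))
plotcp(prac7TreeFull)
cpTab <- prac7TreeFull$cptable
cpOpt <- cpTab[which.min(cpTab[, "xerror"]), "CP"]
prac7TreeOpt <- prune(prac7TreeFull, cp = cpOpt)
yHatOpt <- predict(prac7TreeOpt, prac7Test, type = "class")
mean(yHatOpt != prac7Test$population) ## Test misclassification rate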