require(gam) #Estimation of generalized additive models require(ggplot2) ###Illustration of the relation to naive Bayes on the genetic fingerprint data load("~/courses/statlearn/statlearn_2009/R/Assignment1.RData") ###First we remove the outlier and turn all columns into factors with the same ###set of levels summary(Assignment1Train) Assignment1Train <- Assignment1Train[-230,] Assignment1Train <- Assignment1Train[!Assignment1Train[,16]=="Hispanic",] Assignment1Train[,16] <- as.factor(as.vector(Assignment1Train[,16])) allLevels <- levels(factor(unlist(Assignment1Train[,-16]))) Assignment1TrainMod <- Assignment1Train Assignment1TrainMod[,-16] <- data.frame(lapply(Assignment1Train[,-16],function(x) factor(x,levels=allLevels))) ###Here we compute the estimates of the marginal distributions counts <- lapply(Assignment1TrainMod[,-16],function(x) tapply(x,Assignment1TrainMod[,16],table)) h <- lapply(counts,function(x) lapply(x,function(x) x/sum(x))) epsilon <- 0.00001 #This is added in the computation of log-ratios to prevent 0/0 and log(0) choice <- melt(lapply(h,function(x) x[[1]] > 0 | x[[2]] > 0)) logit <- melt(lapply(h,function(x) log((x[[2]]+epsilon)/(x[[1]]+epsilon))))[choice$value,] logit$L1 <- factor(logit$L1,levels=names(Assignment1Train)[-16]) qplot(x=Var.1,y=value,data=logit,geom="line")+facet_wrap(~L1,scale="free_x") ###Compare with logistic regression summary(glm(population~.,data=Assignment1Train,family=binomial)) ###Using generalized additive models on the genetic fingerprint data df <- 6 form <- as.formula(paste("population~",paste("s(",names(Assignment1Train)[-16],",",df,")",sep="",collapse="+"))) Ass1Gam <- gam(form,data=Assignment1Train,family=binomial) par(mfcol=c(4,4)) plot(Ass1Gam,se=TRUE) summary(Ass1Gam) ###Extracting information to produce a similar plot with ggplot2 termPredict <- cbind(data.frame(predict=melt(predict(Ass1Gam,type="terms"))$value),melt(Assignment1Train[,-16],)) qplot(x=value,y=predict,data=termPredict,geom="line",ylim=c(-10,10)) + facet_wrap(~variable,scale="free_x") ###Automatic selection of tuning parameters via mgcv. You might need to restart R to make this work. require(mgcv) #Functions for estimation of generalized additive models and automatic selection of the smoothing parameter form <- as.formula(paste("population~",paste("s(",names(Assignment1Train)[-16],")",sep="",collapse="+"))) Ass1Gam <- gam(form,data=Assignment1Train,family=binomial) termPredict <- cbind(data.frame(predict=melt(predict(Ass1Gam,type="terms"))$value),melt(Assignment1Train[,-16],)) qplot(x=value,y=predict,data=termPredict,geom="line",ylim=c(-10,10)) + facet_wrap(~variable,scale="free_x") termPredict[termPredict$variable=="TPOX",] summary(Ass1Gam) plot(Ass1Gam)