require(gam)   #Estimation of generalized additive models
require(ggplot2)

###Illustration of the relation to naive Bayes on the genetic fingerprint data

load("~/courses/statlearn/statlearn_2009/R/Assignment1.RData")

###First we remove the outlier and turn all columns into factors with the same
###set of levels

summary(Assignment1Train)
Assignment1Train <- Assignment1Train[-230,]
Assignment1Train <- Assignment1Train[!Assignment1Train[,16]=="Hispanic",]
Assignment1Train[,16] <- as.factor(as.vector(Assignment1Train[,16]))
allLevels <- levels(factor(unlist(Assignment1Train[,-16])))
Assignment1TrainMod <- Assignment1Train
Assignment1TrainMod[,-16] <- data.frame(lapply(Assignment1Train[,-16],function(x) factor(x,levels=allLevels)))

###Here we compute the estimates of the marginal distributions

counts <- lapply(Assignment1TrainMod[,-16],function(x) tapply(x,Assignment1TrainMod[,16],table))
h <- lapply(counts,function(x) lapply(x,function(x) x/sum(x)))
epsilon <- 0.00001 #This is added in the computation of log-ratios to prevent 0/0 and log(0)
choice <- melt(lapply(h,function(x) x[[1]] > 0 | x[[2]] > 0))
logit <- melt(lapply(h,function(x) log((x[[2]]+epsilon)/(x[[1]]+epsilon))))[choice$value,]
logit$L1 <- factor(logit$L1,levels=names(Assignment1Train)[-16])
qplot(x=Var.1,y=value,data=logit,geom="line")+facet_wrap(~L1,scale="free_x")

###Compare with logistic regression

summary(glm(population~.,data=Assignment1Train,family=binomial))

###Using generalized additive models on the genetic fingerprint data

df <- 6
form <- as.formula(paste("population~",paste("s(",names(Assignment1Train)[-16],",",df,")",sep="",collapse="+")))
Ass1Gam <- gam(form,data=Assignment1Train,family=binomial)
par(mfcol=c(4,4))
plot(Ass1Gam,se=TRUE)
summary(Ass1Gam)

###Extracting information to produce a similar plot with ggplot2

termPredict <- cbind(data.frame(predict=melt(predict(Ass1Gam,type="terms"))$value),melt(Assignment1Train[,-16],))
qplot(x=value,y=predict,data=termPredict,geom="line",ylim=c(-10,10)) + facet_wrap(~variable,scale="free_x")

###Automatic selection of tuning parameters via mgcv. You might need to restart R to make this work. 
require(mgcv)  #Functions for estimation of generalized additive models and automatic selection of the smoothing parameter

form <- as.formula(paste("population~",paste("s(",names(Assignment1Train)[-16],")",sep="",collapse="+")))
Ass1Gam <- gam(form,data=Assignment1Train,family=binomial)
termPredict <- cbind(data.frame(predict=melt(predict(Ass1Gam,type="terms"))$value),melt(Assignment1Train[,-16],))
qplot(x=value,y=predict,data=termPredict,geom="line",ylim=c(-10,10)) + facet_wrap(~variable,scale="free_x")
termPredict[termPredict$variable=="TPOX",]
summary(Ass1Gam)
plot(Ass1Gam)