>--- title: "Statistical Learning Examples" author: "Luke Tierney" output: html_document --- ```{r, include = FALSE} knitr::opts_chunk$set(collapse = TRUE, message = FALSE) set.seed(1234) ``` ## Tree Models A regression tree for the trade union membership data: ```{r} data(trade.union, package = "SemiPar") library(rpart) trade.union$member.fac <- as.factor(ifelse(trade.union$union.member, "yes", "no")) fit <- rpart(member.fac ~ wage + age + years.educ, data = trade.union) fit ``` ```{r, fig.height = 9} plot(fit) text(fit, use.n = TRUE) ``` ## California Air Pollution Data ```{r} data(calif.air.poll, package = "SemiPar") ``` The features vary in scale: ```{r} summary(calif.air.poll[-1]) ``` Some methods may do better workign with scaled features: ```{r} calif.scaled <- scale(calif.air.poll[-1]) calif.scaled.df <- transform(calif.scaled, ozone.level = calif.air.poll$ozone.level) ``` A prediction function to use with fits based on rescaled features: ```{r} ca.predict <- function(fit, X, ...) { X.scaled <- scale(X, attr(calif.scaled, "scaled:center"), attr(calif.scaled, "scaled:scale")) X.scaled <- as.data.frame(X.scaled) predict(fit, X.scaled, ...) } ``` ### Spline Models Fit an additive model to the data and compute the mean square prediction error for the test data: ```{r} library(mgcv) ca.gam <- function(train = TRUE) gam(ozone.level ~ s(daggett.pressure.gradient) + s(inversion.base.height) + s(inversion.base.temp), data = calif.air.poll[train,]) ca.gam.mse <- function(fit, test = TRUE) { y <- calif.air.poll$ozone.level[test] yhat <- predict(fit, calif.air.poll[test,]) mean((y - yhat) ^ 2) } fit <- ca.gam() ca.gam.mse(fit) ``` Visualizing the fit: ```{r} np <- 50 dpg <- seq(-60, 100, len = np) ibh <- seq(200, 4000, len = np) gg <- expand.grid(daggett.pressure.gradient = dpg, inversion.base.height = ibh, inversion.base.temp = c(60, 80)) library(lattice) wireframe(predict(fit, gg) ~ daggett.pressure.gradient * inversion.base.height, group = inversion.base.temp, data = gg, auto.key = TRUE) wireframe(predict(fit, gg) ~ daggett.pressure.gradient * inversion.base.height | inversion.base.temp, data = gg) ``` A thin-plate spline fit to all three predictors: ```{r} ca.gam3 <- function(train) gam(ozone.level ~ s(daggett.pressure.gradient, inversion.base.height, inversion.base.temp), data = calif.air.poll[train,]) fit3 <- ca.gam3() ca.gam.mse(fit3) ``` The fit surface: ```{r} wireframe(predict(fit3, gg) ~ daggett.pressure.gradient * inversion.base.height, group = inversion.base.temp, data = gg, auto.key = TRUE) wireframe(predict(fit3, gg) ~ daggett.pressure.gradient * inversion.base.height | inversion.base.temp, data = gg) ``` Thin plate spline fit with re-scaled features: ```{r} ca.gam3s <- function(train) gam(ozone.level ~ s(daggett.pressure.gradient, inversion.base.height, inversion.base.temp), data = calif.scaled.df[train,]) ca.gam3s.mse <- function(fit, test = TRUE) { y <- calif.air.poll$ozone.level[test] yhat <- ca.predict(fit, calif.air.poll[test, -1]) mean((y - yhat) ^ 2) } fit3s <- ca.gam3s() ca.gam3s.mse(fit3s) ``` The fit surface: ```{r} wireframe(ca.predict(fit3s, gg) ~ daggett.pressure.gradient * inversion.base.height, group = inversion.base.temp, data = gg, auto.key = TRUE) wireframe(ca.predict(fit3s, gg) ~ daggett.pressure.gradient * inversion.base.height | inversion.base.temp, data = gg) ``` ### A Tree Model Fit a tree to the data using all predictors: ```{r} library(rpart) ca.tree <- function(train = TRUE) rpart(ozone.level ~ ., data = calif.air.poll[train,]) ca.tree.mse <- 
### A Tree Model

Fit a tree to the data using all predictors:

```{r}
library(rpart)
ca.tree <- function(train = TRUE)
    rpart(ozone.level ~ ., data = calif.air.poll[train,])
ca.tree.mse <- function(fit, test = TRUE) {
    y <- calif.air.poll$ozone.level[test]
    yhat <- predict(fit, calif.air.poll[test,])
    mean((y - yhat) ^ 2)
}
tree.ca <- ca.tree()
ca.tree.mse(tree.ca)
```

The fit surface:

```{r}
wireframe(predict(tree.ca, gg) ~
              daggett.pressure.gradient * inversion.base.height,
          groups = inversion.base.temp, data = gg, auto.key = TRUE)
wireframe(predict(tree.ca, gg) ~
              daggett.pressure.gradient * inversion.base.height |
              inversion.base.temp,
          data = gg)
```

### Bagging

Use bagging to fit the data:

```{r}
library(randomForest)
ca.bag <- function(train = TRUE)
    randomForest(ozone.level ~ ., data = calif.air.poll[train,],
                 mtry = ncol(calif.air.poll) - 1)
ca.bag.mse <- function(fit, test = TRUE) {
    y <- calif.air.poll$ozone.level[test]
    yhat <- predict(fit, calif.air.poll[test,])
    mean((y - yhat) ^ 2)
}
bag.ca <- ca.bag()
ca.bag.mse(bag.ca)
```

The fit surface:

```{r}
wireframe(predict(bag.ca, gg) ~
              daggett.pressure.gradient * inversion.base.height,
          groups = inversion.base.temp, data = gg, auto.key = TRUE)
wireframe(predict(bag.ca, gg) ~
              daggett.pressure.gradient * inversion.base.height |
              inversion.base.temp,
          data = gg)
```

### Random Forest

Fit a random forest:

```{r}
ca.rf <- function(train = TRUE)
    randomForest(ozone.level ~ ., data = calif.air.poll[train,])
ca.rf.mse <- function(fit, test = TRUE) {
    y <- calif.air.poll$ozone.level[test]
    yhat <- predict(fit, calif.air.poll[test,])
    mean((y - yhat) ^ 2)
}
rf.ca <- ca.rf()
ca.rf.mse(rf.ca)
```

The fit surface:

```{r}
wireframe(predict(rf.ca, gg) ~
              daggett.pressure.gradient * inversion.base.height,
          groups = inversion.base.temp, data = gg, auto.key = TRUE)
wireframe(predict(rf.ca, gg) ~
              daggett.pressure.gradient * inversion.base.height |
              inversion.base.temp,
          data = gg)
```
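The random forest fit also provides a measure of variable importance; for
a regression forest, `IncNodePurity` is the total decrease in the residual
sum of squares from splits on each variable, averaged over the trees:

```{r}
## Variable importance for the random forest fit, as a table and a plot.
importance(rf.ca)
varImpPlot(rf.ca)
```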
### Gradient Boosting

This uses `gbm` from the `gbm` package to fit boosted regression trees. The
first approach fits an additive model; the second allows two-way
interactions.

```{r}
library(gbm)
n.trees <- 100
ca.boost <- function(train = TRUE)
    gbm(ozone.level ~ ., data = calif.air.poll[train,],
        n.trees = n.trees, distribution = "gaussian")
ca.boost.mse <- function(fit, test = TRUE) {
    y <- calif.air.poll$ozone.level[test]
    yhat <- predict(fit, calif.air.poll[test,], n.trees = n.trees)
    mean((y - yhat) ^ 2)
}
ca.boost2 <- function(train = TRUE)
    gbm(ozone.level ~ ., data = calif.air.poll[train,],
        n.trees = n.trees, distribution = "gaussian",
        interaction.depth = 2)
boost.ca <- ca.boost()
ca.boost.mse(boost.ca)
boost.ca2 <- ca.boost2()
ca.boost.mse(boost.ca2)
```

The fit surfaces:

```{r}
wireframe(predict(boost.ca, gg, n.trees = n.trees) ~
              daggett.pressure.gradient * inversion.base.height,
          groups = inversion.base.temp, data = gg, auto.key = TRUE)
wireframe(predict(boost.ca, gg, n.trees = n.trees) ~
              daggett.pressure.gradient * inversion.base.height |
              inversion.base.temp,
          data = gg)
wireframe(predict(boost.ca2, gg, n.trees = n.trees) ~
              daggett.pressure.gradient * inversion.base.height,
          groups = inversion.base.temp, data = gg, auto.key = TRUE)
wireframe(predict(boost.ca2, gg, n.trees = n.trees) ~
              daggett.pressure.gradient * inversion.base.height |
              inversion.base.temp,
          data = gg)
```

### Cross-Validation

Use 10-fold cross-validation to assess the performance of the fits:

```{r, include = FALSE}
set.seed(54321)
```

```{r}
cvsplit <- function(n, k) {
    x <- seq_len(n)
    brk <- quantile(x, seq(0, 1, length.out = k + 1))
    y <- sample(n, n)
    structure(split(y, cut(x, brk, include.lowest = TRUE)), names = NULL)
}
cv <- cvsplit(nrow(calif.air.poll), 10)
```

```{r}
mse_gam <- sapply(cv, function(test) ca.gam.mse(ca.gam(-test), test))
mse_gam3 <- sapply(cv, function(test) ca.gam.mse(ca.gam3(-test), test))
mse_gam3s <- sapply(cv, function(test) ca.gam3s.mse(ca.gam3s(-test), test))
mse_tree <- sapply(cv, function(test) ca.tree.mse(ca.tree(-test), test))
mse_bag <- sapply(cv, function(test) ca.bag.mse(ca.bag(-test), test))
mse_rf <- sapply(cv, function(test) ca.rf.mse(ca.rf(-test), test))
mse_boost <- sapply(cv, function(test) ca.boost.mse(ca.boost(-test), test))
mse_boost2 <- sapply(cv, function(test) ca.boost.mse(ca.boost2(-test), test))
mse <- data.frame(gam = mse_gam, gam3 = mse_gam3, gam3s = mse_gam3s,
                  tree = mse_tree, bag = mse_bag, rf = mse_rf,
                  boost = mse_boost, boost2 = mse_boost2,
                  sample = seq_along(mse_gam))
library(tidyr)
mse <- gather(mse, which, mse, -sample)
library(dplyr)
summarize(group_by(mse, which), mse = mean(mse))
knitr::kable(summarize(group_by(mse, which),
                       avg_mse = mean(mse),
                       se = sd(mse) / sqrt(n())),
             digits = 2, format = "html") %>%
    kableExtra::kable_styling(full_width = FALSE)
```

## Some Neural Network Tinkering

```{r, eval = FALSE}
library(neuralnet)
M <- 5
x <- seq(0, 1, len = 101)
d3 <- data.frame(x, y = sin(x * 4 * pi))
nn3 <- neuralnet(y ~ x, d3, hidden = M)
plot(y ~ x, data = d3, type = "l")
title(expression(f(x) == sin(4 * pi * x)))
lines(x, predict(nn3, d3), col = "red")
plot(nn3, rep = 1)
text(0, 1, expression(bold(f(x) == sin(4 * pi * x))), cex = 1.5)
```

```{r, include = FALSE}
ReLU <- function(x) ifelse(x > 0, x, 0)
```

```{r, eval = FALSE}
library(neuralnet)
nnfit <- neuralnet(ozone.level ~ ., data = calif.scaled.df,
                   hidden = 4, rep = 5)
best <- which.min(nnfit$result.matrix["error",])
wireframe(ca.predict(nnfit, gg, rep = best) ~
              daggett.pressure.gradient * inversion.base.height,
          groups = inversion.base.temp, data = gg, auto.key = TRUE)
plot(nnfit, rep = "best")
```
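A sketch of a matching MSE helper for the neural network fit, in the style
of the `ca.*.mse` functions above; like the other `neuralnet` chunks it is
not run here, and it assumes `predict` for `nn` objects accepts a `rep`
argument, as in recent `neuralnet` versions:

```{r, eval = FALSE}
## MSE helper for the neural network fit, mirroring ca.gam3s.mse; the rep
## argument is passed through ca.predict to predict() for nn objects.
ca.nn.mse <- function(fit, test = TRUE, ...) {
    y <- calif.air.poll$ozone.level[test]
    yhat <- ca.predict(fit, calif.air.poll[test, -1], ...)
    mean((y - yhat) ^ 2)
}
ca.nn.mse(nnfit, rep = best)
```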