--- title: "More Markov Chain Monte Carlo Examples" author: "Luke Tierney" institute: "University of Iowa" date: "`r Sys.Date()`" output: xaringan::moon_reader: lib_dir: libs css: [default, default-fonts, "myslides.css"] nature: ratio: '16:9' highlightStyle: github highlightLines: true countIncrementalSlides: false titleSlideClass: [center, middle] --- ```{r setup, include = FALSE} options(htmltools.dir.version = FALSE) library(ggplot2) knitr::opts_chunk$set(collapse = TRUE, fig.height = 5, fig.width = 6) library(lattice) library(tidyverse) theme_set(theme_minimal() + theme(text = element_text(size = 16)) + theme(panel.border = element_rect(color = "grey30", fill = NA))) set.seed(12345) ``` ## Gibbs Sampler for Pump Failure Data The data: ```{r} fail <- c(5, 1, 5, 14, 3, 19, 1, 1, 4, 22) time <- c(94.32, 15.72, 62.88, 125.76, 5.24, 31.44, 1.05, 1.05, 2.10, 10.48) ``` -- Hyper-parameters for the prior distribution: ```{r} alpha <- 1.8 gamma <- 0.01 delta <- 1 ``` --- ## Gibbs Sampler for Pump Failure Data Generator for the full conditional for $\lambda_i, i = 1, \dots, 10$: ```{r} genLambda <- function(alpha, beta) rgamma(10, fail + alpha, rate = time + beta) ``` -- Generator for the full conditional for $\beta$: ```{r} genBeta <- function(alpha, lambda) rgamma(1, 10 * alpha + gamma, rate = delta + sum(lambda)) ``` --- ## Gibbs Sampler for Pump Failure Data Metropolis-Hastings sampler for the full conditional for $\alpha$ (for `K = 0` the value of `alpha` is unchanged): ```{r} genAlpha <- function(alpha, beta, lambda, K, d) { b <- (10 * log(beta) + sum(log(lambda)) - 1) for (j in seq_len(K)) { newAlpha <- exp(rnorm(1, log(alpha), d)) logDensRatio <- ((newAlpha - alpha) * b + log(newAlpha) - log(alpha) + 10 * (lgamma(alpha) - lgamma(newAlpha))) if (is.finite(logDensRatio) && log(runif(1)) < logDensRatio) alpha <- newAlpha } alpha } ``` --- ## Gibbs Sampler for Pump Failure Data Driver for running the sampler: ```{r} pump <- function(alpha, beta, N, d = 1, K = 1) { v <- matrix(0, ncol=12, nrow=N) for (i in 1:N) { lambda <- genLambda(alpha, beta) beta <- genBeta(alpha, lambda) alpha <- genAlpha(alpha, beta, lambda, K, d) v[i,1:10] <- lambda v[i,11] <- beta v[i,12] <- alpha } v } ``` --- ## Alternative Samplers The joint marginal posterior density of $\alpha$ and $\beta$ is \begin{equation*} f(\alpha, \beta | t_i, x_i) \propto \frac{\prod_{i=1}^{10} \Gamma(x_i + \alpha)} {\Gamma(\alpha)^{10}\prod_{i=1}^{10} (t_i + \beta)^{x_i + \alpha}} \beta^{10 \alpha + \gamma - 1} e^{-\delta\beta}e^{-\alpha} \end{equation*} -- Parallel and simulated tempering can be used to improve on the basic sampler implemented in the `pump` function. -- I will use a normal approximation to this marginal posterior distribution as the as the _hot_ target distribution. --- ## Parallel Tempering A function to evaluate the logarithm of the joint marginal density: ```{r} lab <- function(alpha, beta, gamma = 0.01, delta = 1) sum(lgamma(fail + alpha)) + (10 * alpha + gamma - 1) * log(beta) - delta * beta - alpha - 10* lgamma(alpha) - sum((fail + alpha) * log(time + beta)) labV <- function(ab) lab(ab[1], ab[2]) ``` -- Use `optim` to find the mode and the Hessian at the mode: ```{r} opt <- optim(c(0.5, 0.5), labV, method = "BFGS", hessian = TRUE, control = list(fnscale = -1)) ``` --- ## Parallel Tempering A function to generate from a normal approximation to the posterior density: ```{r} R <- chol(solve(-opt$hessian)) m <- opt$par genN <- function() as.numeric(rnorm(2) %*% R + m) ``` -- A function to evaluate the log density of this approximation: ```{r} labnV <- function(ab) { z <- solve(t(R), ab - m) - 0.5 * sum(z * z) } labn <- function(a, b) labnV(c(a, b)) ``` --- ## Parallel Tempering The approximation is not bad: ```{r, echo = FALSE, fig.width = 10} g <- expand.grid(a = seq(0, 1.5, len = 30), b = seq(0, 2, len = 30)) g$z1 <- mapply(lab, g$a, g$b) - lab(m[1], m[2]) g$z2 <- mapply(labn, g$a, g$b) library(lattice) contourplot(exp(z1) + exp(z2) ~ a * b, data = g, strip = strip.custom(factor.levels = c("Posterior Density", "Normal Approximation"))) ``` -- A somewhat better approximation could be obtained by taking logarithms of the parameters and approximating on the log scale. --- ## Parallel Tempering I will use parallel tempering with two chains. -- * The target distributions are the posterior density as $f_1$, and the normal approximation as $f_2$. -- * The `pump` sampler is used for sampling $f_1$. The approximation $f_2$ is sampled independently; this is the _hot_ chain. -- * A switch is attempted at each iteration. -- An equivalent formulation: -- * At each iteration generate a new value from the `pump` chain and an independent draw from the normal approximation. -- * Compute the M-H acceptance probability for swapping the two values and swap or not with that probability. --- ## Parallel Tempering The probability for accepting a swap $(\theta_2, \theta_1)$ or keeping the original pair $(\theta_1, \theta_2)$ is $$ \min\left(\frac{f_1(\theta_2) f_2(\theta_1)}{f_1(\theta_1) f_2(\theta_2)}, 1\right) $$ -- A function for computing this probability: ```{r} aprob <- function(v1, v2) { if (min(v1, v2) > 0) { lr <- (labV(v2) + labnV(v1)) - (labV(v1) + labnV(v2)) if (is.finite(lr)) exp(min(lr, 0)) else 0 } else 0 } ``` --- ## Parallel Tempering A function to implements the sampler. .small-code[ ```{r} pumpPT <- function(alpha, beta, N, d = 1, K = 1) { v <- matrix(0, ncol = 4, nrow = N) for (i in 1:N) { ab1 <- pump(alpha, beta, 1, d, K)[12:11] ab2 <- genN() p <- aprob(ab1, ab2) U <- runif(1) if (U < p) ab <- ab2 else ab <- ab1 alpha <- ab[1] beta <- ab[2] v[i, 1] <- alpha v[i, 2] <- beta v[i, 3] <- p v[i, 4] <- U < p } v } ``` ] -- The result returned also records the acceptance probabilities and whether a swap occurred. --- ## Parallel Tempering .pull-left[ Runs with the standard sampler and the parallel tempering version: ```{r} N <- 10000 st <- system.time(v <- pump(0.5, 0, N)[,12:11]) st2 <- system.time(v2 <- pumpPT(0.5, 0.5, N)) ``` Some plots: ] .pull-right[ ```{r, echo = FALSE, fig.width = 8, fig.height = 8} opar <- par(mfrow = c(3, 2)) plot(density(v[, 1]), main = expression(alpha)) lines(density(v2[, 1]), col = "red") plot(density(v[, 2]), main = expression(beta)) lines(density(v2[, 2]), col = "red") acf(v[, 1], main = expression(paste(bold("Original "), alpha))) acf(v[, 2], main = expression(paste(bold("Original "), beta))) acf(v2[, 1],main = expression(paste(bold("Simulated Tempering "), alpha))) acf(v2[, 2], main = expression(paste(bold("Simulated Tempering "), beta))) par(opar) ``` ] ```{r, echo = FALSE} ## lag-1 auto-correlations r <- acf(v[, 1], plot = FALSE)$acf[2] r2 <- acf(v2[, 1], plot = FALSE)$acf[2] ## efficiencies based on a simple AR(1) model e <-(1 - r ^ 2) / (1 + r ^ 2) e2 <-(1 - r2 ^ 2) / (1 + r2 ^ 2) ``` --- ## Parallel Tempering Based on a simple AR(1) model, the parallel tempering sampler is about `r round(e2 / e, 1)` times more efficient, in terms of the number of effective observations produced per iteration, than the original sampler. -- But in the current implementation it is also about `r round(st2[1] / st[1], 1)` times slower. -- The distribution of the number of iterations until the next accepted swap: ```{r, echo = FALSE} s <- diff(which(v2[,4] == 1)) plot(prop.table(table(s))) ``` --- ## Simulated Tempering Simulated tempering can also be used with a _hot_ chain independently sampling from the normal approximation and a cold chain sampling with `pump`. -- Suppose the hot target density $f_2$ and the cold target density $f_1$ are both normalized to equal one at their common mode. -- The probability of accepting a move from the cold chain at $\theta$ to the hot chain at $\theta$ is $$ \min\left(\frac{a f_2(\theta)}{f_1(\theta)}, 1\right). $$ -- The probability of accepting a move from hot chain to the cold chain is the inverse of this value. -- The parameter $a$ governs how much time is spent in the hot chain relative to the cold chain; larger values of $a$ mean more time spent in the hot chain. --- ## Simulated Tempering .pull-left[ This function accumulates a fixed number of steps from the cold chain. Steps from the hot chain are discarded, but the number of steps spent in the hot chain before each cold chain draw is recorded. The argument `off` corresponds to the logarithm of $a$. ] .pull-right[ .tiny-code[ ```{r} pumpST <- function(alpha, beta, N, d = 1, K = 1, off = 0) { v <- matrix(0, ncol = 3, nrow = N) for (i in 1:N) { ab <- pump(alpha, beta, 1, d, K)[12:11] lr <- labnV(ab) + off - (labV(ab) - labV(m)) k <- 0 if (is.finite(lr) && runif(1) < exp(lr)) { repeat { k <- k + 1 ab <- genN() if (min(ab) <= 0) next lr <- (labV(ab) - labV(m)) - labnV(ab) - off if (is.finite(lr) && runif(1) < exp(lr)) break } } alpha <- ab[1] beta <- ab[2] v[i, 1] <- alpha v[i, 2] <- beta v[i, 3] <- k } v } ``` ] ] --- ## Simulated Tempering Run the samplers: ```{r} st <- system.time(v3_0 <- pump(0.5, 0.5, N)) st_0 <- system.time(v3_0 <- pumpST(0.5, 0.5, N, off = 0)) st_1 <- system.time(v3_1 <- pumpST(0.5, 0.5, N, off = 1)) st_2 <- system.time(v3_2 <- pumpST(0.5, 0.5, N, off = 2)) ``` The estimated marginal densities: ```{r, echo = FALSE, fig.width = 10, fig.height = 4} opar <- par(mfrow = c(1, 2)) plot(density(v[, 1]), main = expression(alpha)) lines(density(v3_0[, 1]), col = "red") lines(density(v3_1[, 1]), col = "green") lines(density(v3_2[, 1]), col = "blue") plot(density(v[, 2]), main = expression(beta)) lines(density(v3_0[, 2]), col = "red") lines(density(v3_1[, 2]), col = "green") lines(density(v3_2[, 2]), col = "blue") par(opar) ``` --- ## Simulated Tempering Auto-correlation functions for $\alpha$: ```{r, echo = FALSE, fig.width = 10, fig.height = 7} opar <- par(mfrow = c(2, 2)) acf(v[,1], main = "Original, K = 1") acf(v3_0[,1], main = "Simulated Tempering, off = 0") acf(v3_1[,1], main = "Simulated Tempering, off = 1") acf(v3_2[,1], main = "Simulated Tempering, off = 2") par(opar) ``` --- ## Simulated Tempering ```{r, echo = FALSE} ra <- c(acf(v[, 1], plot = FALSE)$acf[2], acf(v3_0[, 1], plot = FALSE)$acf[2], acf(v3_1[, 1], plot = FALSE)$acf[2], acf(v3_2[, 1], plot = FALSE)$acf[2]) rb <- c(acf(v[, 2], plot = FALSE)$acf[2], acf(v3_0[, 2], plot = FALSE)$acf[2], acf(v3_1[, 2], plot = FALSE)$acf[2], acf(v3_2[, 2], plot = FALSE)$acf[2]) d <- data.frame(tm = c(st[[1]], st_0[[1]], st_1[[1]], st_2[[1]]), nm = c("Original, K = 1", paste("Simulated Tempering, off =", 0:2)), th = c(0, 1 - nrow(v3_0) / (nrow(v3_0) + sum(v3_0[, 3])), 1 - nrow(v3_1) / (nrow(v3_1) + sum(v3_0[, 3])), 1 - nrow(v3_2) / (nrow(v3_2) + sum(v3_2[, 3]))), ra = ra, ea = (1 - ra ^ 2) / (1 + ra ^ 2), eb = (1 - rb ^ 2) / (1 + rb ^ 2)) d <- transform(d, ne_cpu_a = N * ea / tm, ne_cpu_b = N * eb / tm) ``` Some performance measures: ```{r, echo = FALSE, fig.width = 12, fig.height = 6.5} b1 <- barchart(nm ~ tm, xlim = c(0, max(d$tm) * 1.05), data = d, main = "User Time", xlab = "Time (seconds)") b2 <- barchart(nm ~ th, xlim = c(0, 1), data = d, main = "Proportion of Time in Hot Chain", xlab = "Proportion") br_a <- barchart(nm ~ ra, data = d, xlim = c(0, 1), main = expression(paste(bold("Lag-1 Auto-Correlation for "), alpha))) br_b <- barchart(nm ~ rb, data = d, xlim = c(0, 1), main = expression(paste(bold("Lag-1 Auto-Correlation for "), beta))) be_a <- barchart(nm ~ ea, data = d, xlim = c(0, 1), main = expression(paste(bold("Iteration Efficiency for "), alpha))) be_b <- barchart(nm ~ eb, data = d, xlim = c(0, 1), main = expression(paste(bold("Iteration Efficiency for "), beta))) gridExtra::grid.arrange(b1, br_a, be_a, b2, br_b, be_b, ncol = 3) ``` --- ## Simulated Tempering Number of effective samples per CPU second _for the current implementations_: ```{r, echo = FALSE, fig.width = 10, fig.height = 5} b_ne_cpu_a <-barchart(nm ~ ne_cpu_a, data = d, xlim = c(0, 1.05 * max(d$ne_cpu_a)), main = expression(paste(N[E], bold(" per CPU Second for "), alpha))) b_ne_cpu_b <-barchart(nm ~ ne_cpu_b, data = d, xlim = c(0, 1.05 * max(d$ne_cpu_b)), main = expression(paste(N[E], bold(" per CPU Second for "), beta))) gridExtra::grid.arrange(b_ne_cpu_a, b_ne_cpu_b, ncol = 2) ``` --- ## Regeneration For the simple simulated tempering scheme used here, each time the process moves to the hot chain is a regeneration time. -- For the chain returned by `pumpST` this means all entries with a positive hot chain count are regeneration s. -- Distributions of times until the next regeneration: ```{r, echo = FALSE, fig.width = 12} t0 <-diff(which(v3_0[, 3] > 0)) t1 <- diff(which(v3_1[, 3] > 0)) t2 <- diff(which(v3_2[, 3] > 0)) xlims <- c(1, 1.05 * max(t0, t1, t2)) opar <- par(mfrow = c(1, 3)) plot(prop.table(table(t0)), xlim = xlims, ylim = c(0, 1), main = "off = 0") plot(prop.table(table(t1)), xlim = xlims, ylim = c(0, 1), main = "off = 1") plot(prop.table(table(t2)), xlim = xlims, ylim = c(0, 1), main = "off = 2") par(opar) ``` --- ## Regeneration A function to compute the standard error using regenerations. ```{r} rse <- function(X, regen) { regen[1] <- TRUE # mark the first observation as a tour start T <- which(regen) # tour starts T <- c(T, length(X) + 1) # add end of final, incomplete, tour tour <- cumsum(regen) # index for the tours N <- diff(T) Y <- tapply(X, tour, sum) tau <- mean((Y - mean(X) * N) ^ 2) / mean(N) ^ 2 sqrt(tau / length(X)) } ``` Standard error estimates for the posterior mean of $\alpha$: ```{r, echo = 0} library(coda) ar1se <- function(X) { r <- acf(X, plot = FALSE)$acf[2] sd(X) * sqrt((1 + r ^ 2) / (1 - r ^ 2) / length(X)) } se_ts <- c(spectrum0(v3_0[, 1])[[1]], spectrum0(v3_1[, 1])[[1]], spectrum0(v3_2[, 1])[[1]]) / sqrt(N) se_ar1 <- c(ar1se(v3_0[, 1]), ar1se(v3_1[, 1]), ar1se(v3_2[, 1])) se_regen <- c(rse(v3_0[, 1], v3_0[, 3] > 0), rse(v3_1[, 1], v3_1[, 3] > 0), rse(v3_2[, 1], v3_2[, 3] > 0)) tours <- c(sum(v3_0[, 3] > 0), sum(v3_1[, 3] > 0), sum(v3_2[, 3] > 0)) d <- data.frame("AR(1) SE" = se_ar1, "Time-series SE" = se_ts, "Regeneration SE" = se_regen, "Num. Tours" = tours, check.names = FALSE, row.names = c("off = 0", "off = 1", "off = 2")) knitr::kable(d, digits = 5) ```