--- title: "Simulation Examples" author: "Luke Tierney" output: html_document: toc: yes --- ```{r, include = FALSE} set.seed(1234) ``` ```{r global_options, include=FALSE} knitr::opts_chunk$set(collapse = TRUE, fig.align = "center") options(rgl.useNULL=TRUE) knitr::knit_hooks$set(webgl = rgl::hook_webgl) ``` ## Using `/dev/random/` ```{r, eval = FALSE} devRand <- file("/dev/random", open="rb") U <- function() (as.double(readBin(devRand, "integer"))+2^31) / 2^32 x <-numeric(1000) for (i in seq(along=x)) x[i] <- U() y <- numeric(1000) for (i in seq(along=x)) y[i] <- U() close(devRand) hist(x) plot(x,y) ``` ## Randu Using the `randu` data set: ```{r, webgl = TRUE} library(rgl) points3d(randu) par3d(FOV=1) ## removes perspective distortion ``` With a larger number of points: ```{r, webgl = TRUE} seed <- as.double(1) RANDU <- function() { seed <<- ((2^16 + 3) * seed) %% (2^31) seed/(2^31) } U <- matrix(replicate(10000 * 3, RANDU()), ncol = 3, byrow = TRUE) clear3d() points3d(U) par3d(FOV=1) ``` ## Low Order Bits Mersenne-Twister, the current default: ```{r} n <- 1000 x <- runif(n) y <- runif(n) plot((2^25 * x) %% 1, (2^25 * y) %% 1) plot((2^30 * x) %% 1, (2^30 * y) %% 1) ``` Wichmann-Hill: ```{r} RNGkind(kind = "Wichmann-Hill") n <- 10000 x <- runif(n) y <- runif(n) plot((2^50 * x) %% 1, (2^50 * y) %% 1) plot((2^50 * (1 - x)) %% 1, (2^50 * (1 - y)) %% 1) ``` ## Sampling a Large Population A basic uniform may not have enough good low order random bits: ```{r} RNGkind(kind = "Mersenne-Twister") table(floor(2^33 * runif(1000)) %% 10) ``` The `sample` function protects against this: ```{r} table(sample(2^33, 1000, replace = TRUE) %% 10) ``` Using higher precision uniforms, as used for generating normals by inverison: ```{r} runif53 <- function(n) { BIG <- 134217728 ## 2^27 u1 <- runif(n) (floor(BIG * u1) + runif(n)) / BIG } table(floor(2^33 * runif53(1000)) %% 10) ``` ## Alias method R version on Ripley's Algorithm 3.13B, which is used by the R `sample` function: ```{r} ## Ripley alias <- function(p) { M <- length(p) w <- numeric(M) nn <- 0 np <- M + 1 Q <- M * p A <- integer(M) for (i in 1 : M) if (Q[i] < 1) { nn <- nn + 1 w[nn] <- i } else { np <- np - 1 w[np] <- i } for (step in 1 : (M - 1)) { i <- w[step] j <- w[np] A[i] <- j Q[j] <- Q[j] + Q[i] - 1 if (Q[i] < 1) np <- np + 1 if (np >= M) break } list(Q = Q, A = A) } ``` R version of Ripley Algorithm 3.13, with some simplifications: ```{r} alias <- function(p) { M <- length(p) A <- integer(M) Q <- p * M repeat { i <- which(Q < 1 & A == 0)[1] j <- which(Q > 1 & A == 0)[1] if (is.na(i) || is.na(j)) break A[i] <- j Q[j] <- Q[j] - (1 - Q[i]) } list(Q = Q, A = A) } ``` Reconstruct the probabilities from the alias data: ```{r} unalias <- function(d) { M <- length(d$A) p <- d$Q / M for (i in 1 : M) { k <- d$A[i] if (k > 0) p[k] <- p[k] + (1 - d$Q[i]) / M } p } ``` Simulate by the alias method: ```{r} gen <- function(n, p) { d <- alias(p) M <- length(p) k <- sample(M, n, replace = TRUE) ifelse(runif(n) < d$Q[k], k, d$A[k]) } ``` Alias computations for a Binomial distribution with $n = 10$ and $p = 0.5$: ```{r} p <- dbinom(0:10, 10, 0.5) d <- alias(p) max(abs(unalias(d) - p)) ``` A plot of the alias data: ```{r, echo = FALSE} library(ggplot2) theme_set(theme_minimal() + theme(text = element_text(size = 16))) ggplot(as.data.frame(d), aes(x = factor(seq_along(Q)), y = Q, label = ifelse(d$A > 0, d$A, NA))) + geom_col(color = "black", fill = NA, width = 1) + geom_text(vjust = "bottom", nudge_y = 0.01, na.rm = TRUE, size = 7) + xlab("Index") ## plot(d$Q, type = "s") ## text(seq_along(d$Q) + 0.5, d$Q + 0.05, label = ifelse(d$A > 0, d$A, NA)) ``` Simulated data: ```{r} x <- gen(100000, p) plot(table(x) / length(x)) points(seq_along(p), p, col = "red") ``` ## Rejection sampling ### Normal Distribution with Cauchy Envelope Area under the standard normal density: ```{r} x <- seq(-3, 3, len = 201) y <- dnorm(x) plot(x, y, type = "l") polygon(c(x, 3, -3), c(y, 0, 0), col = "grey") ``` A Cauchy envelope: ```{r} M <- 1.53 pnc <- function() { plot(x, M * dcauchy(x), type = "l", ylim = c(0, .5), ylab = "Density") polygon(c(x, 3, -3), c(M * dcauchy(x), 0, 0), col = "light grey", border = NA) polygon(c(x, 3, -3), c(y, 0, 0), col = "grey", border = NA) lines(x, y) lines(x, M * dcauchy(x)) title("Normal Density with Cauchy Envelope") } pnc() ``` Uniform sampling from the area under the Cauchy envelope: ```{r} N <- 10000 X <- rcauchy(N) Y <- runif(N, 0, M*dcauchy(X)) pnc() points(X, Y, col = "blue") ``` Selecting the points under the normal density: ```{r} pnc() points(X, Y, col = ifelse(Y <= dnorm(X), "red", "blue")) ``` Proportion accepted: ```{r} Z <- X[Y <= dnorm(X)] length(Z) / N ``` Compare a density estimate to the normal density: ```{r} plot(density(Z)) lines(x, y, col = "red") ``` ### Poisson Distribution with Cauchy Envelope Density and envelope: ```{r} M <- 2 ppc <- function() { x <- seq(0, 20, len = 2000) plot(x, M * dcauchy(x, 5, 3), type = "l") polygon(c(x, 20, 0), c(2 * dcauchy(x, 5, 3), 0, 0), col = "light grey", border = NA) polygon(c(x,20, 0), c(dpois(floor(x), 5), 0, 0), col = "grey", border = NA) lines(x, M * dcauchy(x, 5, 3)) lines(x,dpois(floor(x), 5)) title("Poisson PMF with Cauchy Envelope") } ppc() ``` Samples: ```{r} N <- 10000 X <- rcauchy(N, 5, 3) Y <- runif(N, 0, M * dcauchy(X, 5, 3)) ppc() points(X, Y, col = "blue") accept <- Y < dpois(floor(X), 5) points(X[accept], Y[accept], col = "red") ``` Acceptance rate: ```{r} mean(accept) ``` Comparing the sample proportions to the Poisson PMF: ```{r} Z <- floor(X[accept]) tbl <- table(Z) / length(Z) tbl_x <- as.numeric(rownames(tbl)) plot(tbl_x, tbl, type = "h") points(tbl_x, dpois(tbl_x, 5), col = "red") ``` With simulation standard errors: ```{r} plot(tbl_x, tbl) points(tbl_x, dpois(tbl_x, 5), col = "red") se <- sqrt(tbl * (1 - tbl) / N) segments(tbl_x, dpois(tbl_x, 5) - 2 * se, y1 = dpois(tbl_x, 5) + 2 * se, col = "red") ``` Truncated Cauchy increases the acceptance probability somewhat: ```{r} a <- pcauchy(0, 5, 3) U <- runif(N) X <- qcauchy(a + U * (1 - a), 5, 3) Y <- runif(N, 0, M * dcauchy(X, 5, 3)) accept <- Y < dpois(floor(X), 5) mean(accept) ``` ## Ratio of Uniforms Region for a standard normal target: ```{r} x <- seq(-6, 6, len = 301) u <- sqrt(exp(-0.5 * x ^ 2)) plot(x * u, u, type="l", xlab="v", asp = 1.5) polygon(x * u, u, col = "grey") ``` Rejection sampling envelope equivalent to ratio-of-uniforms sampling by generating points uniformly in an enclosing rectangle: ```{r} x <- seq(-3, 3, len = 301) plot(x, exp(-x ^ 2 / 2), type = "l", ylab = "density") polygon(c(x, 3, -3), c(pmin(1, 2 * exp(-1) / x ^ 2), 0, 0), col="light grey", border = NA) polygon(c(x, 3, -3), c(exp(-x ^ 2 / 2), 0, 0), col = "grey", border = NA) lines(x,exp(-x ^ 2 / 2)) lines(x, pmin(1, 2 * exp(-1) / x ^ 2)) ``` Region for a Gamma(30, 1) distribution: ```{r} x <- seq(0, 70, len = 300) u <- sqrt(dgamma(x, 30)) plot(x * u, u, type = "l", xlab = "v") polygon(x * u, u, col = "grey") title(expression(paste("Gamma with ", alpha, "=", 30, " and ", mu, "=", 0))) ``` Centering at $\mu = 29$: ```{r} plot((x - 29) * u, u, type = "l", xlab = "v") polygon((x - 29) * u, u, col = "grey") title(expression(paste("Gamma with ", alpha, "=", 30, " and ", mu, "=", 29))) ``` ## Inhomogeneous Poisson Process on the Line Suppose the intensity function $h$ of an inhomogeneous Poisson process varies regularly over a 24-hour period and we would like to simulate events in a 5-day period: ```{r, fig.width = 10, fig.height = 3.5} m <- 0 M <- 1 h <- function(x) (M - m) * (sin((2 * pi * x / 24) - pi / 2) + 1) / 2 + m plot(h, 0, 24 * 5, ylim = c(0, M)) ``` Generate a homogeneous Poisson process with rate $M$ over a 5-day period: ```{r} T <- 24 * 5 ## 5 days N <- rpois(1, T * M) X <- runif(N, 0, T) plot(h, 0, 24 * 5, ylim = c(0, M)) points(X, rep(0, length(X)), col = "blue", pch = 19) ``` Filter down to an inhomogeneous process with rate function $h$: ```{r, fig.width = 10, fig.height = 3.5} keep <- runif(N, 0, M) < h(X) Y <- X[keep] plot(h, 0, 24 * 5, ylim = c(0, M)) points(X, rep(0, length(X)), col = "blue", pch = 19) points(Y, rep(0, length(Y)), col = "red", pch = 19) ``` The cumulative rate function $H$ is ```{r} H <- function(x) (M - m) * (-12 * sin(pi * x / 12) / pi + x) / 2 + m * x plot(H, 0, T) ``` Generate a rate one Poisson process on $[0, H(T)]$: ```{r} N <- rpois(1, H(T)) Z <- runif(N, 0, H(T)) ``` Invert to an inhomogeneous Poisson process on $[0, T]$ ```{r} Y <- sapply(Z, function(z) uniroot(function(x) H(x) - z, c(0, T))$root) plot(H, 0, T) points(rep(0, length(Z)), Z, col = "blue", pch = 19) segments(rep(0, length(Z)), Z, Y, Z, lty = 2) segments(Y, Z, Y, rep(0, length(Z)), lty = 2) points(Y, rep(0, length(Y)), col = "red", pch = 19) ``` ```{r, fig.width = 10, fig.height = 3.5} plot(h, 0, 24 * 5, ylim = c(0, M)) points(Y, rep(0, length(Y)), col = "red", pch = 19) ``` ## Variance Reduction ### Control Variates Suppose we want to estimate the expected value of the sample median $T$ for a sample of size 10 from a $\text{Gamma}(3,1)$ population. Crude estimate: $$ Y = \frac{1}{N}\sum T_i $$ Using the sample mean as a control variate with $b=1$: $$ \widehat{Y} = \frac{1}{N}\sum (T_i-\overline{X}_i) + E[\overline{X}_i] = \frac{1}{N}\sum (T_i-\overline{X}_i) + \alpha $$ ```{r} x <- matrix(rgamma(100000, 3), ncol = 10) md <- apply(x, 1, median) mn <- apply(x, 1, mean) mean(md) mean(md - mn) + 3 sd(md) sd(md-mn) ``` The standard deviation is cut roughly in half. The optimal $b$ seems close to 1. ### Importance Sampling for Tail Probabilities For computing $P(X>2)$ where $X$ has a standard Cauchy distribution we can use a shifted Cauchy distribution: ```{r} y <- rcauchy(10000,3) tt <- ifelse(y > 2, 1, 0) * dcauchy(y) / dcauchy(y,3) mean(tt) ``` The asymptotic standard deviation for the importance sampling estimate: ```{r} sd(tt) ``` The asymptotic standard deviation for crude Monte Carlo is approximately ```{r} sqrt(mean(tt) * (1 - mean(tt))) ``` ### Antithetic Variates For estimating the expected value of the median for samples of size 10 from the $\text{Gamma}(3,1)$ distribution: ```{r} u <- matrix(runif(50000), ncol = 10) x1 <- qgamma(u, 3) x2 <- qgamma(1 - u, 3) md1 <- apply(x1, 1, median) md2 <- apply(x2, 1, median) mean((md1 + md2) / 2) ``` Estimated asymptotic standard error: ```{r} sqrt(2) * sd((md1 + md2) / 2) ``` Control variates helps further a bit but need $b = 0.2$ or so. ```{r} mn1 <- apply(x1, 1, mean) mn2 <- apply(x2, 1, mean) sqrt(2) * sd((md1 + md2 - 0.2 * (mn1 + mn2)) / 2) ``` ### Conditioning Estimating $P(X > 2)$ for $X$ a Cauchy random variable: If $Z$ and $Y$ are independent standard normals and $W = |Y|$ then $X = Z/ W$ has a Cauchy distribution. So $$ P(X > 2) = E[P(X > 2| W)] $$ and $$ P(X > 2 | W = w) = P(Z / W > 2| W = w) = P(Z / w > 2) = P(Z > 2 w) $$ ```{r} N <- 10000 Z <- rnorm(N) W <- abs(rnorm(N)) (phat <- mean(Z/W > 2)) mean(pnorm(2 * W, lower.tail = FALSE)) ``` Asymptotic standard errors: ```{r} sd(pnorm(2 * W, lower.tail = FALSE)) sqrt(phat * (1 - phat)) ``` ### Independence Decomposition ```{r} x <- matrix(rnorm(100000), ncol = 10) mn <- apply(x, 1, mean) md <- apply(x, 1, median) ``` Estimates: ```{r} mean(md^2) 1 / 10 + mean((md - mn)^2) ``` Asymptotic standard errors: ```{r} sd(md^2) sd((md - mn)^2) ```