## ## Utilities ## ## Record function call arguments trackxvals <- function(f) { function(x) { xvals <<- rbind(xvals, x) f(x) } } ## Show the path of function call arguments showpath <- function(xvals, sleep = 0.2) { for (i in seq_len(nrow(xvals))) if (i == 1) points(xvals[i, 1], xvals[i,2], col = "red", pch = 19) else { points(xvals[i - 1, 1], xvals[i - 1, 2], col = "black", pch = 19) segments(xvals[i - 1, 1], xvals[i - 1, 2], xvals[i, 1], xvals[i,2]) points(xvals[i, 1], xvals[i,2], col = "red", pch = 19) Sys.sleep(sleep) } points(xvals[i, 1], xvals[i, 2], col = "red", pch = 19) } ## ## Quadratic Function ## B <- matrix(c(1, .5, .5, 1), 2) b <- c(1, 1) fn <- function(x) { y <- x - b sum(0.5 * y * (B %*% y)) } gr <- function(x) B %*% (x - b) x1 <- seq(0, 2, len = 101) x2 <- x1 g <- expand.grid(x1 = x1, x2 = x2) z <- apply(cbind(g$x1, g$x2), 1, fn) xinit <- c(2, 1.8) ## Nelder-Mead xvals <- NULL optim(xinit, trackxvals(fn)) contour(x1, x2, matrix(z, length(x1), length(x2))) showpath(xvals) ## BFGS xvals <- NULL optim(xinit, trackxvals(fn), gr, method = "BFGS") contour(x1, x2, matrix(z, length(x1), length(x2))) showpath(xvals) ## Constrained c1 <- 1.1 c2 <- 1.2 xvals <- NULL constrOptim(xinit, trackxvals(fn), gr, ui = diag(2), ci = c(c1, c2)) contour(x1, x2, matrix(z, length(x1), length(x2))) abline(v = c1, lty = 2) abline(h = c2, lty = 2) showpath(xvals) ## ## Rosenbrock Banana function ## a <- 5 fr <- function(x) { x1 <- x[1] x2 <- x[2] a * (x2 - x1 ^2)^2 + (1 - x1)^2 } grr <- function(x) { ## Gradient of 'fr' x1 <- x[1] x2 <- x[2] c(-4 * a * x1 * (x2 - x1 * x1) - 2 * (1 - x1), 2 * a * (x2 - x1 * x1)) } x1 <- seq(0, 2, len = 101) x2 <- x1 g <- expand.grid(x1 = x1, x2 = x2) z <- apply(cbind(g$x1, g$x2), 1, fr) xinit <- c(1.5, 1,5) ## Nelder-Mead xvals <- NULL optim(c(1.5, 1.5), trackxvals(fr)) contour(x1, x2, matrix(z, length(x1), length(x2)), levels = 0.1 * (1:10)) showpath(xvals) ## BFGS xvals <- NULL optim(c(1.5, 1.5), trackxvals(fr), grr, method = "BFGS") contour(x1, x2, matrix(z, length(x1), length(x2)), levels = 0.1 * (1:10)) showpath(xvals) ## Constrained c1 <- 1.0 c2 <- 1.2 xvals <- NULL constrOptim(c(1.5, 1.5), trackxvals(fr), grr, ui = diag(2), ci = c(c1, c2)) contour(x1, x2, matrix(z, length(x1), length(x2)), levels = 0.1 * (1:10), xlim = c(1, 2), ylim = c(1, 2)) abline(v = c1, lty = 2) abline(h = c2, lty = 2) showpath(xvals) ## ## Himmelblau function ## fh <- function(x, y) (x ^ 2 + y - 11) ^ 2 + (x + y ^ 2 - 7) ^ 2 k <- 201 b <- 4.5 x1 <- seq(-b, b, length.out = k) x2 <- seq(-b, b, length.out = k) z <- outer(x1, x2, fh) r <- range(z) levs <- r[1] + c(0.001, 0.01, 0.05, 0,1, 0.2, 0.3, 0.4, 0.5, 0.7) * diff(r) set.seed(12345) for (i in 1:3) { xvals <- NULL v <- optim(c(0, 0), trackxvals(function(par) fh(par[1], par[2]) / 10), method = "SANN") } contour(x1, x2, z, levels = levs) showpath(xvals, sleep = 0.0001) points(xvals[nrow(xvals), 1], xvals[nrow(xvals), 2], col = "red", pch = 19) points(v$par[1], v$par[2], col = "green", pch = 19) ## ## Gradient descent ## Simple linear regression ## n <- 100 x <- rnorm(n, 1, 0.2) y <- rnorm(n, 1 + x, 0.05) X <- cbind(1, x) XtX <- crossprod(X) Xty <- t(X) %*% y x1 <- seq(0, 2, length.out = 101) x2 <- seq(0, 2, length.out = 101) grid <- expand.grid(x1 = x1, x2 = x2) f <- function(b) (0.5 * t(b) %*% XtX %*% b - t(Xty) %*% b) / n z <- matrix(apply(cbind(grid$x1, grid$x2), 1, f), length(x1), length(x2)) r <- range(z) levs <- r[1] + c(0.0001, 0.001, 0.01, 0.05, 0,1, 0.2, 0.3, 0.4, 0.5) * diff(r) g <- function(b) (XtX %*% b - Xty) / n a <- 0.9 ## favorable start b <- c(0, 0) xvals <- NULL for (i in 1:100) { b <- b - a * g(b) xvals <- rbind(xvals, as.numeric(b)) } contour(x1, x2, z, levels = levs) showpath(xvals) ## unfavorable start b <- c(1, 0) xvals <- NULL for (i in 1:100) { b <- b - a * g(b) xvals <- rbind(xvals, as.numeric(b)) } contour(x1, x2, z, levels = levs) showpath(xvals) ## optimal a a <- 2 / sum(eigen(XtX)$values) b <- c(1, 0) xvals <- NULL for (i in 1:10000) { b <- b - a * g(b) xvals <- rbind(xvals, as.numeric(b)) } contour(x1, x2, z, levels = levs) showpath(xvals, 0.0001) ## momentum b <- c(1, 0) xvals <- NULL d <- g(b) s <- 0.5 gamma <- 0.6 for (i in 1:100) { d <- g(b) + gamma * d b <- b - s * d xvals <- rbind(xvals, as.numeric(b)) } contour(x1, x2, z, levels = levs) showpath(xvals) ## optimum s, gamma b <- c(1, 0) xvals <- NULL d <- g(b) e <- eigen(XtX)$values s <- (2 / sum(sqrt(e)))^2 gamma <- (diff(sqrt(e)) / sum(sqrt(e)))^2 for (i in 1:10000) { d <- g(b) + gamma * d b <- b - s * d xvals <- rbind(xvals, as.numeric(b)) } contour(x1, x2, z, levels = levs) showpath(xvals, 0.0001) ## Nesterov acceleration b <- c(1, 0) xvals <- NULL d <- 0 s <- 0.5 gamma <- 0.9 for (i in 1:100) { d <- gamma * d - s * g(b + gamma * d) b <- b + d xvals <- rbind(xvals, as.numeric(b)) } contour(x1, x2, z, levels = levs) showpath(xvals) ## stochastic version gs <- function(b) { k <- sample(n, 1) xk <- X[k, ] xk %*% (xk %*% b) - xk * y[k] } b <- c(0, 0) xvals <- NULL for (i in 1:1000) { b <- b - a * gs(b) / (1 + i / 100) xvals <- rbind(xvals, as.numeric(b)) } contour(x1, x2, z, levels = levs) showpath(xvals, 0.001)