##
## Utilities
##

## Record function call arguments 
trackxvals <- function(f) {
    function(x) {
        xvals <<- rbind(xvals, x)
        f(x)
    }
}

## Show the path of function call arguments
showpath <- function(xvals, sleep = 0.2) {
    for (i in seq_len(nrow(xvals)))
        if (i == 1)
            points(xvals[i, 1], xvals[i,2], col = "red", pch = 19)
        else {
            points(xvals[i - 1, 1], xvals[i - 1, 2], col = "black", pch = 19)
            segments(xvals[i - 1, 1], xvals[i - 1, 2], xvals[i, 1], xvals[i,2])
            points(xvals[i, 1], xvals[i,2], col = "red", pch = 19)
            Sys.sleep(sleep)
        }
    points(xvals[i, 1], xvals[i, 2], col = "red", pch = 19)
}


##
## Quadratic Function
##

B <- matrix(c(1, .5, .5, 1), 2)
b <- c(1, 1)

fn <- function(x) {
    y <- x - b
    sum(0.5 * y * (B %*% y))
}

gr <- function(x) B %*% (x - b)

x1 <- seq(0, 2, len = 101)
x2 <- x1
g <- expand.grid(x1 = x1, x2 = x2)
z <- apply(cbind(g$x1, g$x2), 1, fn)

xinit <- c(2, 1.8)

## Nelder-Mead
xvals <- NULL
optim(xinit, trackxvals(fn))
contour(x1, x2, matrix(z, length(x1), length(x2)))
showpath(xvals)

## BFGS
xvals <- NULL
optim(xinit, trackxvals(fn), gr, method = "BFGS")
contour(x1, x2, matrix(z, length(x1), length(x2)))
showpath(xvals)

## Constrained
c1 <- 1.1
c2 <- 1.2
xvals <- NULL
constrOptim(xinit, trackxvals(fn), gr, ui = diag(2), ci = c(c1, c2))
contour(x1, x2, matrix(z, length(x1), length(x2)))
abline(v = c1, lty = 2)
abline(h = c2, lty = 2)
showpath(xvals)


##
## Rosenbrock Banana function
##

a <- 5
fr <- function(x) {
         x1 <- x[1]
         x2 <- x[2]
         a * (x2 - x1 ^2)^2 + (1 - x1)^2
     }

grr <- function(x) { ## Gradient of 'fr'
         x1 <- x[1]
         x2 <- x[2]
         c(-4 * a * x1 * (x2 - x1 * x1) - 2 * (1 - x1),
            2 * a *      (x2 - x1 * x1))
     }

x1 <- seq(0, 2, len = 101)
x2 <- x1
g <- expand.grid(x1 = x1, x2 = x2)
z <- apply(cbind(g$x1, g$x2), 1, fr)

xinit <- c(1.5, 1,5)

## Nelder-Mead
xvals <- NULL
optim(c(1.5, 1.5), trackxvals(fr))
contour(x1, x2, matrix(z, length(x1), length(x2)), levels = 0.1 * (1:10))
showpath(xvals)

## BFGS
xvals <- NULL
optim(c(1.5, 1.5), trackxvals(fr), grr, method = "BFGS")
contour(x1, x2, matrix(z, length(x1), length(x2)), levels = 0.1 * (1:10))
showpath(xvals)

## Constrained
c1 <- 1.0
c2 <- 1.2
xvals <- NULL
constrOptim(c(1.5, 1.5), trackxvals(fr), grr, ui = diag(2), ci = c(c1, c2))
contour(x1, x2, matrix(z, length(x1), length(x2)), levels = 0.1 * (1:10),
        xlim = c(1, 2), ylim = c(1, 2))
abline(v = c1, lty = 2)
abline(h = c2, lty = 2)
showpath(xvals)


##
## Himmelblau function
##

fh <- function(x, y) (x ^ 2 + y - 11) ^ 2 + (x + y ^ 2 - 7) ^ 2

k <- 201
b <- 4.5
x1 <- seq(-b, b, length.out = k)
x2 <- seq(-b, b, length.out = k)
z <- outer(x1, x2, fh)
r <- range(z)
levs <- r[1] + c(0.001, 0.01, 0.05,  0,1, 0.2, 0.3, 0.4, 0.5, 0.7) * diff(r)
set.seed(12345)
for (i in 1:3) {
    xvals <- NULL
    v <- optim(c(0, 0), trackxvals(function(par) fh(par[1], par[2]) / 10),
               method = "SANN")
}
contour(x1, x2, z, levels = levs)
showpath(xvals, sleep = 0.0001)
points(xvals[nrow(xvals), 1], xvals[nrow(xvals), 2], col = "red", pch = 19)
points(v$par[1], v$par[2], col = "green", pch = 19)


##
## Gradient descent
## Simple linear regression
##

n <- 100
x <- rnorm(n, 1, 0.2)
y <- rnorm(n, 1 + x, 0.05)

X <- cbind(1, x)
XtX <- crossprod(X)
Xty <- t(X) %*% y

x1 <- seq(0, 2, length.out = 101)
x2 <- seq(0, 2, length.out = 101)
grid <- expand.grid(x1 = x1, x2 = x2)

f <- function(b) (0.5 * t(b) %*% XtX %*% b - t(Xty) %*% b) / n
z <- matrix(apply(cbind(grid$x1, grid$x2), 1, f), length(x1), length(x2))
r <- range(z)
levs <- r[1] + c(0.0001, 0.001, 0.01, 0.05,  0,1, 0.2, 0.3, 0.4, 0.5) * diff(r)

g <- function(b) (XtX %*% b - Xty) / n
a <- 0.9

## favorable start
b <- c(0, 0)
xvals <- NULL
for (i in 1:100) {
    b <- b - a * g(b)
    xvals <- rbind(xvals, as.numeric(b))
}

contour(x1, x2, z, levels = levs)
showpath(xvals)

## unfavorable start
b <- c(1, 0)
xvals <- NULL
for (i in 1:100) {
    b <- b - a * g(b)
    xvals <- rbind(xvals, as.numeric(b))
}

contour(x1, x2, z, levels = levs)
showpath(xvals)

## optimal a
a <- 2 / sum(eigen(XtX)$values)
b <- c(1, 0)
xvals <- NULL
for (i in 1:10000) {
    b <- b - a * g(b)
    xvals <- rbind(xvals, as.numeric(b))
}

contour(x1, x2, z, levels = levs)
showpath(xvals, 0.0001)


## momentum
b <- c(1, 0)
xvals <- NULL
d <- g(b)
s <- 0.5
gamma <- 0.6
for (i in 1:100) {
    d <- g(b) + gamma * d
    b <- b - s * d
    xvals <- rbind(xvals, as.numeric(b))
}

contour(x1, x2, z, levels = levs)
showpath(xvals)

## optimum s, gamma
b <- c(1, 0)
xvals <- NULL
d <- g(b)
e <- eigen(XtX)$values
s <- (2 / sum(sqrt(e)))^2
gamma <- (diff(sqrt(e)) / sum(sqrt(e)))^2
for (i in 1:10000) {
    d <- g(b) + gamma * d
    b <- b - s * d
    xvals <- rbind(xvals, as.numeric(b))
}

contour(x1, x2, z, levels = levs)
showpath(xvals, 0.0001)

## Nesterov acceleration
b <- c(1, 0)
xvals <- NULL
d <- 0
s <- 0.5
gamma <- 0.9
for (i in 1:100) {
    d <- gamma * d - s * g(b + gamma * d)
    b <- b + d
    xvals <- rbind(xvals, as.numeric(b))
}

contour(x1, x2, z, levels = levs)
showpath(xvals)

## stochastic version
gs <- function(b) {
    k <- sample(n, 1)
    xk <- X[k, ]
    xk %*% (xk %*% b) - xk * y[k]
}

b <- c(0, 0)
xvals <- NULL
for (i in 1:1000) {
    b <- b - a * gs(b) / (1 + i / 100)
    xvals <- rbind(xvals, as.numeric(b))
}

contour(x1, x2, z, levels = levs)
showpath(xvals, 0.001)