Biostatistical Computing, PHC 6068

Multivariate optimization

Zhiguang Huo (Caleb)

Monday Nov 27, 2017

Outline

Univariate Gradient descent

Gradient descent: choose initial \(\beta^{(0)} \in \mathbb{R}\), repeat: \[\beta^{(k)} = \beta^{(k - 1)} - t\times f'(\beta^{(k - 1)}), k = 1,2,3,\ldots\]
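
A minimal R sketch of this update on a toy objective; the quadratic f_toy below, the starting value, and the step size are illustrative assumptions, not part of the lecture example.

f_toy <- function(b) (b - 3)^2        ## toy objective with minimizer at 3
g_toy <- function(b) 2 * (b - 3)      ## its derivative
b <- 0; t <- 0.1                      ## initial value and fixed step size
for(k in 1:100){
  b <- b - t * g_toy(b)               ## beta^(k) = beta^(k-1) - t * f'(beta^(k-1))
}
b                                     ## close to the minimizer 3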

Interpretation

\[f(y) \approx f(\beta^{(k - 1)}) + f'(\beta^{(k - 1)}) \times (y - \beta^{(k - 1)}) + \frac{1}{2t} (y - \beta^{(k - 1)})^2\]

Minimizing this quadratic approximation over \(y\) (setting its derivative with respect to \(y\) to zero) gives \(y = \beta^{(k - 1)} - t \times f'(\beta^{(k - 1)})\), which is exactly the gradient descent update.

Visual interpretation

Newton’s method
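
Newton's method replaces the fixed \(\frac{1}{t}\) in the quadratic approximation with the actual curvature \(f''\): choose initial \(\beta^{(0)} \in \mathbb{R}\), repeat: \[\beta^{(k)} = \beta^{(k - 1)} - \frac{f'(\beta^{(k - 1)})}{f''(\beta^{(k - 1)})}, k = 1,2,3,\ldots\]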

Newton’s method interpretation

New problem?

Gradient in multivariate case

For a differentiable function \(f: \mathbb{R}^p \rightarrow \mathbb{R}\), the gradient is \[\nabla_\beta f(\beta) = \frac{\partial f(\beta)}{\partial \beta} = (\frac{\partial f(\beta)}{\partial \beta_1}, \frac{\partial f(\beta)}{\partial \beta_2}, \ldots, \frac{\partial f(\beta)}{\partial \beta_p})^\top \in \mathbb{R}^p\]

Visualization of the example function

f <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}


x <- y <- seq(-20, 20, len=1000)
z <- outer(x, y, f)
# Contour plots
contour(x,y,z, xlim = c(-10,10), ylim = c(-20,20), nlevels = 10, levels = seq(1,100,length.out = 10), main="Contour Plot")

Hessian in multivariate case

Recall that for a differentiable \(f: \mathbb{R}^p \rightarrow \mathbb{R}\), \[\nabla_\beta f(\beta) = \frac{\partial f(\beta)}{\partial \beta} = (\frac{\partial f(\beta)}{\partial \beta_1}, \frac{\partial f(\beta)}{\partial \beta_2}, \ldots, \frac{\partial f(\beta)}{\partial \beta_p})^\top \in \mathbb{R}^p\]

\[\Delta_\beta f(\beta) = \nabla^2_\beta f(\beta) = \nabla_\beta\frac{\partial f(\beta)}{\partial \beta} = (\nabla_\beta \frac{\partial f(\beta)}{\partial \beta_1}, \nabla_\beta \frac{\partial f(\beta)}{\partial \beta_2}, \ldots, \nabla_\beta \frac{\partial f(\beta)}{\partial \beta_p})^\top \]

\[\Delta_\beta f(\beta) = \begin{pmatrix} \frac{\partial^2 f(\beta)}{\partial \beta_1^2} & \frac{\partial^2 f(\beta)}{\partial \beta_1 \partial \beta_2} & \ldots & \frac{\partial^2 f(\beta)}{\partial \beta_1 \partial\beta_p}\\ \frac{\partial^2 f(\beta)}{\partial \beta_2 \partial \beta_1 } & \frac{\partial^2 f(\beta)}{\partial \beta_2^2} & \ldots & \frac{\partial^2 f(\beta)}{\partial \beta_2 \partial \beta_p} \\ \ldots &\ldots &\ldots &\ldots\\ \frac{\partial^2 f(\beta)}{\partial \beta_p \partial \beta_1} & \frac{\partial^2 f(\beta)}{\partial \beta_p \partial \beta_2} & \ldots & \frac{\partial^2 f(\beta)}{\partial\beta_p^2} \end{pmatrix} \]

Hessian in multivariate case example

\[f(x,y) = 4x^2 + y^2 + 2xy - x - y\]

\[\nabla f = (\frac{\partial f}{\partial x}, \frac{\partial f}{\partial y})^\top = \begin{pmatrix} 8x + 2y - 1 \\ 2x + 2y - 1 \end{pmatrix} \in \mathbb{R}^2\]

\[\Delta f = \begin{pmatrix} 8 & 2 \\ 2 & 2 \end{pmatrix} \in \mathbb{R}^{2\times 2}\]
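
As a sanity check, the analytic gradient and Hessian can be compared against numerical derivatives. A minimal sketch, assuming the numDeriv package is installed (it is not otherwise used in this lecture):

library(numDeriv)
f2 <- function(b) 4*b[1]^2 + b[2]^2 + 2*b[1]*b[2] - b[1] - b[2]
grad(f2, c(1, 2))     ## close to (8*1 + 2*2 - 1, 2*2 + 2*1 - 1) = (11, 5)
hessian(f2, c(1, 2))  ## close to matrix(c(8, 2, 2, 2), 2, 2)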

Gradient descent for multivariate case

Gradient descent: choose initial \(\beta^{(0)} \in \mathbb{R}^p\), repeat: \[\beta^{(k)} = \beta^{(k - 1)} - t\times \nabla_\beta f(\beta^{(k - 1)}), k = 1,2,3,\ldots\]

Gradient descent on our motivating example

f <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

g <- function(x,y){
  c(8*x + 2*y - 1, 2*y +2*x - 1)
}

x <- y <- seq(-20, 20, len=1000)
z <- outer(x, y, f)
# Contour plots
contour(x,y,z, xlim = c(-10,10), ylim = c(-20,20), nlevels = 10, levels = seq(1,100,length.out = 10), main="Contour Plot")


x <- x0 <- 8; x_old <- 0; 
y <- y0 <- -10; y_old <- 0; 
curXY <- c(x,y)
preXY <- c(x_old, y_old)
t <- 0.1; k <- 0; error <- 1e-6
trace <- list(curXY)
points(x0,y0, col=1, pch=1)

l2n <- function(avec){
  sqrt(sum(avec^2))
}
diffChange <- function(avec, bvec){
  deltaVec <- avec - bvec
  sumVec <- avec + bvec
  l2n(deltaVec)/l2n(sumVec)
}

while(diffChange(curXY, preXY) > error){
  k <- k + 1
  preXY <- curXY
  curXY <- preXY - t*g(preXY[1], preXY[2])
  trace <- c(trace, list(curXY)) ## collecting results
  points(curXY[1], curXY[2], col=k, pch=19)
  segments(curXY[1], curXY[2], preXY[1], preXY[2])
}

print(k)
## [1] 97

Divergence of Gradient descent

f <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

g <- function(x,y){
  c(8*x + 2*y - 1, 2*y +2*x - 1)
}

B <- 1000
plot(NULL, xlim=c(-B,B), ylim=c(-B,B), xlab="x", ylab="y")  ## empty canvas: the diverging iterates quickly leave the contour range
#contour(x,y,z, xlim = c(-100,100), ylim = c(-200,200), nlevels = 10, levels = seq(1,10000,length.out = 10), main="Contour Plot")


x <- x0 <- 0.8; x_old <- 0; 
y <- y0 <- -0.10; y_old <- 0; 
curXY <- c(x,y)
preXY <- c(x_old, y_old)
t <- 0.8; k <- 0; error <- 1e-6
trace <- list(curXY)
points(x0,y0, col=1, pch=1)

l2n <- function(avec){
  sqrt(sum(avec^2))
}
diffChange <- function(avec, bvec){
  deltaVec <- avec - bvec
  sumVec <- avec + bvec
  l2n(deltaVec)/l2n(sumVec)
}

## repeat the following chunk of code to see the divergence of gradient descent
k <- k + 1
preXY <- curXY
curXY <- preXY - t*g(preXY[1], preXY[2])
points(curXY[1], curXY[2], col=k, pch=19)
segments(curXY[1], curXY[2], preXY[1], preXY[2])
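
For a quadratic objective like this one, gradient descent with fixed step size \(t\) converges only when \(t < 2/\lambda_{\max}\), where \(\lambda_{\max}\) is the largest eigenvalue of the Hessian. A quick check for this example (a small sketch added for intuition):

H <- matrix(c(8, 2, 2, 2), 2, 2)   ## Hessian of f
2 / max(eigen(H)$values)           ## about 0.23: t = 0.1 converges, t = 0.8 diverges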

How to select step size \(t\): backtracking line search
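
Backtracking (Armijo) line search, as implemented below: fix \(0 < \alpha \le 1/2\) and a shrink factor \(0 < \beta_{\text{shrink}} < 1\); at each iteration start from \(t = t_0\) and shrink \(t \leftarrow \beta_{\text{shrink}} \times t\) while \[f(\beta - t \nabla f(\beta)) > f(\beta) - \alpha \times t \times \|\nabla f(\beta)\|_2^2\] The implementation below uses \(t_0 = 1\), \(\alpha = 1/3\), and shrink factor \(1/2\).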

Implementation of backtracking for gradient descent

f0 <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

f <- function(avec){
  x <- avec[1]
  y <- avec[2]
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

g <- function(avec){
  x <- avec[1]
  y <- avec[2]  
  c(8*x + 2*y - 1, 2*y +2*x - 1)
}

x <- y <- seq(-20, 20, len=1000)
z <- outer(x, y, f0)
# Contour plots
contour(x,y,z, xlim = c(-10,10), ylim = c(-20,20), nlevels = 10, levels = seq(1,100,length.out = 10), main="Contour Plot")


x <- x0 <- 8; x_old <- 0; 
y <- y0 <- -10; y_old <- 0; 
curXY <- c(x,y)
preXY <- c(x_old, y_old)
t0 <- 1; k <- 0; error <- 1e-6
alpha = 1/3
beta = 1/2
trace <- list(curXY)
points(x0,y0, col=1, pch=1)

l2n <- function(avec){
  sqrt(sum(avec^2))
}
diffChange <- function(avec, bvec){
  deltaVec <- avec - bvec
  sumVec <- avec + bvec
  l2n(deltaVec)/l2n(sumVec)
}

while(diffChange(curXY, preXY) > error){
  k <- k + 1
  preXY <- curXY
  
  ## backtracking
  t <- t0
  while(f(preXY - t*g(preXY)) > f(preXY) - alpha * t * l2n(g(preXY))^2){
    t <- t * beta
  }
  
  curXY <- preXY - t*g(preXY)
  trace <- c(trace, list(curXY)) ## collecting results
  points(curXY[1], curXY[2], col=k, pch=19)
  segments(curXY[1], curXY[2], preXY[1], preXY[2])
  
}

print(k)
## [1] 25

Exercise, solve for logistic regression

library(ElemStatLearn)
glm_binomial_logit <- glm(svi ~ lcavol, data = prostate,  family = binomial())
summary(glm_binomial_logit)
## 
## Call:
## glm(formula = svi ~ lcavol, family = binomial(), data = prostate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79924  -0.48354  -0.21025  -0.04274   2.32135  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.0296     1.0429  -4.823 1.42e-06 ***
## lcavol        1.9798     0.4543   4.358 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.35  on 96  degrees of freedom
## Residual deviance:  64.14  on 95  degrees of freedom
## AIC: 68.14
## 
## Number of Fisher Scoring iterations: 6
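
One possible starting point for this exercise (a sketch only, not the full solution; the object names below are illustrative): write the negative log-likelihood of logistic regression and its gradient, then plug them into the gradient descent loop above and compare the result with glm().

library(ElemStatLearn)
X <- cbind(1, prostate$lcavol)    ## design matrix: intercept + lcavol
yvec <- prostate$svi              ## binary outcome

negloglik <- function(beta){
  eta <- X %*% beta
  -sum(yvec * eta - log(1 + exp(eta)))
}
negloglik_grad <- function(beta){
  p <- 1 / (1 + exp(-X %*% beta))
  -t(X) %*% (yvec - p)            ## gradient: -X^T (y - p)
}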

Newton’s method for multivariate case
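
Choose initial \(\beta^{(0)} \in \mathbb{R}^p\), repeat: \[\beta^{(k)} = \beta^{(k - 1)} - [\Delta_\beta f(\beta^{(k - 1)})]^{-1} \nabla_\beta f(\beta^{(k - 1)}), k = 1,2,3,\ldots\] This is what solve(h(...)) %*% g(...) computes in the implementation below.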

Implement multivariate Newton’s method

f0 <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

f <- function(avec){
  x <- avec[1]
  y <- avec[2]
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

g <- function(avec){
  x <- avec[1]
  y <- avec[2]  
  c(8*x + 2*y - 1, 2*y +2*x - 1)
}

h <- function(avec){
  x <- avec[1]
  y <- avec[2]  
  res <- matrix(c(8,2,2,2),2,2)  ## Hessian function
  return(res)  
} 

x <- y <- seq(-20, 20, len=1000)
z <- outer(x, y, f0)
# Contour plots
contour(x,y,z, xlim = c(-10,10), ylim = c(-20,20), nlevels = 10, levels = seq(1,100,length.out = 10), main="Contour Plot")


x <- x0 <- 8; x_old <- 0; 
y <- y0 <- -10; y_old <- 0; 
curXY <- c(x,y)
preXY <- c(x_old, y_old)
t0 <- 1; k <- 0; error <- 1e-6
trace <- list(curXY)
points(x0,y0, col=1, pch=1)

l2n <- function(avec){
  sqrt(sum(avec^2))
}
diffChange <- function(avec, bvec){
  deltaVec <- avec - bvec
  sumVec <- avec + bvec
  l2n(deltaVec)/l2n(sumVec)
}

while(diffChange(curXY, preXY) > error){
  k <- k + 1
  preXY <- curXY
  
  curXY <- preXY - solve(h(preXY)) %*% g(preXY)  ## Newton step: subtract Hessian^{-1} times gradient
  trace <- c(trace, list(curXY)) ## collecting results
  points(curXY[1], curXY[2], col=k, pch=19)
  segments(curXY[1], curXY[2], preXY[1], preXY[2])
}

k ## why is k only 2?
## [1] 2

Exercise

\[f(x,y) = \exp(xy) + y^2 + x^4\]

What are the x and y such that \(f(x,y)\) is minimized?
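
A numerical cross-check for this exercise using base R's optim() (a sketch; the starting value c(1, 1) is an arbitrary assumption):

fxy <- function(v) exp(v[1] * v[2]) + v[2]^2 + v[1]^4
optim(c(1, 1), fxy)$par   ## compare with your analytic / Newton's method answer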

Divergence in Newton’s method

Note: the same color denotes the region of starting values that converge to the same root. Reference: https://www.chiark.greenend.org.uk/~sgtatham/newton/

Backtracking line search for Newton’s method
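
One standard version (following B & V): with Newton direction \(\Delta\beta = -[\Delta_\beta f(\beta)]^{-1} \nabla_\beta f(\beta)\), start from \(t = 1\) and shrink \(t \leftarrow \beta_{\text{shrink}} \times t\) while \[f(\beta + t \Delta\beta) > f(\beta) + \alpha \times t \times \nabla_\beta f(\beta)^\top \Delta\beta\] then update \(\beta \leftarrow \beta + t \Delta\beta\).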

Exercise, solve for logistic regression

library(ElemStatLearn)
glm_binomial_logit <- glm(svi ~ lcavol, data = prostate,  family = binomial())
summary(glm_binomial_logit)
## 
## Call:
## glm(formula = svi ~ lcavol, family = binomial(), data = prostate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79924  -0.48354  -0.21025  -0.04274   2.32135  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.0296     1.0429  -4.823 1.42e-06 ***
## lcavol        1.9798     0.4543   4.358 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.35  on 96  degrees of freedom
## Residual deviance:  64.14  on 95  degrees of freedom
## AIC: 68.14
## 
## Number of Fisher Scoring iterations: 6

Coordinate descent

Example on linear regression

\[ \min_{\beta \in \mathbb{R}^p} f(\beta) = \min_{\beta \in \mathbb{R}^p} \frac{1}{2} \|y - X\beta\|_2^2\] where \(y \in \mathbb{R}^n\) and \(X \in \mathbb{R}^{n \times p}\) with columns \(X_1\) (intercept), \(X_2\), \(\ldots\), \(X_p\).

Minimizing \(f\) over \(\beta_j\) while holding all other coordinates fixed (set \(\partial f / \partial \beta_j = X_j^\top(X\beta - y) = 0\)) gives the coordinate-wise update \[\beta_j = \frac{X_j^\top (y - X_{-j}\beta_{-j})}{X_j^\top X_j}\] where \(X_{-j}\) and \(\beta_{-j}\) are \(X\) and \(\beta\) with the \(j\)th column and entry removed.

Implement coordinate descent using prostate cancer data

library(ElemStatLearn)

y <- prostate$lcavol
x0 <- prostate[,-match(c("lcavol", "train"), colnames(prostate))]
x <- cbind(1, as.matrix(x0))

beta <- rep(0,ncol(x))
beta_old <- rnorm(ncol(x))

error <- 1e-6
k <- 0
while(diffChange(beta, beta_old) > error){
  k <- k + 1
  beta_old <- beta
  for(j in 1:length(beta)){
    xj <- x[,j]
    xj_else <- x[,-j]
    beta_j_else <- as.matrix(beta[-j])
    beta[j] <- t(xj) %*% (y - xj_else %*% beta_j_else) / sum(xj^2)
  }
}

print(beta)
## [1] -2.419610178 -0.025954949  0.022114840 -0.093134994 -0.152983715
## [6]  0.367195579  0.197095411 -0.007114941  0.565822653
## compare with lm
lm(lcavol ~ . - train, data=prostate)
## 
## Call:
## lm(formula = lcavol ~ . - train, data = prostate)
## 
## Coefficients:
## (Intercept)      lweight          age         lbph          svi  
##   -2.420613    -0.025925     0.022113    -0.093138    -0.152966  
##         lcp      gleason        pgg45         lpsa  
##    0.367180     0.197249    -0.007117     0.565826

One dimensional lasso problem

\[\arg \min_\beta f(\beta) = \arg \min_\beta \frac{1}{2}(y - \beta)^2 + \lambda |\beta|\]

Setting the derivative (a subgradient, since \(|\beta|\) is not differentiable at 0) to zero:

\[0 = \frac{\partial f(\beta)}{\partial \beta} = \beta - y + \lambda \frac{\partial |\beta|}{\partial \beta}\]

\[\beta = S_\lambda(y) = \begin{cases} y - \lambda \text{ if } y > \lambda\\ y + \lambda \text{ if } y < - \lambda\\ 0 \text{ if } -\lambda \le y \le \lambda\\ \end{cases}\]
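
A short R implementation of the soft thresholding operator (a small sketch; the function name is illustrative):

soft_threshold <- function(y, lambda){
  sign(y) * pmax(abs(y) - lambda, 0)   ## S_lambda(y)
}
soft_threshold(c(-3, -0.5, 0.2, 2), lambda = 1)   ## -2  0  0  1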

soft thresholding operator and hard thresholding operator

Exercise

Constrained optimization - Lagrange multiplier

Constrained optimization - Karush-Kuhn-Tucker conditions

Given the general problem

\[\min_x f(x) \text{ subject to } h_i(x) \leq 0, i = 1, \ldots, m; \; l_j(x) = 0, j = 1, \ldots, r\]

the Lagrangian is

\[L(x) = f(x) + \sum_{i=1}^m u_i h_i(x) + \sum_{j=1}^r v_j l_j(x) \]

The Karush-Kuhn-Tucker (KKT) conditions are:

Stationarity: \(0 \in \partial f(x) + \sum_{i=1}^m u_i \partial h_i(x) + \sum_{j=1}^r v_j \partial l_j(x)\)

Complementary slackness: \(u_i \cdot h_i(x) = 0\) for all \(i\)

Primal feasibility: \(h_i(x) \leq 0\) and \(l_j(x) = 0\) for all \(i, j\)

Dual feasibility: \(u_i \geq 0\) for all \(i\)

Application: water-filling (B & V page 245)

\[\min_{x \in \mathbb{R}^n} -\sum_{i=1}^n \log(\alpha_i + x_i) \text{ subject to } x \geq 0, \sum_{i=1}^n x_i = 1\]

Optimize this function in R

n <- 10
alpha <- runif(n)
x0 <- runif(n)
x1 <- x0/sum(x0)  ## feasible starting value: strictly positive, sums to 1
f <- function(x){
  -sum(log(alpha + x))
}
grad <- function(x){
  -1/(alpha + x)
}

## constrOptim() takes linear constraints of the form ui %*% x >= ci;
## the equality constraint sum(x) = 1 is approximated by two inequalities
ui1 <- diag(n)    ## x_i >= 0
ui2 <- rep(1,n)   ## sum(x) >= 1 - 1e-12
ui3 <- rep(-1,n)  ## sum(x) <= 1 + 1e-12
ui <- rbind(ui1, ui2, ui3)

ci1 <- rep(0,n)
ci2 <- 1 - 1e-12
ci3 <- -1 - 1e-12
ci <- c(ci1,ci2,ci3)


constrOptim(x1, f, grad, ui, ci)
## $par
##  [1] 0.03854230 0.05224637 0.19736554 0.08935701 0.07468487 0.03603157
##  [7] 0.22017346 0.02620023 0.17276648 0.09263218
## 
## $value
## [1] 3.937191
## 
## $counts
## [1] 0
## 
## $convergence
## [1] 0
## 
## $message
## NULL
## 
## $outer.iterations
## [1] 1
## 
## $barrier.value
## [1] 0.0003081647

Exercise

Reference

knitr::purl("multiOptimization.rmd", output = "multiOptimization.R ", documentation = 2)
## 
## 
## processing file: multiOptimization.rmd
## 
## output file: multiOptimization.R
## [1] "multiOptimization.R "