Biostatistical Computing, PHC 6068

Univariate optimization

Zhiguang Huo (Caleb)

Monday October 23, 2017

Outline

Optimization

Convex set

Convex function

Convex function properties

Optimization functions in R

This week we focus on one-dimensional (univariate) optimization; multivariate optimization will be covered at the end of the semester.

Univariate optimization example

Suppose our objective function is \[f(x) = e^x + x^4\] What is the minimum value of \(f(x)\), and what is the corresponding \(x\)?
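
A remark added for motivation: any minimizer must satisfy the first-order condition \[f'(x) = e^x + 4 x^3 = 0,\] which has no simple closed-form solution, so we turn to numerical methods.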

f <- function(x) exp(x) + x^4
curve(f, from = -1, to = 1)

Use the R optimize() function to perform the optimization

f <- function(x) exp(x) + x^4
xmin <- optimize(f, interval = c(-10, 10))
xmin
## $minimum
## [1] -0.5282468
## 
## $objective
## [1] 0.6675038
curve(f, from = -1, to = 1)
points(xmin$minimum, xmin$objective, col="red", pch=19)

Do the optimization yourself

Gradient descent

Gradient descent: choose an initial \(x^{(0)} \in \mathbb{R}\) and repeat: \[x^{(k)} = x^{(k - 1)} - t\times f'(x^{(k - 1)}), \quad k = 1,2,3,\ldots\]

Interpretation

\[f(y) \approx f(x^{(k - 1)}) + f'(x^{(k - 1)}) \times (y - x^{(k - 1)}) + \frac{1}{2t} (y - x^{(k - 1)})^2\]
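
For completeness, minimizing this quadratic approximation over \(y\) recovers the update rule: setting the derivative with respect to \(y\) to zero gives \[f'(x^{(k - 1)}) + \frac{1}{t}(y - x^{(k - 1)}) = 0 \quad \Longrightarrow \quad y = x^{(k - 1)} - t\times f'(x^{(k - 1)}),\] which is exactly the gradient descent step.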

Visual interpretation

Gradient descent on our motivating example

f <- function(x) exp(x) + x^4 ## original function
g <- function(x) exp(x) + 4 * x^3 ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 0.8; x_old <- 0; t <- 0.1; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error){
  k <- k + 1
  x_old <- x
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
}

trace_grad <- trace
print(trace)
##  [1]  0.80000000  0.37264591  0.20678990  0.08028037 -0.02828567
##  [6] -0.12548768 -0.21290391 -0.28986708 -0.35496119 -0.40719158
## [11] -0.44673749 -0.47504573 -0.49435026 -0.50702281 -0.51511484
## [16] -0.52018513 -0.52332289 -0.52524940 -0.52642641 -0.52714331
## [21] -0.52757914 -0.52784381 -0.52800441 -0.52810183 -0.52816091
## [26] -0.52819673 -0.52821844 -0.52823161 -0.52823959 -0.52824443
## [31] -0.52824736 -0.52824914 -0.52825021 -0.52825087

A non-convex example

Discussion: will the gradient descent algorithm always converge?

No; see the example below.
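
A quick calculation (added here for reference) explains the behavior in this run and the next two. For \(f(x) = x^2\) the update is \[x^{(k)} = x^{(k - 1)} - t \cdot 2 x^{(k - 1)} = (1 - 2t) x^{(k - 1)} = (1 - 2t)^k x^{(0)},\] so the iterates diverge when \(|1 - 2t| > 1\) (e.g. \(t = 1.1\) below), converge geometrically when \(0 < t < 1\) (e.g. \(t = 0.2\)), and converge very slowly when \(t\) is tiny (e.g. \(t = 0.01\)).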

f <- function(x) x^2 ## original function
g <- function(x) 2*x ## gradient function
curve(f, from = -10, to = 10, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t <- 1.1; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
##  [1]    1.000000   -1.200000    1.440000   -1.728000    2.073600
##  [6]   -2.488320    2.985984   -3.583181    4.299817   -5.159780
## [11]    6.191736   -7.430084    8.916100  -10.699321   12.839185
## [16]  -15.407022   18.488426  -22.186111   26.623333  -31.948000
## [21]   38.337600  -46.005120   55.206144  -66.247373   79.496847
## [26]  -95.396217  114.475460 -137.370552  164.844662 -197.813595
## [31]  237.376314

Change to another stepsize

f <- function(x) x^2 ## original function
g <- function(x) 2*x ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t <- 0.2; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
##  [1] 1.000000e+00 6.000000e-01 3.600000e-01 2.160000e-01 1.296000e-01
##  [6] 7.776000e-02 4.665600e-02 2.799360e-02 1.679616e-02 1.007770e-02
## [11] 6.046618e-03 3.627971e-03 2.176782e-03 1.306069e-03 7.836416e-04
## [16] 4.701850e-04 2.821110e-04 1.692666e-04 1.015600e-04 6.093597e-05
## [21] 3.656158e-05 2.193695e-05 1.316217e-05 7.897302e-06 4.738381e-06
## [26] 2.843029e-06 1.705817e-06 1.023490e-06

Change to another stepsize

f <- function(x) x^2 ## original function
g <- function(x) 2*x ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t <- 0.01; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
##  [1] 1.0000000 0.9800000 0.9604000 0.9411920 0.9223682 0.9039208 0.8858424
##  [8] 0.8681255 0.8507630 0.8337478 0.8170728 0.8007314 0.7847167 0.7690224
## [15] 0.7536419 0.7385691 0.7237977 0.7093218 0.6951353 0.6812326 0.6676080
## [22] 0.6542558 0.6411707 0.6283473 0.6157803 0.6034647 0.5913954 0.5795675
## [29] 0.5679762 0.5566167 0.5454843

How to select the stepsize \(t\): backtracking line search
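
For reference, the rule used in the implementations below is a standard sufficient-decrease (Armijo-type) criterion: start with \(t = t_0\) and shrink \(t \leftarrow \beta t\) (here \(0 < \beta < 1\) and \(0 < \alpha \le 1/2\)) as long as \[f(x - t f'(x)) > f(x) - \alpha t f'(x)^2.\]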

Implementation of back-tracking for gradient descent

f <- function(x) x^2 ## original function
g <- function(x) 2*x ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t0 <- 1; k <- 0; error <- 1e-6; beta = 0.8; alpha <- 0.4
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  
  ## backtracking
  t <- t0
  while(f(x_old - t*g(x_old)) > f(x_old) - alpha * t * g(x_old)^2){
    t <- t * beta
  }
  
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
## [1]  1.000000e+00 -2.400000e-02  5.760000e-04 -1.382400e-05  3.317760e-07
## [6] -7.962624e-09

Implementation of back-tracking for gradient descent on the motivating example

f <- function(x) exp(x) + x^4 ## original function
g <- function(x) exp(x) + 4 * x^3 ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t0 <- 1; k <- 0; error <- 1e-6; beta = 0.8; alpha <- 0.4
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  
  ## backtracking
  t <- t0
  while(f(x_old - t*g(x_old)) > f(x_old) - alpha * t * g(x_old)^2){
    t <- t * beta
  }
  
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
## [1]  1.00000000  0.09828748 -0.61024238 -0.53353118 -0.52803658 -0.52825877
## [7] -0.52825165 -0.52825188

Exercise (will be on HW): solve for logistic regression

library(ElemStatLearn)
glm_binomial_logit <- glm(svi ~ lcavol, data = prostate,  family = binomial())
summary(glm_binomial_logit)
## 
## Call:
## glm(formula = svi ~ lcavol, family = binomial(), data = prostate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79924  -0.48354  -0.21025  -0.04274   2.32135  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.0296     1.0429  -4.823 1.42e-06 ***
## lcavol        1.9798     0.4543   4.358 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.35  on 96  degrees of freedom
## Residual deviance:  64.14  on 95  degrees of freedom
## AIC: 68.14
## 
## Number of Fisher Scoring iterations: 6

Exercise (will be on HW): some hints

\(\begin{aligned} l(\beta_0, \beta_1) &= \log\prod_{i = 1}^n p(x_i)^{y_i}(1 - p(x_i))^{(1 - y_i)} \\ & = \sum_{i=1}^n y_i \log p(x_i) + (1 - y_i) \log (1 - p(x_i)) \\ & = \sum_{i=1}^n \log (1 - p(x_i)) + \sum_{i=1}^n y_i \log \frac{p(x_i)}{1 - p(x_i)} \\ & = \sum_{i=1}^n -\log(1 + \exp(\beta_0 + x_i \beta_1)) + \sum_{i=1}^n y_i (\beta_0 + x_i\beta_1)\\ \end{aligned}\)
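
One further hint, added here (verify the algebra yourself): writing \(p(x_i) = \frac{\exp(\beta_0 + x_i \beta_1)}{1 + \exp(\beta_0 + x_i \beta_1)}\) and differentiating the last line gives \[\frac{\partial l}{\partial \beta_0} = \sum_{i=1}^n (y_i - p(x_i)), \qquad \frac{\partial l}{\partial \beta_1} = \sum_{i=1}^n x_i (y_i - p(x_i)).\] Maximizing \(l\) is equivalent to minimizing \(-l\), so gradient descent can be applied to \(-l\).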

Newton’s method
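
For reference, the update rule implemented in the code below: choose an initial \(x^{(0)} \in \mathbb{R}\) and repeat \[x^{(k)} = x^{(k - 1)} - \frac{f'(x^{(k - 1)})}{f''(x^{(k - 1)})}, \quad k = 1,2,3,\ldots\]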

Newton’s method interpretation
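
One standard way to interpret the update (paralleling the gradient descent interpretation above) is to replace the \(\frac{1}{2t}\) term by the true curvature: \[f(y) \approx f(x^{(k - 1)}) + f'(x^{(k - 1)})(y - x^{(k - 1)}) + \frac{1}{2} f''(x^{(k - 1)}) (y - x^{(k - 1)})^2.\] When \(f''(x^{(k - 1)}) > 0\), this quadratic is minimized at \(y = x^{(k - 1)} - f'(x^{(k - 1)})/f''(x^{(k - 1)})\), i.e., the Newton step.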

Newton’s method on our motivating example

f <- function(x) exp(x) + x^4 ## original function
g <- function(x) exp(x) + 4 * x^3 ## gradient function
h <- function(x) exp(x) + 12 * x^2 ## Hessian function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 0.8; x_old <- 0; t <- 0.1; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error){
  k <- k + 1
  x_old <- x
  x <- x_old - 1/h(x_old)*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
}
lines(trace, f(trace), lty = 2)

trace_newton <- trace
print(trace)
## [1]  0.8000000  0.3685707 -0.1665553 -0.8686493 -0.6362012 -0.5432407
## [7] -0.5285880 -0.5282520 -0.5282519

Compare gradient descent with Newton’s method

f <- function(x) exp(x) + x^4 ## original function
par(mfrow=c(1,2))
title_grad <- paste("gradient descent, nstep =", length(trace_grad))
curve(f, from = -1, to = 1, lwd=2, main = title_grad) 
points(trace_grad, f(trace_grad), col=seq_along(trace_grad), pch=19)
lines(trace_grad, f(trace_grad), lty = 2)

title_newton <- paste("Newton's method, nstep =", length(trace_newton))
curve(f, from = -1, to = 1, lwd=2, main = title_newton) 
points(trace_newton, f(trace_newton), col=seq_along(trace_newton), pch=19)
lines(trace_newton, f(trace_newton), lty = 2)

Alternative interpretation about Newton’s method
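
One common alternative view, noted here for reference: Newton's method for minimizing \(f\) is Newton's root-finding method applied to the equation \(f'(x) = 0\), which connects it to the root-finding section later in these notes.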

Backtracking line search for Newton’s method
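
For reference, the criterion used in the implementation below: start with \(t = t_0\) and shrink \(t \leftarrow \beta t\) as long as \[f\Big(x - t \frac{f'(x)}{f''(x)}\Big) > f(x) - \alpha t \frac{f'(x)^2}{f''(x)}.\]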

Implementation of back-tracking for Newton’s method

f <- function(x) exp(x) + x^4 ## original function
g <- function(x) exp(x) + 4 * x^3 ## gradient function
h <- function(x) exp(x) + 12 * x^2 ## Hessian function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t0 <- 1; k <- 0; error <- 1e-6; beta = 0.8; alpha <- 0.4
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  
  ## backtracking
  t <- t0
  while(f(x_old - t*g(x_old)/h(x_old)) > f(x_old) - alpha * t * g(x_old)^2/h(x_old)){
    t <- t * beta
  }
  
  x <- x_old - t*g(x_old)/h(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
## [1]  1.00000000  0.54354171  0.09465801 -0.63631402 -0.54326938 -0.52858926
## [7] -0.52825205 -0.52825187

Exercise (will be on HW): solve for logistic regression

library(ElemStatLearn)
glm_binomial_logit <- glm(svi ~ lcavol, data = prostate,  family = binomial())
summary(glm_binomial_logit)
## 
## Call:
## glm(formula = svi ~ lcavol, family = binomial(), data = prostate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79924  -0.48354  -0.21025  -0.04274   2.32135  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.0296     1.0429  -4.823 1.42e-06 ***
## lcavol        1.9798     0.4543   4.358 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.35  on 96  degrees of freedom
## Residual deviance:  64.14  on 95  degrees of freedom
## AIC: 68.14
## 
## Number of Fisher Scoring iterations: 6

Comparison between gradient descent and Newton’s method

Method                     | Gradient descent         | Newton’s method
Order                      | First-order method       | Second-order method
Criterion                  | smooth \(f\)             | doubly smooth \(f\)
Convergence (# iterations) | Slow                     | Fast
Iteration cost             | Cheap (compute gradient) | Moderate to expensive (compute Hessian)

Root finding

Root finding in R

afunction <- function(x) cos(x) - x
curve(afunction, from = -10, to=10)

aroot <- uniroot(function(x) cos(x) - x, lower = -10, upper = 10, tol = 1e-9)$root
curve(afunction, from = -10, to=10)
points(aroot, afunction(aroot), col=2, pch=19)

print(aroot)
## [1] 0.7390851

Root finding by Newton’s method (optional)
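
For reference, to find a root of \(f\) (i.e., \(x\) with \(f(x) = 0\)), Newton's method starts from an initial \(x^{(0)}\) and repeats \[x^{(k)} = x^{(k - 1)} - \frac{f(x^{(k - 1)})}{f'(x^{(k - 1)})}, \quad k = 1,2,3,\ldots\] which is what the code below implements.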

Objective function and gradient

tol <- 1e-9
f <- function(x) cos(x) - x ## objective function: we seek x with f(x) = 0
g <- function(x) -sin(x) - 1 ## derivative of f

niter <- 1; maxiter <- 100
x0 <- 4
x <- x0
x_old <- rnorm(1) ## arbitrary value different from x, so the while loop is entered
while(abs(x - x_old) > tol & niter < maxiter){
  x_old <- x
  x <- x_old - f(x_old)/g(x_old)
  niter <- niter + 1
}

curve(f, from = -10, to=10)
points(x, f(x), col=2, pch=19)

print(x)
## [1] 0.7390851
Root finding by bisection

tol <- 1e-9
f <- function(x) cos(x) - x
L <- -8; R <- 9
niter <- 1; maxiter <- 100

while(abs(L - R) > tol & niter < maxiter){ ## bisection: keep a bracket [L, R] with f(L) > 0 > f(R)
  fmid <- f((L+R)/2) ## evaluate f at the midpoint
  if(fmid>0){
    L <- (L+R)/2 ## midpoint still positive: the root lies in the right half
  } else {
    R <- (L+R)/2 ## midpoint non-positive: the root lies in the left half
  }
  niter <- niter + 1
}

root_binary <- (L+R)/2
curve(f, from = -10, to=10)
points(root_binary,f(root_binary),col=2,pch=19)

print(root_binary)
## [1] 0.7390851

Reference

http://www.stat.cmu.edu/~ryantibs/convexopt/

knitr::purl("optimization.Rmd", output = "optimization.R ", documentation = 2)
## processing file: optimization.Rmd
## output file: optimization.R
## [1] "optimization.R "