Biostatistical Computing, PHC 6068

Optimization

Zhiguang Huo (Caleb)

Monday Nov 2, 2020

Outline

Optimization

Convex function

Convex function properties

Optimization functions in R

Univariate optimization example

Suppose our objective function is \[f(x) = e^x + x^4\] What is the minimum value of \(f(x)\), and what is the corresponding \(x\)?

f <- function(x) exp(x) + x^4
curve(f, from = -1, to = 1)

Use the R optimize() function to perform the optimization

f <- function(x) exp(x) + x^4
xmin <- optimize(f, interval = c(-10, 10))
xmin
## $minimum
## [1] -0.5282468
## 
## $objective
## [1] 0.6675038
curve(f, from = -1, to = 1)
points(xmin$minimum, xmin$objective, col="red", pch=19)

Gradient descent

Gradient descent: choose an initial \(x^{(0)} \in \mathbb{R}\), then repeat: \[x^{(k)} = x^{(k - 1)} - t\times f'(x^{(k - 1)}), k = 1,2,3,\ldots\]

Interpretation

\[f(y) \approx f(x^{(k - 1)}) + f'(x^{(k - 1)}) \times (y - x^{(k - 1)}) + \frac{1}{2t} (y - x^{(k - 1)})^2\]
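
Minimizing this quadratic approximation of \(f\) around \(x^{(k-1)}\) over \(y\) recovers the gradient descent update:

\[\frac{\partial}{\partial y}\Big[f(x^{(k - 1)}) + f'(x^{(k - 1)})(y - x^{(k - 1)}) + \frac{1}{2t}(y - x^{(k - 1)})^2\Big] = f'(x^{(k - 1)}) + \frac{1}{t}(y - x^{(k - 1)}) = 0 \Rightarrow y = x^{(k - 1)} - t f'(x^{(k - 1)})\]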

Visual interpretation

Gradient descent on our motivating example

f <- function(x) exp(x) + x^4 ## original function
g <- function(x) exp(x) + 4 * x^3 ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 0.8; x_old <- 0; t <- 0.1; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error){ ## there is a scale issue. 
  ## E.g., if x = 1e-7, x_old = 1e-6, relative change is large, absolute change is small
  ## We can monitor abs(x - x_old) / (abs(x) + abs(x_old))
  k <- k + 1
  x_old <- x
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
}

trace_grad <- trace
print(trace)
##  [1]  0.80000000  0.37264591  0.20678990  0.08028037 -0.02828567 -0.12548768
##  [7] -0.21290391 -0.28986708 -0.35496119 -0.40719158 -0.44673749 -0.47504573
## [13] -0.49435026 -0.50702281 -0.51511484 -0.52018513 -0.52332289 -0.52524940
## [19] -0.52642641 -0.52714331 -0.52757914 -0.52784381 -0.52800441 -0.52810183
## [25] -0.52816091 -0.52819673 -0.52821844 -0.52823161 -0.52823959 -0.52824443
## [31] -0.52824736 -0.52824914 -0.52825021 -0.52825087

A non-convex example

Discussion: will the gradient descent algorithm always converge?

No; see the example below, where a step size that is too large makes the iterates diverge.

f <- function(x) x^2 ## original function
g <- function(x) 2*x ## gradient function
curve(f, from = -10, to = 10, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t <- 1.1; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){ ## there is a scale issue. 
  ## E.g., if x = 1e-7, x_old = 1e-6, relative change is large, absolute change is small
  ## We can monitor abs(x - x_old) / (abs(x) + abs(x_old))
  k <- k + 1
  x_old <- x
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
##  [1]    1.000000   -1.200000    1.440000   -1.728000    2.073600   -2.488320
##  [7]    2.985984   -3.583181    4.299817   -5.159780    6.191736   -7.430084
## [13]    8.916100  -10.699321   12.839185  -15.407022   18.488426  -22.186111
## [19]   26.623333  -31.948000   38.337600  -46.005120   55.206144  -66.247373
## [25]   79.496847  -95.396217  114.475460 -137.370552  164.844662 -197.813595
## [31]  237.376314

Change to another step size

f <- function(x) x^2 ## original function
g <- function(x) 2*x ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t <- 0.2; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
##  [1] 1.000000e+00 6.000000e-01 3.600000e-01 2.160000e-01 1.296000e-01
##  [6] 7.776000e-02 4.665600e-02 2.799360e-02 1.679616e-02 1.007770e-02
## [11] 6.046618e-03 3.627971e-03 2.176782e-03 1.306069e-03 7.836416e-04
## [16] 4.701850e-04 2.821110e-04 1.692666e-04 1.015600e-04 6.093597e-05
## [21] 3.656158e-05 2.193695e-05 1.316217e-05 7.897302e-06 4.738381e-06
## [26] 2.843029e-06 1.705817e-06 1.023490e-06

Change to another step size

f <- function(x) x^2 ## original function
g <- function(x) 2*x ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t <- 0.01; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
##  [1] 1.0000000 0.9800000 0.9604000 0.9411920 0.9223682 0.9039208 0.8858424
##  [8] 0.8681255 0.8507630 0.8337478 0.8170728 0.8007314 0.7847167 0.7690224
## [15] 0.7536419 0.7385691 0.7237977 0.7093218 0.6951353 0.6812326 0.6676080
## [22] 0.6542558 0.6411707 0.6283473 0.6157803 0.6034647 0.5913954 0.5795675
## [29] 0.5679762 0.5566167 0.5454843

How to select the step size \(t\): backtracking line search

With descent direction \(\Delta x = -f'(x)\), backtracking shrinks \(t\) until the objective drops below the relaxed linear approximation \[f(x) + t\alpha f'(x)\Delta x = f(x) - t\alpha f'(x)^2,\] i.e., while \(f(x - t f'(x)) > f(x) - \alpha t f'(x)^2\), set \(t \leftarrow \beta t\) with \(0 < \beta < 1\) and \(0 < \alpha \le 1/2\).

Backtracking line search algorithm

Note: in each outer iteration, re-initialize \(t = 1\) (i.e., \(t_0 = 1\)) before starting the backtracking line search.
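
The routine can also be written as a small reusable helper (a minimal sketch; the name backtrack and its default arguments are illustrative, and the implementation below inlines the same loop instead):

backtrack <- function(f, g, x, alpha = 0.4, beta = 0.8, t0 = 1){
  ## shrink t until the sufficient-decrease (Armijo) condition holds
  t <- t0
  while(f(x - t * g(x)) > f(x) - alpha * t * g(x)^2){
    t <- t * beta
  }
  t
}
## e.g., step size for f(x) = exp(x) + x^4 at x = 1:
## backtrack(function(x) exp(x) + x^4, function(x) exp(x) + 4 * x^3, 1)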

Implementation of backtracking for gradient descent on the motivating example

f <- function(x) exp(x) + x^4 ## original function
g <- function(x) exp(x) + 4 * x^3 ## gradient function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t0 <- 1; k <- 0; error <- 1e-6; beta = 0.8; alpha <- 0.4
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  
  ## backtracking
  t <- t0
  while(f(x_old - t*g(x_old)) > f(x_old) - alpha * t * g(x_old)^2){
    t <- t * beta
  }
  
  x <- x_old - t*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
## [1]  1.00000000  0.09828748 -0.61024238 -0.53353118 -0.52803658 -0.52825877
## [7] -0.52825165 -0.52825188

Exercise (will be on HW): solve the logistic regression problem

library(ElemStatLearn)
data2 <- prostate[,c("svi", "lcavol")]
str(data2)
## 'data.frame':    97 obs. of  2 variables:
##  $ svi   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lcavol: num  -0.58 -0.994 -0.511 -1.204 0.751 ...

\[\log \frac{E(Y|x)}{1 - E(Y|x)} = \beta_0 + x\beta_1\]

Exercise (will be on HW): solve the logistic regression problem

glm_binomial_logit <- glm(svi ~ lcavol, data = prostate,  family = binomial())
summary(glm_binomial_logit)
## 
## Call:
## glm(formula = svi ~ lcavol, family = binomial(), data = prostate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79924  -0.48354  -0.21025  -0.04274   2.32135  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.0296     1.0429  -4.823 1.42e-06 ***
## lcavol        1.9798     0.4543   4.358 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.35  on 96  degrees of freedom
## Residual deviance:  64.14  on 95  degrees of freedom
## AIC: 68.14
## 
## Number of Fisher Scoring iterations: 6

Exercise (will be on HW): some hints

\(\begin{aligned} l(\beta_0, \beta_1) &= \log\prod_{i = 1}^n p(x_i)^{y_i}(1 - p(x_i))^{(1 - y_i)} \\ & = \sum_{i=1}^n y_i \log p(x_i) + (1 - y_i) \log (1 - p(x_i)) \\ & = \sum_{i=1}^n \log (1 - p(x_i)) + \sum_{i=1}^n y_i \log \frac{p(x_i)}{1 - p(x_i)} \\ & = \sum_{i=1}^n -\log(1 + \exp(\beta_0 + x_i \beta_1)) + \sum_{i=1}^n y_i (\beta_0 + x_i\beta_1)\\ \end{aligned}\)
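
As a starting point only (not the HW solution), the last line of the derivation can be translated directly into R; loglik below is a hypothetical helper name, and data2 is the data frame created above:

## log-likelihood l(beta0, beta1) for the svi ~ lcavol model (sketch, not the full HW solution)
loglik <- function(beta0, beta1, x = data2$lcavol, y = data2$svi){
  eta <- beta0 + x * beta1
  sum(-log(1 + exp(eta)) + y * eta)
}
## e.g., evaluate near the glm() estimates: loglik(-5.0296, 1.9798)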

Newton’s method
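
For reference, the univariate Newton update that the example code below implements replaces the fixed step size \(t\) with the reciprocal of the second derivative:

\[x^{(k)} = x^{(k - 1)} - \frac{f'(x^{(k - 1)})}{f''(x^{(k - 1)})}, k = 1,2,3,\ldots\]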

Newton’s method interpretation

Newton’s method on our motivating example

f <- function(x) exp(x) + x^4 ## original function
g <- function(x) exp(x) + 4 * x^3 ## gradient function
h <- function(x) exp(x) + 12 * x^2 ## Hessian function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 0.8; x_old <- 0; t <- 0.1; k <- 0; error <- 1e-6
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error){
  k <- k + 1
  x_old <- x
  x <- x_old - 1/h(x_old)*g(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
}
lines(trace, f(trace), lty = 2)

trace_newton <- trace
print(trace)
## [1]  0.8000000  0.3685707 -0.1665553 -0.8686493 -0.6362012 -0.5432407 -0.5285880
## [8] -0.5282520 -0.5282519

Compare gradient descent with Newton’s method

f <- function(x) exp(x) + x^4 ## original function
par(mfrow=c(1,2))
title_grad <- paste("gradient descent, nstep =", length(trace_grad))
curve(f, from = -1, to = 1, lwd=2, main = title_grad) 
points(trace_grad, f(trace_grad), col=seq_along(trace_grad), pch=19)
lines(trace_grad, f(trace_grad), lty = 2)

title_newton <- paste("Newton's method, nstep =", length(trace_newton))
curve(f, from = -1, to = 1, lwd=2, main = title_newton) 
points(trace_newton, f(trace_newton), col=seq_along(trace_newton), pch=19)
lines(trace_newton, f(trace_newton), lty = 2)

An alternative interpretation of Newton’s method

Backtracking line search for Newton’s method

Implementation of backtracking for Newton’s method

f <- function(x) exp(x) + x^4 ## original function
g <- function(x) exp(x) + 4 * x^3 ## gradient function
h <- function(x) exp(x) + 12 * x^2 ## Hessian function
curve(f, from = -1, to = 1, lwd=2) ## visualize the objective function

x <- x0 <- 1; x_old <- 0; t0 <- 1; k <- 0; error <- 1e-6; beta = 0.8; alpha <- 0.4
trace <- x
points(x0, f(x0), col=1, pch=1)

while(abs(x - x_old) > error & k < 30){
  k <- k + 1
  x_old <- x
  
  ## backtracking
  t <- t0
  while(f(x_old - t*g(x_old)/h(x_old)) > f(x_old) - alpha * t * g(x_old)^2/h(x_old)){
    t <- t * beta
  }
  
  x <- x_old - t*g(x_old)/h(x_old)
  trace <- c(trace, x) ## collecting results
  points(x, f(x), col=k, pch=19)
  segments(x0=x, y0 = f(x), x1 = x_old, y1 = f(x_old))
}

print(trace)
## [1]  1.00000000  0.54354171  0.09465801 -0.63631402 -0.54326938 -0.52858926
## [7] -0.52825205 -0.52825187

Exercise (will be on HW): solve the logistic regression problem

library(ElemStatLearn)
glm_binomial_logit <- glm(svi ~ lcavol, data = prostate,  family = binomial())
summary(glm_binomial_logit)
## 
## Call:
## glm(formula = svi ~ lcavol, family = binomial(), data = prostate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79924  -0.48354  -0.21025  -0.04274   2.32135  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.0296     1.0429  -4.823 1.42e-06 ***
## lcavol        1.9798     0.4543   4.358 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.35  on 96  degrees of freedom
## Residual deviance:  64.14  on 95  degrees of freedom
## AIC: 68.14
## 
## Number of Fisher Scoring iterations: 6

Comparison between gradient descent and Newton’s method

Method                     | Gradient descent         | Newton’s method
Order                      | First-order method       | Second-order method
Criterion                  | smooth \(f\)             | doubly smooth \(f\)
Convergence (# iterations) | slow                     | fast
Iteration cost             | cheap (compute gradient) | moderate to expensive (compute Hessian)

Multivariate Case

Gradient in multivariate case

For \(f: \mathbb{R}^p \rightarrow \mathbb{R}\) and \(\beta = (\beta_1, \ldots, \beta_p)^\top \in \mathbb{R}^p\), the gradient is \[\nabla_\beta f(\beta) = \frac{\partial f(\beta)}{\partial \beta} = (\frac{\partial f(\beta)}{\partial \beta_1}, \frac{\partial f(\beta)}{\partial \beta_2}, \ldots, \frac{\partial f(\beta)}{\partial \beta_p})^\top \in \mathbb{R}^p\]

Visualization of the example function

f <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}


xx <- yy <- seq(-20, 20, len=1000)
zz <- outer(xx, yy, f)
# Contour plots
contour(xx,yy,zz, xlim = c(-10,10), ylim = c(-20,20), nlevels = 10, levels = seq(1,100,length.out = 10), main="Contour Plot")

Hessian in multivariate case

Recall the gradient \[\nabla_\beta f(\beta) = \frac{\partial f(\beta)}{\partial \beta} = (\frac{\partial f(\beta)}{\partial \beta_1}, \frac{\partial f(\beta)}{\partial \beta_2}, \ldots, \frac{\partial f(\beta)}{\partial \beta_p})^\top \in \mathbb{R}^p\] The Hessian is obtained by differentiating each component of the gradient again:

\[\Delta_\beta f(\beta) = \nabla^2_\beta f(\beta) = \nabla_\beta\frac{\partial f(\beta)}{\partial \beta} = (\nabla_\beta \frac{\partial f(\beta)}{\partial \beta_1}, \nabla_\beta \frac{\partial f(\beta)}{\partial \beta_2}, \ldots, \nabla_\beta \frac{\partial f(\beta)}{\partial \beta_p})^\top \]

\[\Delta_\beta f(\beta) = \begin{pmatrix} \frac{\partial^2 f(\beta)}{\partial \beta_1^2} & \frac{\partial^2 f(\beta)}{\partial \beta_1 \partial \beta_2} & \ldots & \frac{\partial^2 f(\beta)}{\partial \beta_1 \partial\beta_p}\\ \frac{\partial^2 f(\beta)}{\partial \beta_2 \partial \beta_1 } & \frac{\partial^2 f(\beta)}{\partial \beta_2^2} & \ldots & \frac{\partial^2 f(\beta)}{\partial \beta_2 \partial \beta_p} \\ \ldots &\ldots &\ldots &\ldots\\ \frac{\partial^2 f(\beta)}{\partial \beta_p \partial \beta_1} & \frac{\partial^2 f(\beta)}{\partial \beta_p \partial \beta_2} & \ldots & \frac{\partial^2 f(\beta)}{\partial\beta_p^2} \end{pmatrix} \]

Hessian in multivariate case example

\[f(x,y) = 4x^2 + y^2 + 2xy - x - y\]

\[\nabla f = (\frac{\partial f}{\partial x}, \frac{\partial f}{\partial y})^\top = \binom{8x + 2y - 1}{2y +2x - 1} \in \mathbb{R}^2\]

\[\Delta f = \begin{pmatrix} 8 & 2 \\ 2 & 2 \end{pmatrix} \in \mathbb{R}^{2\times 2}\]
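
If the numDeriv package is available (an assumption; it is not used elsewhere in these notes), the hand-computed gradient and Hessian can be checked numerically:

library(numDeriv)
f2 <- function(b) 4 * b[1]^2 + b[2]^2 + 2 * b[1] * b[2] - b[1] - b[2]
grad(f2, c(1, 1))     ## analytic answer: c(8 + 2 - 1, 2 + 2 - 1) = c(9, 3)
hessian(f2, c(1, 1))  ## analytic answer: matrix(c(8, 2, 2, 2), 2, 2)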

Gradient descent for multivariate case

Gradient descent: choose an initial \(\beta^{(0)} \in \mathbb{R}^p\), then repeat: \[\beta^{(k)} = \beta^{(k - 1)} - t\times \nabla_\beta f(\beta^{(k - 1)}), k = 1,2,3,\ldots\]

Gradient descent on our motivating example

f <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

g <- function(x,y){
  c(8*x + 2*y - 1, 2*y +2*x - 1)
}

xx <- yy <- seq(-20, 20, len=1000)
zz <- outer(xx, yy, f)
# Contour plots
contour(xx,yy,zz, xlim = c(-10,10), ylim = c(-20,20), nlevels = 10, levels = seq(1,100,length.out = 10), main="Contour Plot")


x <- x0 <- 8; x_old <- 0; 
y <- y0 <- -10; y_old <- 0; 
curXY <- c(x,y)
preXY <- c(x_old, y_old)
t <- 0.1; k <- 0; error <- 1e-6
trace <- list(curXY)
points(x0,y0, col=1, pch=1)

l2n <- function(avec){
  sqrt(sum(avec^2))
}
diffChange <- function(avec, bvec){
  deltaVec <- avec - bvec
  sumVec <- avec + bvec
  l2n(deltaVec)/l2n(sumVec)
}

while(diffChange(curXY, preXY) > error){
  k <- k + 1
  preXY <- curXY
  curXY <- preXY - t*g(preXY[1], preXY[2])
  trace <- c(trace, list(curXY)) ## collecting results
  points(curXY[1], curXY[2], col=k, pch=19)
  segments(curXY[1], curXY[2], preXY[1], preXY[2])
}

print(k)
## [1] 97

How to select the step size \(t\): backtracking line search

With descent direction \(\Delta x = -\nabla f(x)\), backtracking shrinks \(t\) until the objective drops below the relaxed linear approximation \[f(x) + t\alpha \nabla f(x)^\top\Delta x = f(x) - t\alpha \|\nabla f(x)\|_2^2,\] i.e., while \(f(x - t\nabla f(x)) > f(x) - \alpha t \|\nabla f(x)\|_2^2\), set \(t \leftarrow \beta t\).

Implementation of backtracking for gradient descent

f0 <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

f <- function(avec){
  x <- avec[1]
  y <- avec[2]
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

g <- function(avec){
  x <- avec[1]
  y <- avec[2]  
  c(8*x + 2*y - 1, 2*y +2*x - 1)
}

x <- y <- seq(-20, 20, len=1000)
z <- outer(x, y, f0)
# Contour plots
contour(x,y,z, xlim = c(-10,10), ylim = c(-20,20), nlevels = 10, levels = seq(1,100,length.out = 10), main="Contour Plot")


x <- x0 <- 8; x_old <- 0; 
y <- y0 <- -10; y_old <- 0; 
curXY <- c(x,y)
preXY <- c(x_old, y_old)
t0 <- 1; k <- 0; error <- 1e-6
alpha = 1/3
beta = 1/2
trace <- list(curXY)
points(x0,y0, col=1, pch=1)

l2n <- function(avec){
  sqrt(sum(avec^2))
}
diffChange <- function(avec, bvec){
  deltaVec <- avec - bvec
  sumVec <- avec + bvec
  l2n(deltaVec)/l2n(sumVec)
}

while(diffChange(curXY, preXY) > error){
  k <- k + 1
  preXY <- curXY
  
  ## backtracking
  t <- t0
  while(f(preXY - t*g(preXY)) > f(preXY) - alpha * t * l2n(g(preXY))^2){
    t <- t * beta
  }
  
  curXY <- preXY - t*g(preXY)
  trace <- c(trace, list(curXY)) ## collecting results
  points(curXY[1], curXY[2], col=k, pch=19)
  segments(curXY[1], curXY[2], preXY[1], preXY[2])
  
}

print(k)
## [1] 25

Exercise: solve the logistic regression problem

library(ElemStatLearn)
glm_binomial_logit <- glm(svi ~ lcavol, data = prostate,  family = binomial())
summary(glm_binomial_logit)
## 
## Call:
## glm(formula = svi ~ lcavol, family = binomial(), data = prostate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79924  -0.48354  -0.21025  -0.04274   2.32135  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.0296     1.0429  -4.823 1.42e-06 ***
## lcavol        1.9798     0.4543   4.358 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.35  on 96  degrees of freedom
## Residual deviance:  64.14  on 95  degrees of freedom
## AIC: 68.14
## 
## Number of Fisher Scoring iterations: 6

Newton’s method for multivariate case
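
The update implemented below is the multivariate analogue of the univariate Newton step, with the inverse Hessian replacing the step size:

\[\beta^{(k)} = \beta^{(k - 1)} - [\Delta_\beta f(\beta^{(k - 1)})]^{-1} \nabla_\beta f(\beta^{(k - 1)}), k = 1,2,3,\ldots\]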

Implement multivariate Newton’s method

f0 <- function(x,y){
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

f <- function(avec){
  x <- avec[1]
  y <- avec[2]
  4 * x^2 + y^2 + 2 * x * y - x - y 
}

g <- function(avec){
  x <- avec[1]
  y <- avec[2]  
  c(8*x + 2*y - 1, 2*y +2*x - 1)
}

h <- function(avec){
  x <- avec[1]
  y <- avec[2]  
  res <- matrix(c(8,2,2,2),2,2)  ## Hessian function
  return(res)  
} 

x <- y <- seq(-20, 20, len=1000)
z <- outer(x, y, f0)
# Contour plots
contour(x,y,z, xlim = c(-10,10), ylim = c(-20,20), nlevels = 10, levels = seq(1,100,length.out = 10), main="Contour Plot")


x <- x0 <- 8; x_old <- 0; 
y <- y0 <- -10; y_old <- 0; 
curXY <- c(x,y)
preXY <- c(x_old, y_old)
t0 <- 1; k <- 0; error <- 1e-6
trace <- list(curXY)
points(x0,y0, col=1, pch=1)

l2n <- function(avec){
  sqrt(sum(avec^2))
}
diffChange <- function(avec, bvec){
  deltaVec <- avec - bvec
  sumVec <- avec + bvec
  l2n(deltaVec)/l2n(sumVec)
}

while(diffChange(curXY, preXY) > error){
  k <- k + 1
  preXY <- curXY
  
  curXY <- preXY - solve(h(preXY)) %*% g(preXY)
  trace <- c(trace, list(curXY)) ## collecting results
  points(curXY[1], curXY[2], col=k, pch=19)
  segments(curXY[1], curXY[2], preXY[1], preXY[2])
}

k ## why k is only 2?
## [1] 2

Exercise

\[f(x,y) = \exp(xy) + y^2 + x^4\]

What are the x and y such that \(f(x,y)\) is minimized?
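
One possible way to encode the objective, its gradient, and its Hessian as a starting point (a sketch; the derivatives were computed by hand and are worth double-checking):

f <- function(avec){
  x <- avec[1]; y <- avec[2]
  exp(x * y) + y^2 + x^4
}
g <- function(avec){ ## gradient: (y*exp(xy) + 4x^3, x*exp(xy) + 2y)
  x <- avec[1]; y <- avec[2]
  c(y * exp(x * y) + 4 * x^3, x * exp(x * y) + 2 * y)
}
h <- function(avec){ ## Hessian of exp(xy) + y^2 + x^4
  x <- avec[1]; y <- avec[2]
  matrix(c(y^2 * exp(x * y) + 12 * x^2, exp(x * y) * (1 + x * y),
           exp(x * y) * (1 + x * y),    x^2 * exp(x * y) + 2), 2, 2)
}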

Backtracking line search for Newton’s method

Exercise: solve the logistic regression problem

library(ElemStatLearn)
glm_binomial_logit <- glm(svi ~ lcavol, data = prostate,  family = binomial())
summary(glm_binomial_logit)
## 
## Call:
## glm(formula = svi ~ lcavol, family = binomial(), data = prostate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.79924  -0.48354  -0.21025  -0.04274   2.32135  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.0296     1.0429  -4.823 1.42e-06 ***
## lcavol        1.9798     0.4543   4.358 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 101.35  on 96  degrees of freedom
## Residual deviance:  64.14  on 95  degrees of freedom
## AIC: 68.14
## 
## Number of Fisher Scoring iterations: 6

Coordinate descent

Example on linear regression

\[ \min_{\beta \in \mathbb{R}^p} f(\beta) = \min_{\beta \in \mathbb{R}^p} \frac{1}{2} \|y - X\beta\|_2^2\] where \(y \in \mathbb{R}^n\) and \(X \in \mathbb{R}^{n \times p}\) with columns \(X_1\) (intercept), \(X_2\), \(\ldots\), \(X_p\).

\[\beta_j = \frac{X_j^\top (y - X_{-j}\beta_{-j})}{X_j^\top X_j}\]
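
This update follows from setting the partial derivative with respect to \(\beta_j\) to zero while holding the other coordinates fixed:

\[\frac{\partial f(\beta)}{\partial \beta_j} = -X_j^\top(y - X_j\beta_j - X_{-j}\beta_{-j}) = 0 \Rightarrow X_j^\top X_j \beta_j = X_j^\top (y - X_{-j}\beta_{-j})\]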

Implement coordinate descent using the prostate cancer data

library(ElemStatLearn)

y <- prostate$lcavol
x0 <- prostate[,-match(c("lcavol", "train"), colnames(prostate))]
x <- cbind(1, as.matrix(x0))

beta <- rep(0, ncol(x))     ## initialize all coefficients at 0
beta_old <- rnorm(ncol(x))  ## arbitrary previous value so the loop condition starts TRUE

error <- 1e-6
k <- 0
while(diffChange(beta, beta_old) > error){ ## diffChange() and l2n() are defined earlier in these notes
  k <- k + 1
  beta_old <- beta
  for(j in 1:length(beta)){
    xj <- x[,j]
    xj_else <- x[,-j]
    beta_j_else <- as.matrix(beta[-j])
    beta[j] <- t(xj) %*% (y - xj_else %*% beta_j_else) / sum(xj^2)
  }
}

print(beta)
## [1] -2.419610178 -0.025954949  0.022114840 -0.093134994 -0.152983715
## [6]  0.367195579  0.197095411 -0.007114941  0.565822653
## compare with lm
lm(lcavol ~ . - train, data=prostate)
## 
## Call:
## lm(formula = lcavol ~ . - train, data = prostate)
## 
## Coefficients:
## (Intercept)      lweight          age         lbph          svi  
##   -2.420613    -0.025925     0.022113    -0.093138    -0.152966  
##         lcp      gleason        pgg45         lpsa  
##    0.367180     0.197249    -0.007117     0.565826

Reference

http://www.stat.cmu.edu/~ryantibs/convexopt/