Biostatistical Computing, PHC 6068

Cross validation

Zhiguang Huo (Caleb)

Monday November 18, 2019

Outline

Holdout method to evaluate machine learning accuracy

Holdout method example

library(ElemStatLearn)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainLabel <- prostate$train
prostate0$train <- NULL

prostate_train <- subset(prostate0, trainLabel)
prostate_test <- subset(prostate0, !trainLabel)

rfFit <- randomForest(svi ~ ., data=prostate_train)
rfFit
## 
## Call:
##  randomForest(formula = svi ~ ., data = prostate_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 14.93%
## Confusion matrix:
##    0 1 class.error
## 0 48 4  0.07692308
## 1  6 9  0.40000000
rfPred <- predict(rfFit, prostate_test)
rfTable <- table(rfPred, truth=prostate_test$svi)
rfTable
##       truth
## rfPred  0  1
##      0 22  4
##      1  2  2
1 - sum(diag(rfTable))/sum(rfTable) ## error rate
## [1] 0.2
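
Note that the holdout estimate depends on which observations end up in the testing set. Here is a small sketch (added for illustration; the random 70/30 split is an arbitrary choice) that repeats a random split several times to show this variability, which motivates cross validation.

set.seed(32611)
holdoutErrors <- replicate(5, {
  ## a random 70/30 split instead of the provided train indicator
  idx <- sample(nrow(prostate0), size = floor(0.7 * nrow(prostate0)))
  afit <- randomForest(svi ~ ., data = prostate0[idx, ])
  apred <- predict(afit, prostate0[-idx, ])
  atable <- table(apred, truth = prostate0[-idx, "svi"])
  1 - sum(diag(atable)) / sum(atable)
})
round(holdoutErrors, 3)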

k-fold cross validation

Algorithm:

  1. Split the data into K roughly equal-sized folds
  2. For the \(k^{th}\) fold, train the classifier on the other K - 1 folds and predict the held-out \(k^{th}\) fold
  3. Record the prediction error \(E_k\) of the \(k^{th}\) fold
  4. Report the cross validation error as the average \(\frac{1}{K}\sum_{k=1}^K E_k\)

k-fold cross validation example

# Create K = 5 equally sized folds

K <- 5
folds <- cut(seq(1,nrow(prostate0)),breaks=K,labels=FALSE)

EKs <- numeric(K)

for(k in 1:K){
  atrain <- subset(prostate0, folds != k)
  atest <- subset(prostate0, folds == k)

  krfFit <- randomForest(svi ~ ., data=atrain)
  krfPred <- predict(krfFit, atest)
  krfTable <- table(krfPred, truth=atest$svi)
  
  EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}

EKs
## [1] 0.0000000 0.0000000 0.0000000 0.3157895 0.3500000
mean(EKs)
## [1] 0.1331579
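
One caveat: the folds above simply follow the row order of the data. If the rows are ordered in some systematic way, a common refinement (sketched below, added for illustration) is to shuffle the fold assignments so that each fold is a random subset; the fold sizes do not change.

set.seed(32611)
foldsShuffled <- sample(folds) ## randomly permute the fold assignments
table(foldsShuffled)           ## each fold keeps the same size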

Leave one out cross validation

For K-fold cross validation, taking K = n (the sample size) gives leave one out cross validation (LOOCV): each observation is held out once as the testing set while the model is trained on the remaining n - 1 observations.

Leave one out cross validation example

n <- nrow(prostate0)

EKs <- numeric(n)

for(k in 1:n){
  #cat("leaving the",k,"sample out!","\n")
  atrain <- prostate0[-k,]
  atest <- prostate0[k,]

  krfFit <- randomForest(svi ~ ., data=atrain)
  krfPred <- predict(krfFit, atest)
  krfTable <- table(krfPred, truth=atest$svi)
  
  EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}

EKs
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
## [71] 1 0 1 1 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 0 1 0 0 0 0 0
mean(EKs)
## [1] 0.1443299

Important note for cross validation

Choosing tuning parameters

Some machine learning classifiers contain tuning parameters

The tuning parameter is crucial since it controls the amount of regularization (model complexity).

Tuning parameter selection

Lasso problem

\[\hat{\beta}^{lasso} = \arg\min_{\beta \in \mathbb{R}^p} \frac{1}{2}\|y - X\beta\|_2^2 + \lambda \|\beta\|_1\]

library(lars)
## Loaded lars 1.2
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
options(stringsAsFactors=F)
x <- as.matrix(prostate[,1:8])
y <- prostate[,9]
lassoFit <- lars(x, y) ## lar for least angle regression

lambdas <- seq(0,5,0.5)
coefLassoFit <- coef(lassoFit, s=lambdas, mode="lambda") 
adataFrame <- NULL
for(i in 1:ncol(coefLassoFit)){
  acoef <- colnames(coefLassoFit)[i]
  adataFrame0 <- data.frame(lambda=lambdas, coef = coefLassoFit[,i], variable=acoef)
  adataFrame <- rbind(adataFrame, adataFrame0)
}

ggplot(adataFrame) + aes(x=lambda, y=coef, col=variable) + 
  geom_point() +
  geom_path()

How to select the best tuning parameter

Tuning parameter selection algorithm

  1. Pre-specify a set of candidate tuning parameters \(\lambda_1, \ldots, \lambda_B\)
  2. Split the data into K folds
  3. For each candidate \(\lambda_b\), compute the K-fold cross validation error \(e(\lambda_b)\)
  4. Choose the tuning parameter \(\hat{\lambda} = \arg \min_{\lambda\in (\lambda_1, \ldots, \lambda_B)} e(\lambda)\)

Implement tuning parameter selection for lasso problem

K <- 10
folds <- cut(seq(1,nrow(prostate)),breaks=K,labels=FALSE)
lambdas <- seq(0.5,5,0.5)

rMSEs <- matrix(NA,nrow=length(lambdas),ncol=K)
rownames(rMSEs) <- lambdas
colnames(rMSEs) <- 1:K

for(k in 1:K){
  atrainx <- as.matrix(prostate[!folds==k,1:8])
  atestx <- as.matrix(prostate[folds==k,1:8])
 
  atrainy <- prostate[!folds==k,9]
  atesty <- prostate[folds==k,9]

  klassoFit <- lars(atrainx, atrainy) ## lar for least angle regression

  predictValue <- predict.lars(klassoFit, atestx, s=lambdas, type="fit",  mode="lambda")
  rMSE <- apply(predictValue$fit,2, function(x) sqrt(mean((x - atesty)^2))) ## root mean squared error (RMSE)
  
  rMSEs[,k] <- rMSE
}

rMSEdataFrame <- data.frame(lambda=lambdas, rMSE = rowMeans(rMSEs))
ggplot(rMSEdataFrame) + aes(x=lambda, y = rMSE) + geom_point() + geom_path()

Common mistake about tuning parameter selection in machine learning

Combine selecting tuning parameters and evaluating prediction accuracy (a code sketch follows the steps below):

  1. Split the data into training set and testing set.
  2. In training set, use cross validation to determine the best tuning parameter.
  3. Use the best tuning parameter and the entire training set to build a classifier.
  4. Evaluation:
    • Predict the testing set
    • Report the accuracy.
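
To make these four steps concrete, here is a minimal sketch applying them to the lasso example above, with RMSE as the accuracy measure; the 70/30 split and the lambda grid are illustrative choices.

set.seed(32611)
lambdas <- seq(0.5, 5, 0.5)
K <- 5

## Step 1: split the data into a training set and a testing set (70/30 here)
idx <- sample(nrow(prostate), size = floor(0.7 * nrow(prostate)))
trainx <- as.matrix(prostate[idx, 1:8])
trainy <- prostate[idx, 9]
testx <- as.matrix(prostate[-idx, 1:8])
testy <- prostate[-idx, 9]

## Step 2: cross validation within the training set only, to choose lambda
folds <- sample(cut(seq_len(nrow(trainx)), breaks = K, labels = FALSE))
cvRMSE <- matrix(NA, nrow = length(lambdas), ncol = K)
for(k in 1:K){
  afit <- lars(trainx[folds != k, ], trainy[folds != k])
  apred <- predict.lars(afit, trainx[folds == k, ], s = lambdas,
                        type = "fit", mode = "lambda")$fit
  cvRMSE[, k] <- apply(apred, 2, function(x) sqrt(mean((x - trainy[folds == k])^2)))
}
bestLambda <- lambdas[which.min(rowMeans(cvRMSE))]

## Step 3: refit on the entire training set with the selected tuning parameter
finalFit <- lars(trainx, trainy)

## Step 4: predict the testing set and report the accuracy (RMSE)
finalPred <- predict.lars(finalFit, testx, s = bestLambda, type = "fit", mode = "lambda")$fit
sqrt(mean((finalPred - testy)^2))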

For Step 1, can we also use cross validation?

Nested Cross validation

  1. Use K-fold cross validation (outer) to split the original data into training and testing sets.
  2. For the \(k^{th}\) fold training set, use cross validation (inner) to determine the best tuning parameter for the \(k^{th}\) fold.
  3. Use the best tuning parameter of the \(k^{th}\) fold and the \(k^{th}\) fold training set to build a classifier.
  4. Predict the \(k^{th}\) fold testing set and report the accuracy.
  5. Evaluation:
    • Report the average accuracy from the outer K-fold cross validation (a code sketch follows this list).
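
Here is a minimal sketch of nested cross validation for the same lasso example, with RMSE in place of classification accuracy; the fold numbers and the lambda grid are illustrative choices. The inner cross validation only ever sees the outer-fold training data.

library(lars)

Kouter <- 5
Kinner <- 5
lambdas <- seq(0.5, 5, 0.5)

set.seed(32611)
outerFolds <- sample(cut(seq_len(nrow(prostate)), breaks = Kouter, labels = FALSE))
outerRMSE <- numeric(Kouter)

for(k in 1:Kouter){
  trainx <- as.matrix(prostate[outerFolds != k, 1:8])
  trainy <- prostate[outerFolds != k, 9]
  testx <- as.matrix(prostate[outerFolds == k, 1:8])
  testy <- prostate[outerFolds == k, 9]

  ## inner cross validation on the k-th outer training set to choose lambda
  innerFolds <- sample(cut(seq_len(nrow(trainx)), breaks = Kinner, labels = FALSE))
  innerRMSE <- matrix(NA, nrow = length(lambdas), ncol = Kinner)
  for(j in 1:Kinner){
    afit <- lars(trainx[innerFolds != j, ], trainy[innerFolds != j])
    apred <- predict.lars(afit, trainx[innerFolds == j, ], s = lambdas,
                          type = "fit", mode = "lambda")$fit
    innerRMSE[, j] <- apply(apred, 2, function(x) sqrt(mean((x - trainy[innerFolds == j])^2)))
  }
  bestLambda <- lambdas[which.min(rowMeans(innerRMSE))]

  ## refit on the full k-th outer training set and predict the k-th testing fold
  finalFit <- lars(trainx, trainy)
  finalPred <- predict.lars(finalFit, testx, s = bestLambda, type = "fit", mode = "lambda")$fit
  outerRMSE[k] <- sqrt(mean((finalPred - testy)^2))
}

outerRMSE
mean(outerRMSE) ## averaged error from the outer cross validation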

Other machine learning approaches

KNN motivating example

library(MASS)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(1,-1), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-1,1), matrix(c(2, 1, 1, 2), 2, 2))
atest <- rbind(c(1,1),c(2,-2),c(-2,2))
d <- rbind(d1, d2,atest)
label <- c(rep("A", N), rep("B", N),rep("test",3))
text <- label
text[label!="test"] <- ""
data_train <- data.frame(label=label, x=d[,1], y=d[,2], text=text)
names(data_train)<-c("label", "x", "y", "text")

ggplot(data_train, aes(x=x,y=y,col=label, label = text)) + 
  geom_point() + geom_text(vjust = -1,size=5)

KNN algorithm

Let \(N_k(x)\) denote the indices of the \(k\) training samples closest to \(x\).

For classification (majority vote among the \(k\) nearest neighbors): \[\hat{f}(x) = \text{mode}_{i \in N_k(x)} y_i\]

For regression (average over the \(k\) nearest neighbors): \[\hat{f}(x) = \frac{1}{k} \sum_{i \in N_k(x)} y_i\]
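
A minimal from-scratch sketch of the classification rule, applied to the motivating example above: each of the three test points is classified by the majority vote of its \(k\) nearest training points. This reimplements the rule only for illustration; the class package used below does the same job.

k <- 5
trainXY <- as.matrix(data_train[data_train$label != "test", c("x", "y")])
trainLab <- data_train$label[data_train$label != "test"]
testXY <- as.matrix(data_train[data_train$label == "test", c("x", "y")])

apply(testXY, 1, function(z){
  adist <- sqrt(colSums((t(trainXY) - z)^2))   ## Euclidean distance to every training point
  neighbors <- order(adist)[1:k]               ## indices of the k nearest neighbors, N_k(z)
  names(which.max(table(trainLab[neighbors]))) ## majority vote (the mode)
})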

KNN example

library(ElemStatLearn)
library(class)

prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainLabel <- prostate$train
prostate0$train <- NULL ## drop the train/test indicator so it is not used as a KNN feature

## here 5 is the index of svi in the prostate data
adata_train <- prostate0[trainLabel, -5]
adata_test <- prostate0[!trainLabel, -5]

alabel_train <- prostate0[trainLabel, 5]
alabel_test <- prostate0[!trainLabel, 5]

knnFit <- knn(adata_train, adata_test, alabel_train, k=5)

knnTable <- table(knnFit, truth=alabel_test)
knnTable
##       truth
## knnFit  0  1
##      0 22  5
##      1  2  1
1 - sum(diag(knnTable))/sum(knnTable)
## [1] 0.2333333

KNN Exercise (Will be in HW)

Linear discriminant analysis (LDA)

Problem setting:

\[f(X|Y=k) = \frac{1}{(2\pi)^{p/2} |\Sigma_k|^{1/2}} \exp ( - \frac{1}{2} (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k))\]

Intuition for Linear discriminant analysis (LDA)

library(MASS)
library(ggplot2)

set.seed(32611)
N<-100
d1<-mvrnorm(N, c(1,-1), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-1,1), matrix(c(2, 1, 1, 2), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("2", N))
data_train <- data.frame(label=label, x1=d[,1], x2=d[,2])
names(data_train)<-c("label", "x1", "x2")

ggplot(data_train) + aes(x=x1,y=x2,col=label) + geom_point() +  stat_ellipse()

Math behind Linear discriminant analysis (LDA)

\(\begin{aligned} C_{Bayes}(X) & = \arg \max_k f(Y=k | X) \\ & = \arg \max_k \frac{f(Y=k)f(X|Y=k)}{f(X)} \\ & = \arg \max_k f(Y=k)f(X|Y=k) \\ & = \arg \max_k \log f(Y=k) + \log f(X|Y=k) \\ & = \arg \max_k \log f(Y=k) - \frac{1}{2} \log |\Sigma_k| - \frac{1}{2} (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k) \\ & = \arg \min_k - 2 \log f(Y=k) + \log |\Sigma_k| + (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k) \end{aligned}\)

Decision boundary for Linear discriminant analysis (LDA)

The decision boundary for class \(m\) and class \(l\) is: \[f(Y=l|X) = f(Y=m|X)\]

Or equivalently,

\(\begin{aligned} & - 2 \log f(Y=l) + \log |\Sigma_l| + (X - \mu_l)^\top \Sigma_l^{-1} (X - \mu_l) \\ & = - 2 \log f(Y=m) + \log |\Sigma_m| + (X - \mu_m)^\top \Sigma_m^{-1} (X - \mu_m) \end{aligned}\)

Further assuming a uniform prior \(f(Y=l) = f(Y=m)\) and a common covariance structure \(\Sigma_l = \Sigma_m = \Sigma\), the decision hyperplane simplifies to:

\[2 (\mu_l - \mu_m)^\top \Sigma^{-1} X - (\mu_l - \mu_m)^\top \Sigma^{-1} (\mu_l + \mu_m) = 0\]

\[2 (\mu_l - \mu_m)^\top \Sigma^{-1} (X - \frac{\mu_l + \mu_m}{2}) = 0\]

You can see that the decision boundary is linear and passes through the midpoint of the two class centers, \(\frac{\mu_l + \mu_m}{2}\).
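
As a sanity check on this formula, the boundary can be computed directly from the sample means and a pooled covariance estimate of data_train and overlaid on the scatter plot (a sketch added for illustration; the pooled estimate is a simple average since the two groups have equal size).

mu1 <- colMeans(data_train[data_train$label == "1", c("x1", "x2")])
mu2 <- colMeans(data_train[data_train$label == "2", c("x1", "x2")])
Sigma <- (cov(data_train[data_train$label == "1", c("x1", "x2")]) +
          cov(data_train[data_train$label == "2", c("x1", "x2")])) / 2

a <- solve(Sigma, mu1 - mu2)  ## normal vector of the boundary
m <- (mu1 + mu2) / 2          ## midpoint of the two class centers

## boundary: a[1]*(x1 - m[1]) + a[2]*(x2 - m[2]) = 0
aslope <- -a[1] / a[2]
aintercept <- m[2] + a[1] / a[2] * m[1]

ggplot(data_train) + aes(x=x1, y=x2, col=label) + geom_point() +
  geom_abline(slope=aslope, intercept=aintercept, col="black")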

Visualize the decision boundary (LDA)

library(MASS)

alda <- lda(label ~ . ,data=data_train)

avec <- seq(-4,4,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x1=z[,1],x2=z[,2])
predict_lda <- predict(alda, z)$class

z_lda <- cbind(region=as.factor(predict_lda), z)
ggplot(z_lda) + aes(x=x1,y=x2,col=region) + geom_point(alpha = 0.4)  + geom_point(data = data_train, aes(x=x1,y=x2,col=label)) +
  labs(title = "LDA boundary")

Quadratic discriminant analysis (QDA)

Further assuming a uniform prior \(f(Y=l) = f(Y=m)\) but different covariance structures \(\Sigma_l\) and \(\Sigma_m\), the decision boundary simplifies to:

\(\begin{aligned} \log |\Sigma_l| + (X - \mu_l)^\top \Sigma_l^{-1} (X - \mu_l) = \log |\Sigma_m| + (X - \mu_m)^\top \Sigma_m^{-1} (X - \mu_m) \end{aligned}\)

You will see that the decision boundary is a quadratic function of \(X\).

library(MASS)
library(ggplot2)

set.seed(32611)
N<-100
d1<-mvrnorm(N, c(0,2), matrix(c(4, 0, 0, 1), 2, 2))
d2<-mvrnorm(N, c(0,-2), matrix(c(1, 0, 0, 4), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("2", N))
data_train <- data.frame(label=label, x1=d[,1], x2=d[,2])
names(data_train)<-c("label", "x1", "x2")

ggplot(data_train) + aes(x=x1,y=x2,col=label) + geom_point() +  stat_ellipse()

Visualize the decision boundary (QDA)

library(MASS)
aqda <- qda(label ~ . ,data=data_train)

avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x1=z[,1],x2=z[,2])
predict_qda <- predict(aqda, z)$class

z_qda <- cbind(region=as.factor(predict_qda), z)
ggplot(z_qda) + aes(x=x1,y=x2,col=region) + geom_point(alpha = 0.4)  + geom_point(data = data_train, aes(x=x1,y=x2,col=label)) +
  labs(title = "QDA boundary")

Support vector machine (SVM)

library(MASS)
library(ggplot2)

set.seed(32611)
N<-100
d1<-mvrnorm(N, c(3,-3), matrix(c(1, 0, 0, 1), 2, 2))
d2<-mvrnorm(N, c(-3,3), matrix(c(1, 0, 0, 1), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("-1", N))
data_train <- data.frame(label=label, x1=d[,1], x2=d[,2])
names(data_train)<-c("label", "x1", "x2")

p <- ggplot(data_train) + aes(x=x1,y=x2,col=label) + geom_point()
p + geom_abline(slope=1,intercept=0, col="black") + 
 geom_abline(slope=0.8,intercept=0.5, col="green") + 
 geom_abline(slope=1.4,intercept=-0.4, col = "orange")

SVM margin

p +
geom_abline(slope=1,intercept=max(d1[,2] - d1[,1]), col="black", linetype = "dashed") +
geom_abline(slope=1,intercept=min(d2[,2] - d2[,1]), col="black", linetype = "dashed") +
geom_abline(slope=1,intercept=(max(d1[,2] - d1[,1]) + min(d2[,2] - d2[,1]))/2, col="black")

Formulate optimization problem via SVM margin

\[\frac{w^\top x + 1 - (w^\top x - 1)}{2\|w\|_2} = \frac{2}{2\|w\|_2} = \frac{1}{\|w\|_2}\]

\(\max_{w} \frac{1}{\|w\|_2}\) subject to \(y_i (w^\top x_i ) \ge 1\)

SVM: large Margin Classifier

\(\min_{w} \frac{1}{2} \|w\|_2^2\) subject to \(y_i (w^\top x_i ) \ge 1\) This is the SVM primal objective function.

\[L(w, \alpha) = \frac{1}{2} \|w\|_2^2 - \sum_i \alpha_i [y_i (w^\top x_i ) - 1]\] By the KKT conditions, \(\frac{\partial L}{\partial w}\) needs to be 0 at the optimal solution.

\[\frac{\partial L}{\partial w} = w - \sum_i \alpha_i y_i x_i = 0\] \[w = \sum_i \alpha_i y_i x_i\]

Plugging terms back into Lagrange function \(L\): \[\max_\alpha - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^\top x_j + \sum_i \alpha_i\] Subject to \(\alpha_i \ge 0\)

This is the SVM dual objective function.
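
For completeness, here is the substitution that produces this dual, written out; it uses \(w = \sum_i \alpha_i y_i x_i\) from the KKT condition above.

\(\begin{aligned} L(w, \alpha) & = \frac{1}{2} w^\top w - \sum_i \alpha_i y_i w^\top x_i + \sum_i \alpha_i \\ & = \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^\top x_j - \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^\top x_j + \sum_i \alpha_i \\ & = - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^\top x_j + \sum_i \alpha_i \end{aligned}\)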

Geometric interpretation of SVM

p +
geom_abline(slope=1,intercept=max(d1[,2] - d1[,1]), col="black", linetype = "dashed") +
geom_abline(slope=1,intercept=min(d2[,2] - d2[,1]), col="black", linetype = "dashed") +
geom_abline(slope=1,intercept=(max(d1[,2] - d1[,1]) + min(d2[,2] - d2[,1]))/2, col="black")

From KKT condition:

\[\alpha_i [y_i w^\top x_i - 1] = 0\]

Use svm function in R

library(e1071)
library(ggplot2)

set.seed(32611)
N<-100
d1<-mvrnorm(N, c(3,-3), matrix(c(1, 0, 0, 1), 2, 2))
d2<-mvrnorm(N, c(-3,3), matrix(c(1, 0, 0, 1), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("-1", N))
data_train <- data.frame(label=factor(label), x1=d[,1], x2=d[,2])
names(data_train)<-c("label", "x1", "x2")

asvm <- svm(label ~ x1 + x2 ,data=data_train, kernel = "linear")

avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x1=z[,1],x2=z[,2])
predict_svm <- predict(asvm, z)

z_svm <- cbind(region=as.factor(predict_svm), z)
ggplot(z_svm) + aes(x=x1,y=x2,col=region) + geom_point(alpha = 0.4)  + geom_point(data = data_train, aes(x=x1,y=x2,col=label)) +
  labs(title = "SVM boundary")

Property of SVM

\[w = \sum_i \alpha_i y_i x_i\]

\[\max_\alpha - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^\top x_j + \sum_i \alpha_i\] Subject to \(\alpha_i \ge 0\)
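
For a linear kernel, this property can be checked on the e1071 fit from the earlier example: the fitted object stores \(\alpha_i y_i\) for the support vectors in coefs and the support vectors themselves in SV, so \(w\) can be recovered as their product. The sketch below (added for illustration) refits with scale = FALSE so \(w\) is on the original scale of x1 and x2; note that e1071 also fits an intercept, which the simplified derivation above omits.

asvm2 <- svm(label ~ x1 + x2, data=data_train, kernel = "linear", scale = FALSE)

w <- t(asvm2$coefs) %*% asvm2$SV ## w = sum_i alpha_i y_i x_i, a 1 x 2 weight vector
b <- -asvm2$rho                  ## intercept (e1071 stores its negative as rho)
w
b

## overlay the fitted separating line w1*x1 + w2*x2 + b = 0 on the data
ggplot(data_train) + aes(x=x1, y=x2, col=label) + geom_point() +
  geom_abline(slope=-w[1]/w[2], intercept=-b/w[2], col="black")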

Use svm function with Gaussian kernel in R

library(e1071)
library(ggplot2)

N<-100
set.seed(32611)
theta1 <- runif(N,0,2*pi)
r1 <- runif(N,2,3)

theta2 <- runif(N,0,2*pi)
r2 <- runif(N,4,5)

d1 <- cbind(r1 * cos(theta1), r1 * sin(theta1))
d2 <- cbind(r2 * cos(theta2), r2 * sin(theta2))

d <- rbind(d1, d2)
label <- c(rep("1", N), rep("-1", N))
data_train <- data.frame(label=factor(label), x=d[,1], y=d[,2])
names(data_train)<-c("label", "x", "y")

ggplot(data_train, aes(x=x,y=y,col=label))+ geom_point() 

Use svm function with Gaussian kernel in R

asvm <- svm(label ~ . ,data=data_train, kernel = "radial")

avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x=z[,1],y=z[,2])
predict_svm <- predict(asvm, z)

z_svm <- cbind(region=as.factor(predict_svm), z)
ggplot(z_svm) + aes(x=x,y=y,col=region) + geom_point(alpha = 0.4)  + geom_point(data = data_train, aes(x=x,y=y,col=label)) +
  labs(title = "SVM boundary")

Interpretation for Kernel SVM:

The Gaussian (radial) kernel can be written in terms of the exponential series

\[\exp(x) = 1 + x + \frac{x^2}{2!} + \frac{x^3}{3!} + \ldots\]
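
To make the connection concrete, here is a worked expansion (using a one-dimensional input and unit bandwidth for simplicity). The Gaussian kernel factors as

\[K(x, z) = \exp(-\tfrac{1}{2}(x - z)^2) = \exp(-\tfrac{x^2}{2}) \exp(-\tfrac{z^2}{2}) \exp(xz)\]

and applying the series above to the last factor gives

\[\exp(xz) = \sum_{j=0}^{\infty} \frac{(xz)^j}{j!} = \sum_{j=0}^{\infty} \frac{x^j}{\sqrt{j!}} \frac{z^j}{\sqrt{j!}}\]

so \(K(x, z) = \phi(x)^\top \phi(z)\) with feature map \(\phi_j(x) = \exp(-x^2/2)\, x^j / \sqrt{j!}\), \(j = 0, 1, 2, \ldots\). The Gaussian kernel therefore acts as an inner product in an infinite-dimensional feature space, which is why a linear separator in that space yields the non-linear boundary seen above.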

SVM when linear separator is impossible

library(MASS)
library(ggplot2)

set.seed(32611)
N<-100
d1<-mvrnorm(N, c(2,-2), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-2,2), matrix(c(2, 1, 1, 2), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("-1", N))
data_train <- data.frame(label=factor(label), x=d[,1], y=d[,2])
names(data_train)<-c("label", "x", "y")

p <- ggplot(data_train) + aes(x=x,y=y,col=label) + geom_point()
p + geom_abline(slope=1,intercept=(max(d1[,2] - d1[,1]) + min(d2[,2] - d2[,1]))/2, col="black")

SVM for the linearly non-separable case

\[\min_{w} \frac{1}{2} \|w\|_2^2 + C\|\zeta\|_2^2\] subject to \(y_i (w^\top x_i ) \ge 1 - \zeta_i\) and \(\zeta_i \ge 0\)
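
In e1071::svm, the cost argument plays the role of \(C\); e1071 penalizes the sum of the slack variables rather than the squared norm written above, but the interpretation is the same: a larger cost punishes margin violations more heavily. A quick comparison, added for illustration:

asvm_smallC <- svm(label ~ . , data=data_train, kernel = "linear", cost = 0.01)
asvm_largeC <- svm(label ~ . , data=data_train, kernel = "linear", cost = 100)
c(smallC = nrow(asvm_smallC$SV), largeC = nrow(asvm_largeC$SV)) ## number of support vectors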

Use svm function in R for the linearly non-separable case

library(e1071)
library(ggplot2)

asvm <- svm(label ~ . ,data=data_train, kernel = "linear")

avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x=z[,1],y=z[,2])
predict_svm <- predict(asvm, z)

z_svm <- cbind(region=as.factor(predict_svm), z)
ggplot(z_svm) + aes(x=x,y=y,col=region) + geom_point(alpha = 0.4)  + geom_point(data = data_train, aes(x=x,y=y,col=label)) +
  labs(title = "SVM boundary")

Summary for classification functions in R

Classifier                                    Package        Function
Logistic regression                           stats          glm with family=binomial()
Linear and quadratic discriminant analysis    MASS           lda, qda
DLDA and DQDA                                 sma            stat.diag.da
KNN classification                            class          knn
CART                                          rpart          rpart
Random forest                                 randomForest   randomForest
Support vector machines                       e1071          svm

Resources