Biostatistical Computing, PHC 6068

Cross validation

Zhiguang Huo (Caleb)

Wednesday November 28, 2018

Outline

Holdout method to evaluate machine learning accuracy

Holdout method example

library(ElemStatLearn)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainLabel <- prostate$train
prostate0$train <- NULL

prostate_train <- subset(prostate0, trainLabel)
prostate_test <- subset(prostate0, !trainLabel)

rfFit <- randomForest(svi ~ ., data=prostate_train)
rfFit
## 
## Call:
##  randomForest(formula = svi ~ ., data = prostate_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 11.94%
## Confusion matrix:
##    0  1 class.error
## 0 48  4  0.07692308
## 1  4 11  0.26666667
rfPred <- predict(rfFit, prostate_test)
rfTable <- table(rfPred, truth=prostate_test$svi)
rfTable
##       truth
## rfPred  0  1
##      0 22  4
##      1  2  2
1 - sum(diag(rfTable))/sum(rfTable)
## [1] 0.2
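The split above relies on the train indicator shipped with the prostate data. When no such indicator is available, a random holdout split can be drawn directly; below is a minimal sketch (the 70/30 proportion and the seed are arbitrary choices, not from the lecture).

set.seed(32611)
n <- nrow(prostate0)
trainIndex <- sample(n, size = round(0.7 * n))      ## randomly choose training rows

rfFit2 <- randomForest(svi ~ ., data = prostate0[trainIndex, ])
rfPred2 <- predict(rfFit2, prostate0[-trainIndex, ])
rfTable2 <- table(rfPred2, truth = prostate0$svi[-trainIndex])
1 - sum(diag(rfTable2)) / sum(rfTable2)             ## holdout misclassification error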

k-fold cross validation

Algorithm:

  1. Partition the data into K roughly equal-sized folds.
  2. For the \(k^{th}\) fold, train the classifier on the other K - 1 folds and compute the prediction error \(E_k\) on the held-out \(k^{th}\) fold.
  3. Repeat for \(k = 1, \ldots, K\) and report the average error \(\frac{1}{K}\sum_{k=1}^K E_k\) as the cross validation error.

k-fold cross validation example

# Create K = 5 equally sized folds (assigned sequentially by row order)

K <- 5
folds <- cut(seq(1,nrow(prostate0)),breaks=5,labels=FALSE)

EKs <- numeric(K)

for(k in 1:K){
  atrain <- subset(prostate0, folds != k)
  atest <- subset(prostate0, folds == k)

  krfFit <- randomForest(svi ~ ., data=atrain)
  krfPred <- predict(krfFit, atest)
  krfTable <- table(krfPred, truth=atest$svi)
  
  EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}

EKs
## [1] 0.0000000 0.0000000 0.0000000 0.2631579 0.3000000
mean(EKs)
## [1] 0.1126316
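The folds above are assigned by row order. If the rows happened to be sorted (e.g., by the outcome), each fold would not be representative of the whole data; shuffling the fold labels avoids this. A minimal sketch (the seed is an arbitrary choice):

set.seed(32611)
## randomly permute the fold labels so each fold is a random subset of rows;
## the loop above can then be rerun with folds_shuffled in place of folds
folds_shuffled <- sample(cut(seq(1, nrow(prostate0)), breaks = K, labels = FALSE))
table(folds_shuffled)   ## each fold still contains roughly n/K observations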

Leave one out cross validation

Leave one out cross validation (LOOCV) is the special case of K-fold cross validation with K = n, the sample size: each observation is held out in turn, the classifier is trained on the remaining n - 1 observations, and the errors are averaged over all n held-out predictions.

Leave one out cross validation example

n <- nrow(prostate0)

EKs <- numeric(n)

for(k in 1:n){
  #cat("leaving the",k,"sample out!","\n")
  atrain <- prostate0[-k,]
  atest <- prostate0[k,]

  krfFit <- randomForest(svi ~ ., data=atrain)
  krfPred <- predict(krfFit, atest)
  krfTable <- table(krfPred, truth=atest$svi)
  
  EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}

EKs
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
## [71] 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
mean(EKs)
## [1] 0.1443299

Important note for cross validation

Choosing a value of the tuning parameter

Some machine learning classifiers contain tuning parameters

The tuning parameter is crucial since it controls the amount of regularization (model complexity).

Model selection

Lasso problem

\[\hat{\beta}^{lasso} = \arg\min_{\beta \in \mathbb{R}^p} \frac{1}{2}\|y - X\beta\|_2^2 + \lambda \|\beta\|_1\]

library(lars)
## Loaded lars 1.2
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
options(stringsAsFactors=F)
x <- as.matrix(prostate[,1:8])
y <- prostate[,9]
lassoFit <- lars(x, y) ## lar for least angle regression

lambdas <- seq(0,5,0.5)
coefLassoFit <- coef(lassoFit, s=lambdas, mode="lambda") 
adataFrame <- NULL
for(i in 1:ncol(coefLassoFit)){
  acoef <- colnames(coefLassoFit)[i]
  adataFrame0 <- data.frame(lambda=lambdas, coef = coefLassoFit[,i], variable=acoef)
  adataFrame <- rbind(adataFrame, adataFrame0)
}

ggplot(adataFrame) + aes(x=lambda, y=coef, col=variable) + 
  geom_point() +
  geom_path()
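As a cross-check (not part of the original example, and assuming the glmnet package is installed), glmnet produces a similar lasso solution path; note that glmnet scales the squared-error loss by 1/n, so its lambda values are not directly comparable to the lars lambdas above.

library(glmnet)
glmnetFit <- glmnet(x, y)                        ## lasso path (alpha = 1 by default)
plot(glmnetFit, xvar = "lambda", label = TRUE)   ## coefficients versus log(lambda)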

How to select the best tuning parameter

Tuning parameter selection algorithm

  1. Pre-specify a grid of candidate tuning parameters \(\lambda_1, \ldots, \lambda_B\).
  2. For each candidate \(\lambda_b\), estimate its prediction error \(e(\lambda_b)\) by cross validation.
  3. Choose the tuning parameter \(\hat{\lambda} = \arg \min_{\lambda\in (\lambda_1, \ldots, \lambda_B)} e(\lambda)\).

Implement tuning parameter selection for lasso problem

K <- 10
folds <- cut(seq(1,nrow(prostate)),breaks=K,labels=FALSE)
lambdas <- seq(0.5,5,0.5)

MSEs <- matrix(NA,nrow=length(lambdas),ncol=K)
rownames(MSEs) <- lambdas
colnames(MSEs) <- 1:K

for(k in 1:K){
  atrainx <- as.matrix(prostate[!folds==k,1:8])
  atestx <- as.matrix(prostate[folds==k,1:8])
 
  atrainy <- prostate[!folds==k,9]
  atesty <- prostate[folds==k,9]

  klassoFit <- lars(atrainx, atrainy) ## lar for least angle regression

  predictValue <- predict.lars(klassoFit, atestx, s=lambdas, type="fit",  mode="lambda")
  MSE <- apply(predictValue$fit, 2, function(x) sqrt(mean((x - atesty)^2))) ## root mean squared error for each lambda
  
  MSEs[,k] <- MSE
}

MSEdataFrame <- data.frame(lambda=lambdas, rMSE = rowMeans(MSEs))
ggplot(MSEdataFrame) + aes(x=lambda, y = rMSE) + geom_point() + geom_path()
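For comparison (again assuming glmnet is available), cv.glmnet automates this cross validation over its own internal grid of lambda values:

library(glmnet)
set.seed(32611)
cvFit <- cv.glmnet(x, y, nfolds = 10)   ## 10-fold CV over an internal lambda grid
plot(cvFit)                             ## CV mean squared error versus log(lambda)
cvFit$lambda.min                        ## lambda minimizing the CV error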

Common mistake about tuning parameter selection in machine learning

Combine selecting tuning parameters and evaluating prediction accuracy (skip)

  1. Split the data into training set and testing set.
  2. In training set, use cross validation to determine the best tuning parameter.
  3. Use the best tuning parameter and the entire training set to build a classifier.
  4. Evaluation:
    • Report the final prediction model
    • Predict the testing set
    • Report the accuracy (a sketch of this workflow follows below).
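A minimal sketch of this workflow for the lasso example, assuming the glmnet package (the 70/30 split and the seed are illustrative choices, not from the lecture):

library(glmnet)
set.seed(32611)
n <- nrow(prostate)
trainIndex <- sample(n, size = round(0.7 * n))               ## step 1: split the data

xtrain <- as.matrix(prostate[trainIndex, 1:8]);  ytrain <- prostate[trainIndex, 9]
xtest  <- as.matrix(prostate[-trainIndex, 1:8]); ytest  <- prostate[-trainIndex, 9]

cvFit <- cv.glmnet(xtrain, ytrain, nfolds = 10)              ## step 2: inner CV selects lambda
## steps 3-4: cv.glmnet keeps a lasso fit on the full training set; predict the
## testing set at the selected lambda and report the error
predTest <- predict(cvFit, newx = xtest, s = "lambda.min")
sqrt(mean((predTest - ytest)^2))                             ## testing root mean squared error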

For Step 1, can we also use cross validation?

Nested Cross validation (skip)

  1. Use K-fold cross validation (outer) to split the original data into training sets and testing sets.
  2. For \(k^{th}\) fold training set, use cross validation (inner) to determine the best tuning parameter of the \(k^{th}\) fold.
  3. Use the best tuning parameter of the \(k^{th}\) fold and the \(k^{th}\) fold training set to build a classifier.
  4. Predict the \(k^{th}\) fold testing set and report the accuracy.
  5. Evaluation:
    • Report the average accuracy from the outer K-fold cross validation (a sketch follows below).
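A minimal sketch of nested cross validation for the lasso example, again assuming glmnet (the fold counts and seed are illustrative choices):

library(glmnet)
set.seed(32611)
Kouter <- 5
outerFolds <- sample(cut(seq(1, nrow(prostate)), breaks = Kouter, labels = FALSE))

outerRMSE <- numeric(Kouter)
for(k in 1:Kouter){
  xtrain <- as.matrix(prostate[outerFolds != k, 1:8]); ytrain <- prostate[outerFolds != k, 9]
  xtest  <- as.matrix(prostate[outerFolds == k, 1:8]); ytest  <- prostate[outerFolds == k, 9]

  innerCV <- cv.glmnet(xtrain, ytrain, nfolds = 5)   ## inner CV selects lambda within fold k
  predTest <- predict(innerCV, newx = xtest, s = "lambda.min")
  outerRMSE[k] <- sqrt(mean((predTest - ytest)^2))   ## error on the k-th outer testing fold
}
mean(outerRMSE)                                      ## nested CV estimate of prediction error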

Other machine learning approaches

KNN motivating example

library(MASS)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(1,-1), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-1,1), matrix(c(2, 1, 1, 2), 2, 2))
atest <- rbind(c(1,1),c(2,-2),c(-2,2))
d <- rbind(d1, d2,atest)
label <- c(rep("A", N), rep("B", N),rep("test",3))
text <- label
text[label!="test"] <- ""
data_train <- data.frame(label=label, x=d[,1], y=d[,2], text=text)
names(data_train)<-c("label", "x", "y", "text")

ggplot(data_train, aes(x=x,y=y,col=label, label = text)) + 
  geom_point() + geom_text(vjust = -1,size=5)

KNN algorithm

For classification (majority vote among the K nearest neighbors):

\[\hat{f}(x) = \text{mode}_{i \in N_K(x)} y_i\]

For regression (average over the K nearest neighbors):

\[\hat{f}(x) = \frac{1}{K} \sum_{i \in N_K(x)} y_i\]

where \(N_K(x)\) denotes the indices of the K training observations closest to \(x\).
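To make the classification formula concrete, here is a minimal hand-coded KNN majority vote for the three test points in the motivating example above (K = 5 is an arbitrary choice); class::knn, used in the next example, does the same thing more efficiently.

Kknn <- 5
trainX <- rbind(d1, d2)                   ## training features from the motivating example
trainY <- c(rep("A", N), rep("B", N))     ## training labels

knn_predict <- function(x, trainX, trainY, K){
  ## squared Euclidean distance from x to every training point
  dist2 <- rowSums((trainX - matrix(x, nrow(trainX), 2, byrow = TRUE))^2)
  neighborY <- trainY[order(dist2)[1:K]]  ## labels of the K nearest neighbors
  names(which.max(table(neighborY)))      ## majority vote (the mode)
}

apply(atest, 1, knn_predict, trainX = trainX, trainY = trainY, K = Kknn)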

KNN example

library(ElemStatLearn)
library(class)

prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainLabel <- prostate$train

## here 5 is the index of svi in the prostate data
adata_train <- prostate0[trainLabel, -5]
adata_test <- prostate0[!trainLabel, -5]

alabel_train <- prostate0[trainLabel, 5]
alabel_test <- prostate0[!trainLabel, 5]

knnFit <- knn(adata_train, adata_test, alabel_train, k=5)

knnTable <- table(knnFit, truth=alabel_test)
knnTable
##       truth
## knnFit  0  1
##      0 22  5
##      1  2  1
1 - sum(diag(knnTable))/sum(knnTable)
## [1] 0.2333333

KNN Exercise

Linear discriminant analysis (LDA)

Problem setting: assume the features in class \(k\) follow a multivariate Gaussian distribution with mean \(\mu_k\) and covariance \(\Sigma_k\):

\[f(X|Y=k) = \frac{1}{(2\pi)^{p/2} |\Sigma_k|^{1/2}} \exp \left( - \frac{1}{2} (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k)\right)\]

Intuition for Linear discriminant analysis (LDA)

library(MASS)
library(ggplot2)

set.seed(32611)
N<-100
d1<-mvrnorm(N, c(1,-1), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-1,1), matrix(c(2, 1, 1, 2), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("2", N))
data_train <- data.frame(label=label, x=d[,1], y=d[,2])
names(data_train)<-c("label", "x", "y")

ggplot(data_train) + aes(x=x,y=y,col=label) + geom_point() +  stat_ellipse()

Math behind Linear discriminant analysis (LDA)

\(\begin{aligned} C_{Bayes}(X) & = \arg \max_k f(Y=k | X) \\ & = \arg \max_k \frac{f(Y=k)f(X|Y=k)}{f(X)} \\ & = \arg \max_k f(Y=k)f(X|Y=k) \\ & = \arg \max_k \log f(Y=k) + \log f(X|Y=k) \\ & = \arg \max_k \log f(Y=k) - \frac{1}{2} \log |\Sigma_k| - \frac{1}{2} (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k) \\ & = \arg \min_k - 2 \log f(Y=k) + \log |\Sigma_k| + (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k) \end{aligned}\)
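As a sanity check (a sketch, not part of the original notes), the discriminant above can be evaluated directly from estimated class means, a pooled covariance estimate, and empirical priors; the \(\log |\Sigma_k|\) term is dropped since it is common to both classes. The hand-computed predictions should agree with MASS::lda on data_train.

Xmat <- as.matrix(data_train[, c("x", "y")])
grp  <- data_train$label

mus    <- lapply(split(as.data.frame(Xmat), grp), colMeans)      ## class means
Sigma  <- Reduce(`+`, lapply(split(as.data.frame(Xmat), grp),
            function(d) cov(d) * (nrow(d) - 1))) / (nrow(Xmat) - 2)  ## pooled covariance
priors <- table(grp) / length(grp)                               ## empirical priors

## discriminant score: -2 log f(Y=k) + (X - mu_k)' Sigma^{-1} (X - mu_k)
score <- sapply(names(mus), function(k){
  centered <- sweep(Xmat, 2, mus[[k]])
  -2 * log(priors[[k]]) + rowSums((centered %*% solve(Sigma)) * centered)
})
myPred <- colnames(score)[apply(score, 1, which.min)]

library(MASS)
table(byHand = myPred, lda = predict(lda(label ~ ., data = data_train))$class)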

Decision boundary for Linear discriminant analysis (LDA)

The decision boundary for class \(m\) and class \(l\) is: \[f(Y=l|X) = f(Y=m|X)\]

Or equivalently,

\(\begin{aligned} & - 2 \log f(Y=l) + \log |\Sigma_l| + (X - \mu_l)^\top \Sigma_l^{-1} (X - \mu_l) \\ & = - 2 \log f(Y=m) + \log |\Sigma_m| + (X - \mu_m)^\top \Sigma_m^{-1} (X - \mu_m) \end{aligned}\)

Further assuming a uniform prior \(f(Y=l) = f(Y=m)\) and a common covariance structure \(\Sigma_l = \Sigma_m = \Sigma\), the decision hyperplane simplifies to:

\[2 (\mu_l - \mu_m)^\top \Sigma^{-1} X - (\mu_l - \mu_m)^\top \Sigma^{-1} (\mu_l + \mu_m) = 0\]

You can see the decision boundary is linear in \(X\) and passes through the midpoint \(\frac{\mu_l + \mu_m}{2}\) of the two class centers.

Visualize the decision boundary (LDA)

library(MASS)

alda <- lda(label ~ . ,data=data_train)

avec <- seq(-4,4,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x=z[,1],y=z[,2])
predict_lda <- predict(alda, z)$class

z_lda <- cbind(region=as.factor(predict_lda), z)
ggplot(z_lda) + aes(x=x,y=y,col=region) + geom_point(alpha = 0.4)  + geom_point(data = data_train, aes(x=x,y=y,col=label)) +
  labs(title = "LDA boundary")

Quadratic discriminant analysis (QDA)

Further assuming a uniform prior \(f(Y=l) = f(Y=m)\) but different covariance structures \(\Sigma_l \neq \Sigma_m\), the decision boundary simplifies to:

\(\begin{aligned} \log |\Sigma_l| + (X - \mu_l)^\top \Sigma_l^{-1} (X - \mu_l) = \log |\Sigma_m| + (X - \mu_m)^\top \Sigma_m^{-1} (X - \mu_m) \end{aligned}\)

You can see the decision boundary is a quadratic function of \(X\).

library(MASS)
library(ggplot2)

set.seed(32611)
N<-100
d1<-mvrnorm(N, c(0,2), matrix(c(4, 0, 0, 1), 2, 2))
d2<-mvrnorm(N, c(0,-2), matrix(c(1, 0, 0, 4), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("2", N))
data_train <- data.frame(label=label, x=d[,1], y=d[,2])
names(data_train)<-c("label", "x", "y")

ggplot(data_train) + aes(x=x,y=y,col=label) + geom_point() +  stat_ellipse()

Visualize the decision boundary (QDA)

library(MASS)
aqda <- qda(label ~ . ,data=data_train)

avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x=z[,1],y=z[,2])
predict_qda <- predict(aqda, z)$class

z_qda <- cbind(region=as.factor(predict_qda), z)
ggplot(z_qda) + aes(x=x,y=y,col=region) + geom_point(alpha = 0.4)  + geom_point(data = data_train, aes(x=x,y=y,col=label)) +
  labs(title = "QDA boundary")

SVM (support vector machine)

Won’t be covered this semester.

Refer to https://caleb-huo.github.io/teaching/2017FALL/lectures/week14_multivariateOptimization/LDAandSVM/LDAandSVM.html

Summary for classification functions in R

Classifier                                    Package        Function
Logistic regression                           stats          glm with parameter family=binomial()
Linear and quadratic discriminant analysis    MASS           lda, qda
DLDA and DQDA                                 sma            stat.diag.da
KNN classification                            class          knn
CART                                          rpart          rpart
Random forest                                 randomForest   randomForest
Support vector machines                       e1071          svm
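As one more example, tied to the first row of the table, logistic regression on the holdout split from the beginning of the lecture (a sketch; the 0.5 probability cutoff is an arbitrary choice):

glmFit <- glm(svi ~ ., data = prostate_train, family = binomial())
glmProb <- predict(glmFit, prostate_test, type = "response")       ## predicted P(svi = 1)
glmPred <- factor(ifelse(glmProb > 0.5, 1, 0), levels = c(0, 1))   ## classify at the 0.5 cutoff
glmTable <- table(glmPred, truth = prostate_test$svi)
glmTable
1 - sum(diag(glmTable)) / sum(glmTable)                            ## holdout misclassification error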

Resources