Biostatistical Computing, PHC 6068

cross validation

Zhiguang Huo (Caleb)

Wednesday November 8, 2017

Outline

Cross validation to evaluate machine learning accuracy

holdout method

holdout method example

library(ElemStatLearn)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainLabel <- prostate$train
prostate0$train <- NULL ## remove the train/test indicator so it is not used as a predictor

prostate_train <- subset(prostate0, trainLabel)
prostate_test <- subset(prostate0, !trainLabel)

rfFit <- randomForest(svi ~ ., data=prostate_train)
rfFit
## 
## Call:
##  randomForest(formula = svi ~ ., data = prostate_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 11.94%
## Confusion matrix:
##    0  1 class.error
## 0 48  4  0.07692308
## 1  4 11  0.26666667
rfPred <- predict(rfFit, prostate_test)
rfTable <- table(rfPred, truth=prostate_test$svi)
rfTable
##       truth
## rfPred  0  1
##      0 22  4
##      1  2  2
1 - sum(diag(rfTable))/sum(rfTable)
## [1] 0.2
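
Here the holdout misclassification error is one minus the proportion of correct predictions (the diagonal of the confusion table): 1 - (22 + 2)/(22 + 4 + 2 + 2) = 6/30 = 0.2.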

k-fold cross validation

Algorithm:

  1. Randomly partition the data into K equally sized folds.
  2. For k = 1, ..., K: hold out fold k as the testing set, train the classifier on the remaining K - 1 folds, and compute the prediction error \(e_k\) on fold k.
  3. Report the cross validation error as the average of \(e_1, \ldots, e_K\).

k-fold cross validation example

#Create K = 5 equally sized folds
#(note: rows are assigned to folds in order here; shuffling the rows first is generally preferred)

K <- 5
folds <- cut(seq(1,nrow(prostate0)), breaks=K, labels=FALSE)

EKs <- numeric(K)

for(k in 1:K){
  atrain <- subset(prostate0, folds != k)
  atest <- subset(prostate0, folds == k)

  krfFit <- randomForest(svi ~ ., data=atrain)
  krfPred <- predict(krfFit, atest)
  krfTable <- table(krfPred, truth=atest$svi)
  
  EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}

EKs
## [1] 0.0000000 0.0000000 0.0000000 0.2631579 0.3500000
mean(EKs)
## [1] 0.1226316

Leave one out cross validation

For K-fold cross validation, setting K = n (the sample size) gives leave one out cross validation: each observation is held out once as the testing set.

leave one out cross validation example

n <- nrow(prostate0)

EKs <- numeric(n)

for(k in 1:n){
  #cat("leaving the",k,"sample out!","\n")
  atrain <- prostate0[-k,]
  atest <- prostate0[k,]

  krfFit <- randomForest(svi ~ ., data=atrain)
  krfPred <- predict(krfFit, atest)
  krfTable <- table(krfPred, truth=atest$svi)
  
  EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}

EKs
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
## [71] 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
mean(EKs)
## [1] 0.1340206

Important note for cross validation

Choosing a value of the tuning parameter

Many machine learning classifiers contain tuning parameters (e.g., \(\lambda\) in the lasso, k in KNN)

The tuning parameter is crucial since it controls the amount of regularization (model complexity).

Model selection

Lasso problem

\[\hat{\beta}^{lasso} = \arg\min_{\beta \in \mathbb{R}^P} \|y - X\beta\|_2^2 + \lambda \|\beta\|_1\]

library(lars)
## Loaded lars 1.2
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
options(stringsAsFactors=F)
x <- as.matrix(prostate[,1:8])
y <- prostate[,9]
lassoFit <- lars(x, y) ## lar for least angle regression

lambdas <- seq(0,5,0.5)
coefLassoFit <- coef(lassoFit, s=lambdas, mode="lambda") 
adataFrame <- NULL
for(i in 1:ncol(coefLassoFit)){
  acoef <- colnames(coefLassoFit)[i]
  adataFrame0 <- data.frame(lambda=lambdas, coef = coefLassoFit[,i], variable=acoef)
  adataFrame <- rbind(adataFrame, adataFrame0)
}

ggplot(adataFrame) + aes(x=lambda, y=coef, col=variable) + 
  geom_point() +
  geom_path()

How to select the best tuning parameter

Tuning parameter selection algorithm

  1. Pre-specify a grid of candidate tuning parameters \(\lambda_1, \ldots, \lambda_B\) and randomly partition the data into K folds \(D_1, \ldots, D_K\).
  2. For each candidate \(\lambda\) and each fold k, fit the model \(\hat{f}_\lambda^{-k}\) on the data with fold \(D_k\) held out, then predict the held-out observations in \(D_k\).
  3. Compute the cross validation error

\[e(\lambda) = \frac{1}{K} \sum_{k=1}^K \sum_{i \in D_k} (y_i - \hat{f}_\lambda^{-k}(x_i))^2\]

  4. Select the \(\lambda\) that minimizes \(e(\lambda)\).

Implement tuning parameter selection for lasso problem

K <- 10
folds <- cut(seq(1,nrow(prostate)),breaks=K,labels=FALSE)
lambdas <- seq(0.5,5,0.5)

MSEs <- matrix(NA,nrow=length(lambdas),ncol=K)
rownames(MSEs) <- lambdas
colnames(MSEs) <- 1:K

for(k in 1:K){
  atrainx <- as.matrix(prostate[!folds==k,1:8])
  atestx <- as.matrix(prostate[folds==k,1:8])
 
  atrainy <- prostate[!folds==k,9]
  atesty <- prostate[folds==k,9]

  klassoFit <- lars(atrainx, atrainy) ## lar for least angle regression

  predictValue <- predict.lars(klassoFit, atestx, s=lambdas, type="fit",  mode="lambda")
  MSE <- apply(predictValue$fit, 2, function(x) mean((x - atesty)^2)) ## mean squared error on the held-out fold, one value per lambda
  
  MSEs[,k] <- MSE
}

MSEdataFrame <- data.frame(lambda=lambdas, MSE = rowMeans(MSEs))
ggplot(MSEdataFrame) + aes(x=lambda, y = MSE) + geom_point() + geom_path()

lars example

Refer to cv.lars() help file example
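
As a minimal sketch (reusing the prostate x and y defined in the lasso example above), cv.lars() performs the fold splitting and error averaging internally; note that by default it indexes the lasso path by the fraction of the final L1 norm rather than by \(\lambda\):

set.seed(32611)
cvLars <- cv.lars(x, y, K = 10, type = "lasso", mode = "fraction", plot.it = FALSE)
## position on the path (fraction of the L1 norm) with the smallest cross-validated error
bestFraction <- cvLars$index[which.min(cvLars$cv)]
bestFraction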

KNN motivating example

library(MASS)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(1,-1), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-1,1), matrix(c(2, 1, 1, 2), 2, 2))
atest <- rbind(c(1,1),c(2,-2),c(-2,2))
d <- rbind(d1, d2,atest)
label <- c(rep("A", N), rep("B", N),rep("test",3))
text <- label
text[label!="test"] <- ""
data_train <- data.frame(label=label, x=d[,1], y=d[,2], text=text)
names(data_train)<-c("label", "x", "y", "text")

ggplot(data_train, aes(x=x,y=y,col=label, label = text)) + 
  geom_point() + geom_text(vjust = -1,size=5)

KNN algorithm

For classification (majority vote among the k nearest neighbors \(N_k(x)\)):

\[\hat{f}(x) = \text{mode}_{i \in N_k(x)} \; y_i\]

For regression (average over the k nearest neighbors):

\[\hat{f}(x) = \frac{1}{k} \sum_{i \in N_k(x)} y_i\]
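
To make the classification rule concrete, below is a minimal from-scratch sketch (for illustration only; the name myKNN is made up here, and the class::knn() function used in the next example is the practical choice). For each test point it computes Euclidean distances to all training points, takes the k closest, and returns the most frequent (mode) label.

## minimal KNN classifier sketch: majority vote among the k nearest neighbors
myKNN <- function(train_x, train_y, test_x, k = 5){
  train_x <- as.matrix(train_x); test_x <- as.matrix(test_x)
  pred <- character(nrow(test_x))
  for(i in seq_len(nrow(test_x))){
    ## squared Euclidean distances from the i-th test point to all training points
    dist2 <- rowSums((train_x - matrix(test_x[i,], nrow(train_x), ncol(train_x), byrow=TRUE))^2)
    neighbors <- order(dist2)[1:k]             ## indices of the k nearest neighbors
    votes <- table(train_y[neighbors])         ## label counts among the neighbors
    pred[i] <- names(votes)[which.max(votes)]  ## majority vote (the mode)
  }
  factor(pred, levels = levels(factor(train_y)))
}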

KNN example

library(ElemStatLearn)
library(class)

prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainLabel <- prostate$train
prostate0$train <- NULL ## remove the train/test indicator so it is not used as a feature

adata_train <- prostate0[trainLabel, -5]  ## features (column 5, svi, is the class label)
adata_test <- prostate0[!trainLabel, -5]

alabel_train <- prostate0[trainLabel, 5]  ## class labels
alabel_test <- prostate0[!trainLabel, 5]

knnFit <- knn(adata_train, adata_test, alabel_train, k=5)

knnTable <- table(knnFit, truth=alabel_test)
knnTable
##       truth
## knnFit  0  1
##      0 22  5
##      1  2  1
1 - sum(diag(knnTable))/sum(knnTable)
## [1] 0.2333333

KNN Exercise

Combine selecting tuning parameters and evaluating prediction accuracy (a code sketch follows the list of steps below)

  1. Split the data into training set and testing set.
  2. In training set, use cross validation to determine the best tuning parameter.
  3. Use the best tuning parameter and the entire training set to build a classifier.
  4. Evaluation:
    • Report the final prediction model
    • Predict the testing set
    • Report the accuracy.
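
Below is a minimal sketch of these steps for KNN, reusing adata_train, adata_test, alabel_train and alabel_test from the KNN example above; the candidate grid of k values and the 5 inner folds are assumptions:

set.seed(32611)
ks <- c(1, 3, 5, 7, 9, 11)  ## candidate tuning parameters (assumed grid)
nFold <- 5
cvFolds <- sample(cut(seq_len(nrow(adata_train)), breaks = nFold, labels = FALSE))

## Step 2: cross validation within the training set to pick k
cvErr <- sapply(ks, function(ak){
  foldErr <- sapply(seq_len(nFold), function(f){
    afit <- knn(adata_train[cvFolds != f, ], adata_train[cvFolds == f, ],
                alabel_train[cvFolds != f], k = ak)
    mean(afit != alabel_train[cvFolds == f])
  })
  mean(foldErr)
})
bestK <- ks[which.min(cvErr)]

## Steps 3-4: refit on the full training set with bestK, then evaluate on the testing set
finalPred <- knn(adata_train, adata_test, alabel_train, k = bestK)
mean(finalPred != alabel_test)  ## testing set misclassification error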

For Step 1, can we also use cross validation?

Common mistake about tuning parameter selection in machine learning

A common mistake is to select the tuning parameter using all of the data (including the testing set) and then report prediction accuracy on that same data; because the testing set was used during tuning, the reported accuracy is overly optimistic.

Nested Cross validation

  1. Use K-fold cross validation (outer) to split the original data into training sets and testing sets.
  2. For the \(k^{th}\) outer fold, use cross validation (inner) on the \(k^{th}\) training set to determine the best tuning parameter.
  3. Use the best tuning parameter of the \(k^{th}\) fold and the entire \(k^{th}\) training set to build a classifier.
  4. Predict the \(k^{th}\) testing set and record the accuracy.
  5. Evaluation:
    • Report the average accuracy over the K outer folds.
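
A minimal sketch of nested cross validation for the lasso example above (the 5 outer folds, 10 inner folds, and the lambda grid are choices made here for illustration; lars and prostate are assumed to be loaded as before):

set.seed(32611)
lambdas <- seq(0.5, 5, 0.5)
outerK <- 5; innerK <- 10
outerFolds <- sample(cut(seq_len(nrow(prostate)), breaks = outerK, labels = FALSE))

outerMSE <- numeric(outerK)
for(k in seq_len(outerK)){
  ## outer split: fold k is the testing set, the rest is the training set
  trainx <- as.matrix(prostate[outerFolds != k, 1:8]); trainy <- prostate[outerFolds != k, 9]
  testx  <- as.matrix(prostate[outerFolds == k, 1:8]); testy  <- prostate[outerFolds == k, 9]

  ## inner cross validation, on the training set only, to pick lambda
  innerFolds <- sample(cut(seq_len(nrow(trainx)), breaks = innerK, labels = FALSE))
  innerMSE <- matrix(NA, nrow = length(lambdas), ncol = innerK)
  for(j in seq_len(innerK)){
    afit <- lars(trainx[innerFolds != j, ], trainy[innerFolds != j])
    apred <- predict.lars(afit, trainx[innerFolds == j, ], s = lambdas,
                          type = "fit", mode = "lambda")$fit
    innerMSE[, j] <- apply(apred, 2, function(x) mean((x - trainy[innerFolds == j])^2))
  }
  bestLambda <- lambdas[which.min(rowMeans(innerMSE))]

  ## refit on the whole training set with the selected lambda, then predict the outer testing set
  finalFit <- lars(trainx, trainy)
  finalPred <- predict.lars(finalFit, testx, s = bestLambda, type = "fit", mode = "lambda")$fit
  outerMSE[k] <- mean((finalPred - testy)^2)
}
mean(outerMSE)  ## nested CV estimate of prediction error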

Feature selection in machine learning

A small subset of breast cancer gene expression data (BRCA) comparing tumor subjects with normal subjects. The data are in the file brca_tcga.csv (Download here), with each row representing a gene and each column representing a subject.

If we directly use all of these features to build a classifier, we may include too much noise, and the model will be overly complicated.

Feature selection: keep only the top genes most associated with the class labels (e.g. 20 genes and 40 samples)
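
A hedged sketch of this kind of filtering is below. The csv layout beyond "rows = genes, columns = subjects", the use of read.csv() with row.names = 1, and the classLabel construction are assumptions to be adapted to the actual file; a per-gene two-sample t-test is one common choice of association measure.

## read the expression matrix: rows are genes, columns are subjects
brca <- read.csv("brca_tcga.csv", row.names = 1)  ## assumption: first column holds the gene names
## assumption: placeholder label vector; replace with the real tumor/normal annotation
classLabel <- factor(rep(c("tumor", "normal"), each = ncol(brca)/2))

## per-gene two-sample t-test p-value comparing tumor vs normal subjects
pvals <- apply(brca, 1, function(g) t.test(g ~ classLabel)$p.value)

## feature selection: keep only the top 20 genes most associated with the class labels
topGenes <- names(sort(pvals))[1:20]
exprTop <- t(brca[topGenes, ])  ## subjects in rows, selected genes in columns, ready for a classifier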

Feature selection in machine learning

Feature selection should also be thought of as tuning parameter selection and model selection: like any other tuning step, it should be carried out on the training data only (e.g., inside the cross validation), not on the full data set.

Resources

knitr::purl("crossValidation.rmd", output = "crossValidation.R ", documentation = 2)
## 
## 
## processing file: crossValidation.rmd
## 
## output file: crossValidation.R
## [1] "crossValidation.R "