Zhiguang Huo (Caleb)
Wednesday November 8, 2017
library(ElemStatLearn)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainIabel <- prostate$train
prostate0$train <- NULL
prostate_train <- subset(prostate0, trainIabel)
prostate_test <- subset(prostate0, !trainIabel)
rfFit <- randomForest(svi ~ ., data=prostate_train)
rfFit
##
## Call:
## randomForest(formula = svi ~ ., data = prostate_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 11.94%
## Confusion matrix:
## 0 1 class.error
## 0 48 4 0.07692308
## 1 4 11 0.26666667
rfPred <- predict(rfFit, prostate_test)
rfTable <- table(rfPred, truth=prostate_test$svi)
rfTable
## truth
## rfPred 0 1
## 0 22 4
## 1 2 2
1 - sum(diag(rfTable))/sum(rfTable)
## [1] 0.2
Algorithm:
Split the original dataset \(D\) into \(K\) folds, and obtain \(D_1, \ldots, D_k,\ldots, D_K\) such that \(D_1, \ldots, D_K\) are disjoint; size of \(D_k\) \(|D_k|\) is roughly the same for different \(k\) and \(\sum_k |D_k| = |D|\).
For each iteration \(k = 1, \ldots, K\), use the \(D_k\) as the testing dataset and \(D_{-k} = D - D_k\) as the training dataset. Build the classifier \(f^k\) based on \(D_{-k}\). Then predict \(D_k\) and obtain the prediction error rate \(E_k\)
Use \(E = \frac{1}{K} \sum_k E_k\) as the final prediction error rate.
#Create K = 5 equally size folds
K <- 5
folds <- cut(seq(1,nrow(prostate0)),breaks=5,labels=FALSE)
EKs <- numeric(K)
for(k in 1:K){
atrain <- subset(prostate0, folds != k)
atest <- subset(prostate0, folds == k)
krfFit <- randomForest(svi ~ ., data=atrain)
krfPred <- predict(krfFit, atest)
krfTable <- table(krfPred, truth=atest$svi)
EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}
EKs
## [1] 0.0000000 0.0000000 0.0000000 0.2631579 0.3500000
mean(EKs)
## [1] 0.1226316
For K-fold cross validation
n <- nrow(prostate0)
EKs <- numeric(n)
for(k in 1:n){
#cat("leaving the",k,"sample out!","\n")
atrain <- prostate0[-k,]
atest <- prostate0[k,]
krfFit <- randomForest(svi ~ ., data=atrain)
krfPred <- predict(krfFit, atest)
krfTable <- table(krfPred, truth=atest$svi)
EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}
EKs
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
## [71] 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
mean(EKs)
## [1] 0.1340206
Some machine learning classifiers contain tuning parameters
The tuning parameter is crucial since it controls the amount of regularization (model complexity).
\[\hat{\beta}^{lasso} = \arg\min_{\beta \in \mathbb{R}^P} \|y - X\beta\|_2^2 + \lambda \|\beta\|_1\]
library(lars)
## Loaded lars 1.2
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
options(stringsAsFactors=F)
x <- as.matrix(prostate[,1:8])
y <- prostate[,9]
lassoFit <- lars(x, y) ## lar for least angle regression
lambdas <- seq(0,5,0.5)
coefLassoFit <- coef(lassoFit, s=lambdas, mode="lambda")
adataFrame <- NULL
for(i in 1:ncol(coefLassoFit)){
acoef <- colnames(coefLassoFit)[i]
adataFrame0 <- data.frame(lambda=lambdas, coef = coefLassoFit[,i], variable=acoef)
adataFrame <- rbind(adataFrame, adataFrame0)
}
ggplot(adataFrame) + aes(x=lambda, y=coef, col=variable) +
geom_point() +
geom_path()
Split the original dataset \(D\) into \(K\) folds, and obtain \(D_1, \ldots, D_k,\ldots, D_K\) such that \(D_1, \ldots, D_K\) are disjoint; size of \(D_k\) \(|D_k|\) is roughly the same for different \(k\) and \(\sum_k |D_k| = |D|\).
pre-specify a range of tuning parameters \(\lambda_1, \ldots, \lambda_B\)
Use \(e(\lambda_b) = \frac{1}{K} \sum_k e_k(\lambda_b)\) as the average (MSE, rMSE or classification error rate) for tuning parameter \((\lambda_b)\).
Choose the tuning parameter \(\hat{\lambda} = \arg \min_{\lambda\in (\lambda_1, \ldots, \lambda_B)} e(\lambda)\)
\[e(\lambda) = \frac{1}{K} \sum_{k=1}^K \sum_{i \in D_k} (y_i - \hat{f}_\lambda^{-k}(x_i))\]
K <- 10
folds <- cut(seq(1,nrow(prostate)),breaks=K,labels=FALSE)
lambdas <- seq(0.5,5,0.5)
MSEs <- matrix(NA,nrow=length(lambdas),ncol=K)
rownames(MSEs) <- lambdas
colnames(MSEs) <- 1:K
for(k in 1:K){
atrainx <- as.matrix(prostate[!folds==k,1:8])
atestx <- as.matrix(prostate[folds==k,1:8])
atrainy <- prostate[!folds==k,9]
atesty <- prostate[folds==k,9]
klassoFit <- lars(atrainx, atrainy) ## lar for least angle regression
predictValue <- predict.lars(klassoFit, atestx, s=lambdas, type="fit", mode="lambda")
MSE <- apply(predictValue$fit,2, function(x) sqrt(sum((x - atesty)^2)))
MSEs[,k] <- MSE
}
MSEdataFrame <- data.frame(lambda=lambdas, MSE = rowMeans(MSEs))
ggplot(MSEdataFrame) + aes(x=lambda, y = MSE) + geom_point() + geom_path()
Refer to cv.lars() help file example
library(MASS)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(1,-1), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-1,1), matrix(c(2, 1, 1, 2), 2, 2))
atest <- rbind(c(1,1),c(2,-2),c(-2,2))
d <- rbind(d1, d2,atest)
label <- c(rep("A", N), rep("B", N),rep("test",3))
text <- label
text[label!="test"] <- ""
data_train <- data.frame(label=label, x=d[,1], y=d[,2], text=text)
names(data_train)<-c("label", "x", "y", "text")
ggplot(data_train, aes(x=x,y=y,col=label, label = text)) +
geom_point() + geom_text(vjust = -1,size=5)
\[\hat{f}(x) = mode_{i \in N_k(x)} y_i\]
\[\hat{f}(x) = \frac{1}{K} \sum_{i \in N_k(x)} y_i\]
library(ElemStatLearn)
library(class)
prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainIabel <- prostate$train
adata_train <- prostate0[trainIabel, -5]
adata_test <- prostate0[!trainIabel, -5]
alabel_train <- prostate0[trainIabel, 5]
alabel_test <- prostate0[!trainIabel, 5]
knnFit <- knn(adata_train, adata_test, alabel_train, k=5)
knnTable <- table(knnFit, truth=alabel_test)
knnTable
## truth
## knnFit 0 1
## 0 22 5
## 1 2 1
1 - sum(diag(knnTable))/sum(knnTable)
## [1] 0.2333333
For Step 1, can we also use cross validation?
A small subset of breast cancer gene expression data (BRCA) comparing tumor subjects with normal subjects. The data is in file brca_tcga.csv (Download here), with each row representing a gene and each column representing a subject.
If we directly use all these features to build a classifier, we may include too many noises. The model is also too complicated.
Feature selection: only keep top genes most associated with the class labels (e.g. 20 genes and 40 samples)
Feature selection should also be thought as tuning parameter selection and model selection.
knitr::purl("crossValidation.rmd", output = "crossValidation.R ", documentation = 2)
##
##
## processing file: crossValidation.rmd
##
|
| | 0%
|
|.... | 6%
|
|........ | 12%
|
|............ | 19%
|
|................ | 25%
|
|.................... | 31%
|
|........................ | 38%
|
|............................ | 44%
|
|................................ | 50%
|
|..................................... | 56%
|
|......................................... | 62%
|
|............................................. | 69%
|
|................................................. | 75%
|
|..................................................... | 81%
|
|......................................................... | 88%
|
|............................................................. | 94%
|
|.................................................................| 100%
## output file: crossValidation.R
## [1] "crossValidation.R "