Zhiguang Huo (Caleb)
Monday November 23, 2020
library(ElemStatLearn) ## provides the prostate dataset
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi) ## svi is the categorical outcome
trainLabel <- prostate$train
prostate0$train <- NULL ## drop the train indicator so it is not used as a predictor
prostate_train <- subset(prostate0, trainLabel)
prostate_test <- subset(prostate0, !trainLabel)
rfFit <- randomForest(svi ~ ., data=prostate_train)
rfFit
##
## Call:
## randomForest(formula = svi ~ ., data = prostate_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 14.93%
## Confusion matrix:
## 0 1 class.error
## 0 48 4 0.07692308
## 1 6 9 0.40000000
rfPred <- predict(rfFit, prostate_test)
rfTable <- table(rfPred, truth=prostate_test$svi)
rfTable
## truth
## rfPred 0 1
## 0 22 4
## 1 2 2
1 - sum(diag(rfTable))/sum(rfTable) ## test set error rate
## [1] 0.2
Algorithm:
Split the original dataset \(D\) into \(K\) folds, and obtain \(D_1, \ldots, D_k,\ldots, D_K\) such that \(D_1, \ldots, D_K\) are disjoint; \(|D_k|\) (size of \(D_k\)) is roughly the same for different \(k\) and \(\sum_k |D_k| = |D|\).
For each iteration \(k = 1, \ldots, K\), use \(D_k\) as the testing dataset and \(D_{-k} = D - D_k\) as the training dataset. Build the classifier \(f^k\) based on \(D_{-k}\), then predict on \(D_k\) and obtain the prediction error rate \(E_k\).
Use \(E = \frac{1}{K} \sum_k E_k\) as the final prediction error rate.
# Create K = 5 equal-sized folds (consecutive blocks of rows)
K <- 5
folds <- cut(seq(1,nrow(prostate0)),breaks=K,labels=FALSE)
EKs <- numeric(K)
for(k in 1:K){
atrain <- subset(prostate0, folds != k)
atest <- subset(prostate0, folds == k)
krfFit <- randomForest(svi ~ ., data=atrain)
krfPred <- predict(krfFit, atest)
krfTable <- table(krfPred, truth=atest$svi)
EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}
EKs
## [1] 0.0000000 0.0000000 0.0000000 0.3157895 0.3500000
mean(EKs)
## [1] 0.1331579
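Note that cut() above assigns consecutive rows to the same fold. A minimal tweak (my addition, not part of the original analysis) is to shuffle the fold assignment with sample() before running the same loop; the individual \(E_k\) values would then differ from the output above.
## randomly permute the fold labels so each fold is a random subset of the rows
set.seed(32611)
folds <- sample(cut(seq(1,nrow(prostate0)),breaks=K,labels=FALSE))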
Leave-one-out cross validation: the special case of K-fold cross validation with \(K = n\), so each fold contains a single sample.
n <- nrow(prostate0)
EKs <- numeric(n)
for(k in 1:n){
#cat("leaving the",k,"sample out!","\n")
atrain <- prostate0[-k,]
atest <- prostate0[k,]
krfFit <- randomForest(svi ~ ., data=atrain)
krfPred <- predict(krfFit, atest)
krfTable <- table(krfPred, truth=atest$svi)
EKs[k] <- 1 - sum(diag(krfTable))/sum(krfTable)
}
EKs
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
## [71] 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
mean(EKs)
## [1] 0.1443299
Some machine learning classifiers contain tuning parameters.
The tuning parameter is crucial since it controls the amount of regularization (model complexity).
For example, the lasso: \[\hat{\beta}^{lasso} = \arg\min_{\beta \in \mathbb{R}^P} \frac{1}{2}\|y - X\beta\|_2^2 + \lambda \|\beta\|_1\]
library(lars)
## Loaded lars 1.2
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
options(stringsAsFactors=F)
x <- as.matrix(prostate[,1:8])
y <- prostate[,9]
lassoFit <- lars(x, y) ## lar for least angle regression
lambdas <- seq(0,5,0.5)
coefLassoFit <- coef(lassoFit, s=lambdas, mode="lambda")
adataFrame <- NULL
for(i in 1:ncol(coefLassoFit)){
acoef <- colnames(coefLassoFit)[i]
adataFrame0 <- data.frame(lambda=lambdas, coef = coefLassoFit[,i], variable=acoef)
adataFrame <- rbind(adataFrame, adataFrame0)
}
ggplot(adataFrame) + aes(x=lambda, y=coef, col=variable) +
geom_point() +
geom_path()
Split the original dataset \(D\) into \(K\) folds, and obtain \(D_1, \ldots, D_k,\ldots, D_K\) such that \(D_1, \ldots, D_K\) are disjoint; \(|D_k|\) (size of \(D_k\)) is roughly the same for different \(k\) and \(\sum_k |D_k| = |D|\).
Pre-specify a range of tuning parameters \(\lambda_1, \ldots, \lambda_B\).
For each iteration \(k = 1, \ldots, K\) and each tuning parameter \(\lambda_b\), build the classifier \(\hat{f}_{\lambda_b}^{-k}\) on \(D_{-k} = D - D_k\) and evaluate it on \(D_k\) to obtain \(e_k(\lambda_b)\).
Use \(e(\lambda_b) = \frac{1}{K} \sum_k e_k(\lambda_b)\) as the average performance (MSE, rMSE or classification error rate) for tuning parameter \(\lambda_b\).
Choose the tuning parameter \(\hat{\lambda} = \arg \min_{\lambda\in (\lambda_1, \ldots, \lambda_B)} e(\lambda)\).
Continuous outcome MSE: \[e(\lambda) = \frac{1}{K} \sum_{k=1}^K \sum_{i \in D_k} (y_i - \hat{f}_\lambda^{-k}(x_i))^2\]
Continuous outcome rMSE: \[e(\lambda) = \frac{1}{K} \sum_{k=1}^K \sqrt{\sum_{i \in D_k} (y_i - \hat{f}_\lambda^{-k}(x_i))^2}\]
Categorical outcome variable classification error rate: \[e(\lambda) = \frac{1}{K} \sum_{k=1}^K \frac{1}{|D_k|}\sum_{i \in D_k} \mathbb{I}(y_i \ne \hat{f}_\lambda^{-k}(x_i))\]
K <- 10
folds <- cut(seq(1,nrow(prostate)),breaks=K,labels=FALSE)
lambdas <- seq(0.5,5,0.5)
rMSEs <- matrix(NA,nrow=length(lambdas),ncol=K)
rownames(rMSEs) <- lambdas
colnames(rMSEs) <- 1:K
for(k in 1:K){
atrainx <- as.matrix(prostate[!folds==k,1:8])
atestx <- as.matrix(prostate[folds==k,1:8])
atrainy <- prostate[!folds==k,9]
atesty <- prostate[folds==k,9]
klassoFit <- lars(atrainx, atrainy) ## lar for least angle regression
predictValue <- predict.lars(klassoFit, atestx, s=lambdas, type="fit", mode="lambda")
rMSE <- apply(predictValue$fit,2, function(x) sqrt(sum((x - atesty)^2))) ## square root MSE
rMSEs[,k] <- rMSE
}
rMSEdataFrame <- data.frame(lambda=lambdas, rMSE = rowMeans(rMSEs))
ggplot(rMSEdataFrame) + aes(x=lambda, y = rMSE) + geom_point() + geom_path()
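To complete the last step of the algorithm, pick the tuning parameter with the smallest average rMSE; a one-line follow-up using the rMSEs matrix computed above (bestLambda is just an illustrative name):
bestLambda <- lambdas[which.min(rowMeans(rMSEs))] ## lambda with the smallest cross-validated rMSE
bestLambda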
For Step 1, can we also use cross validation?
library(MASS)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(1,-1), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-1,1), matrix(c(2, 1, 1, 2), 2, 2))
atest <- rbind(c(1,1),c(2,-2),c(-2,2))
d <- rbind(d1, d2,atest)
label <- c(rep("A", N), rep("B", N),rep("test",3))
text <- label
text[label!="test"] <- ""
data_train <- data.frame(label=label, x=d[,1], y=d[,2], text=text)
names(data_train)<-c("label", "x", "y", "text")
ggplot(data_train, aes(x=x,y=y,col=label, label = text)) +
geom_point() + geom_text(vjust = -1,size=5)
KNN classification (majority vote among the \(k\) nearest neighbors \(N_k(x)\)): \[\hat{f}(x) = \text{mode}_{i \in N_k(x)} y_i\]
KNN regression (average over the \(k\) nearest neighbors): \[\hat{f}(x) = \frac{1}{k} \sum_{i \in N_k(x)} y_i\]
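To make the majority-vote formula concrete, here is a toy implementation (myKnn is a hypothetical helper written only for illustration; the class::knn function used below does the same thing efficiently):
## toy KNN classifier implementing the majority-vote formula directly
myKnn <- function(xtrain, ytrain, xnew, k = 5){
  dists <- sqrt(colSums((t(xtrain) - xnew)^2)) ## Euclidean distance from xnew to every training point
  nn <- order(dists)[1:k]                      ## indices of the k nearest neighbors
  names(which.max(table(ytrain[nn])))          ## majority vote (the mode)
}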
library(ElemStatLearn)
library(class)
prostate0 <- prostate
prostate0$svi <- as.factor(prostate0$svi)
trainLabel <- prostate$train
prostate0$train <- NULL ## drop the train indicator so it is not used as a predictor
## here 5 is the column index of svi in prostate0
adata_train <- prostate0[trainLabel, -5]
adata_test <- prostate0[!trainLabel, -5]
alabel_train <- prostate0[trainLabel, 5]
alabel_test <- prostate0[!trainLabel, 5]
knnFit <- knn(adata_train, adata_test, alabel_train, k=5)
knnTable <- table(knnFit, truth=alabel_test)
knnTable
## truth
## knnFit 0 1
## 0 22 5
## 1 2 1
1 - sum(diag(knnTable))/sum(knnTable) ## test set error rate
## [1] 0.2333333
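The number of neighbors \(k\) is itself a tuning parameter. A minimal sketch of choosing it by leave-one-out cross validation on the training data, using knn.cv() from the class package (the candidate grid 1:20 is my arbitrary choice):
## leave-one-out CV error on the training set for each candidate k
ks <- 1:20
looErrors <- sapply(ks, function(k) mean(knn.cv(adata_train, alabel_train, k=k) != alabel_train))
ks[which.min(looErrors)] ## candidate k with the smallest LOOCV error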
Problem setting: assume the density of \(X\) within class \(Y = k\) is multivariate Gaussian,
\[f(X|Y=k) = \frac{1}{(2\pi)^{p/2} |\Sigma_k|^{1/2}} \exp ( - \frac{1}{2} (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k))\]
library(MASS)
library(ggplot2)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(1,-1), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-1,1), matrix(c(2, 1, 1, 2), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("2", N))
data_train <- data.frame(label=label, x1=d[,1], x2=d[,2])
names(data_train)<-c("label", "x1", "x2")
ggplot(data_train) + aes(x=x1,y=x2,col=label) + geom_point() + stat_ellipse()
Gaussian density: \[f(X|Y=k) = \frac{1}{(2\pi)^{p/2} |\Sigma_k|^{1/2}} \exp ( - \frac{1}{2} (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k))\]
The Bayes rule:
\(\begin{aligned} C_{Bayes}(X) & = \arg \max_k f(Y=k | X) \\ & = \arg \max_k \frac{f(Y=k)f(X|Y=k)}{f(X)} \\ & = \arg \max_k f(Y=k)f(X|Y=k) \\ & = \arg \max_k \log f(Y=k) + \log f(X|Y=k) \\ & = \arg \max_k \log f(Y=k) - \frac{1}{2} \log |\Sigma_k| - \frac{1}{2} (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k) \\ & = \arg \min_k - 2 \log f(Y=k) + \log |\Sigma_k| + (X - \mu_k)^\top \Sigma_k^{-1} (X - \mu_k) \end{aligned}\)
The decision boundary for class \(m\) and class \(l\) is: \[f(Y=l|X) = f(Y=m|X)\]
Or equivalently,
\(\begin{aligned} & - 2 \log f(Y=l) + \log |\Sigma_l| + (X - \mu_l)^\top \Sigma_l^{-1} (X - \mu_l) \\ & = - 2 \log f(Y=m) + \log |\Sigma_m| + (X - \mu_m)^\top \Sigma_m^{-1} (X - \mu_m) \end{aligned}\)
Further assuming a uniform prior \(f(Y=l) = f(Y=m)\) and a common covariance structure \(\Sigma_l = \Sigma_m = \Sigma\), the decision hyperplane simplifies to:
\[2 (\mu_l - \mu_m)^\top \Sigma^{-1} X - (\mu_l - \mu_m)^\top \Sigma^{-1} (\mu_l + \mu_m) = 0\]
\[2 (\mu_l - \mu_m)^\top \Sigma^{-1} (X - \frac{\mu_l + \mu_m}{2}) = 0\]
You can see the decision boundary is linear and passes through the midpoint \(\frac{\mu_l + \mu_m}{2}\) of the two class centers.
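As a sanity check on this formula (a sketch I am adding, using the simulated data_train above), one can plug in the sample means and the pooled covariance estimate and draw the implied line; with a uniform prior the boundary is \((\hat{\mu}_1 - \hat{\mu}_2)^\top \hat{\Sigma}^{-1} (x - \frac{\hat{\mu}_1 + \hat{\mu}_2}{2}) = 0\):
## estimate class means and the pooled covariance from the simulated data
X <- as.matrix(data_train[, c("x1","x2")])
mu1 <- colMeans(X[data_train$label == "1", ])
mu2 <- colMeans(X[data_train$label == "2", ])
Sigma <- ((N - 1)*cov(X[data_train$label == "1", ]) + (N - 1)*cov(X[data_train$label == "2", ])) / (2*N - 2)
w <- solve(Sigma, mu1 - mu2)    ## normal vector of the boundary
b <- -sum(w * (mu1 + mu2) / 2)  ## intercept: boundary is w[1]*x1 + w[2]*x2 + b = 0
ggplot(data_train) + aes(x=x1, y=x2, col=label) + geom_point() +
  geom_abline(slope = -w[1]/w[2], intercept = -b/w[2])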
library(MASS)
alda <- lda(label ~ . ,data=data_train)
avec <- seq(-4,4,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x1=z[,1],x2=z[,2])
predict_lda <- predict(alda, z)$class
z_lda <- cbind(region=as.factor(predict_lda), z)
ggplot(z_lda) + aes(x=x1,y=x2,col=region) + geom_point(alpha = 0.4) + geom_point(data = data_train, aes(x=x1,y=x2,col=label)) +
labs(title = "LDA boundary")
Further assuming a uniform prior \(f(Y=l) = f(Y=m)\) but different covariance structures \(\Sigma_l \ne \Sigma_m\), the decision boundary simplifies to:
\(\begin{aligned} \log |\Sigma_l| + (X - \mu_l)^\top \Sigma_l^{-1} (X - \mu_l) = \log |\Sigma_m| + (X - \mu_m)^\top \Sigma_m^{-1} (X - \mu_m) \end{aligned}\)
The decision boundary is now a quadratic function of \(X\).
library(MASS)
library(ggplot2)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(0,2), matrix(c(4, 0, 0, 1), 2, 2))
d2<-mvrnorm(N, c(0,-2), matrix(c(1, 0, 0, 4), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("2", N))
data_train <- data.frame(label=label, x1=d[,1], x2=d[,2])
names(data_train)<-c("label", "x1", "x2")
ggplot(data_train) + aes(x=x1,y=x2,col=label) + geom_point() + stat_ellipse()
library(MASS)
aqda <- qda(label ~ . ,data=data_train)
avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x1=z[,1],x2=z[,2])
predict_qda <- predict(aqda, z)$class
z_qda <- cbind(region=as.factor(predict_qda), z)
ggplot(z_qda) + aes(x=x1,y=x2,col=region) + geom_point(alpha = 0.4) + geom_point(data = data_train, aes(x=x1,y=x2,col=label)) +
labs(title = "QDA boundary")
Motivation: find a boundary that separates data with different labels.
Linearly separable case
library(MASS)
library(ggplot2)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(3,-3), matrix(c(1, 0, 0, 1), 2, 2))
d2<-mvrnorm(N, c(-3,3), matrix(c(1, 0, 0, 1), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("-1", N))
data_train <- data.frame(label=label, x1=d[,1], x2=d[,2])
names(data_train)<-c("label", "x1", "x2")
p <- ggplot(data_train) + aes(x=x1,y=x2,col=label) + geom_point()
p + geom_abline(slope=1,intercept=0, col="black") +
geom_abline(slope=0.8,intercept=0.5, col="green") +
geom_abline(slope=1.4,intercept=-0.4, col = "orange")
p +
geom_abline(slope=1,intercept=max(d1[,2] - d1[,1]), col="black", linetype = "dashed") +
geom_abline(slope=1,intercept=min(d2[,2] - d2[,1]), col="black", linetype = "dashed") +
geom_abline(slope=1,intercept=(max(d1[,2] - d1[,1]) + min(d2[,2] - d2[,1]))/2, col="black")
Black line (separating boundary), with the intercept absorbed into \(w\) (i.e., \(x\) is augmented with a constant 1): \[f(x) = w^\top x = w_1 x_1 + w_2 x_2 + c\]
Red dots (label \(y = -1\)): \[w^\top x \le -1\] If this does not hold exactly, we can re-scale \(w\) so that it does.
Blue dots (label \(y = 1\)): \[w^\top x \ge 1\]
Unifying the two cases: \[y w^\top x \ge 1\]
The two dashed margin lines: \[w^\top x + 1 = 0 \quad \text{and} \quad w^\top x - 1 = 0\]
Margin:
\[\frac{w^\top x + 1 - (w^\top x - 1)}{2\|w\|_2} = \frac{2}{2\|w\|_2} = \frac{1}{\|w\|_2}\]
\(\max_{w} \frac{1}{\|w\|_2}\) subject to \(y_i (w^\top x_i ) \ge 1\)
\(\frac{1}{\|w\|_2}\) is not easy to maximize, but \(\|w\|_2^2\) is easy to minimize:
SVM (linear separable case):
\(\min_{w} \frac{1}{2} \|w\|_2^2\) subject to \(y_i (w^\top x_i ) \ge 1\) This is the SVM primal objective function.
\[L(w, \alpha) = \frac{1}{2} \|w\|_2^2 - \sum_i \alpha_i [y_i (w^\top x_i ) - 1]\] By the KKT conditions, \(\frac{\partial L}{\partial w}\) needs to be 0 at the optimal solution.
\[\frac{\partial L}{\partial w_j} = w_j - \sum_i \alpha_i y_i x_{ij} = 0\] \[w_j = \sum_i \alpha_i y_i x_{ij}\]
Plugging these back into the Lagrangian \(L\) gives the dual problem: \[\min_\alpha \frac{1}{2} \sum_{i,i'} \alpha_i \alpha_{i'} y_i y_{i'} x_i^\top x_{i'} - \sum_i \alpha_i\] subject to \(\alpha_i \ge 0\).
This is the SVM dual objective function.
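Filling in the algebra of the plug-in step: substituting \(w_j = \sum_i \alpha_i y_i x_{ij}\) into \(L(w, \alpha)\),
\(\begin{aligned} L(w, \alpha) &= \frac{1}{2}\sum_{i,i'} \alpha_i \alpha_{i'} y_i y_{i'} x_i^\top x_{i'} - \sum_{i,i'} \alpha_i \alpha_{i'} y_i y_{i'} x_i^\top x_{i'} + \sum_i \alpha_i \\ &= \sum_i \alpha_i - \frac{1}{2}\sum_{i,i'} \alpha_i \alpha_{i'} y_i y_{i'} x_i^\top x_{i'}, \end{aligned}\)
so maximizing \(L\) over \(\alpha_i \ge 0\) is equivalent to the minimization problem stated above.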
p +
geom_abline(slope=1,intercept=max(d1[,2] - d1[,1]), col="black", linetype = "dashed") +
geom_abline(slope=1,intercept=min(d2[,2] - d2[,1]), col="black", linetype = "dashed") +
geom_abline(slope=1,intercept=(max(d1[,2] - d1[,1]) + min(d2[,2] - d2[,1]))/2, col="black")
From the KKT (complementary slackness) condition:
\[\alpha_i [y_i w^\top x_i - 1] = 0\]
So \(\alpha_i > 0\) only for the points with \(y_i w^\top x_i = 1\), i.e., the points lying on the dashed margin lines. These are the support vectors, and they are the only points that contribute to \(w = \sum_i \alpha_i y_i x_i\).
library(e1071)
library(ggplot2)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(3,-3), matrix(c(1, 0, 0, 1), 2, 2))
d2<-mvrnorm(N, c(-3,3), matrix(c(1, 0, 0, 1), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("-1", N))
data_train <- data.frame(label=factor(label), x1=d[,1], x2=d[,2])
names(data_train)<-c("label", "x1", "x2")
asvm <- svm(label ~ x1 + x2 ,data=data_train, kernel = "linear")
avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x1=z[,1],x2=z[,2])
predict_svm <- predict(asvm, z)
z_svm <- cbind(region=as.factor(predict_svm), z)
ggplot(z_svm) + aes(x=x1,y=x2,col=region) + geom_point(alpha = 0.4) + geom_point(data = data_train, aes(x=x1,y=x2,col=label)) +
labs(title = "SVM boundary")
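Connecting back to the complementary slackness condition: the fitted e1071 object records which training points are support vectors (those with \(\alpha_i > 0\)); a quick follow-up to the asvm fit above:
asvm$index         ## row indices of the support vectors in data_train
length(asvm$index) ## number of support vectors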
\[w_j = \sum_i \alpha_i y_i x_{ij}\]
\[\min_\alpha \frac{1}{2} \sum_{i,i'} \alpha_i \alpha_{i'} y_i y_{i'} x_i^\top x_{i'} - \sum_i \alpha_i \] subject to \(\alpha_i \ge 0\). Both the solution and the dual depend on the data only through the inner products \(x_i^\top x_{i'}\), so these can be replaced by a kernel \(K(x_i, x_{i'})\) (the kernel trick), which is how SVM handles non-linear boundaries such as the example below.
library(e1071)
library(ggplot2)
N<-100
set.seed(32611)
theta1 <- runif(N,0,2*pi)
r1 <- runif(N,2,3)
theta2 <- runif(N,0,2*pi)
r2 <- runif(N,4,5)
d1 <- cbind(r1 * cos(theta1), r1 * sin(theta1))
d2 <- cbind(r2 * cos(theta2), r2 * sin(theta2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("-1", N))
data_train <- data.frame(label=factor(label), x=d[,1], y=d[,2])
names(data_train)<-c("label", "x", "y")
ggplot(data_train, aes(x=x,y=y,col=label))+ geom_point()
asvm <- svm(label ~ . ,data=data_train, kernel = "radial")
avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x=z[,1],y=z[,2])
predict_svm <- predict(asvm, z)
z_svm <- cbind(region=as.factor(predict_svm), z)
ggplot(z_svm) + aes(x=x,y=y,col=region) + geom_point(alpha = 0.4) + geom_point(data = data_train, aes(x=x,y=y,col=label)) +
labs(title = "SVM boundary")
\[\exp(x) = 1 + x + \frac{x^2}{2!} + \frac{x^3}{3!} + \ldots\]
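This expansion is presumably here to show why the radial (Gaussian) kernel \(K(x, x') = \exp(-\gamma \|x - x'\|_2^2)\) (the kernel = "radial" option used above) corresponds to an infinite-dimensional feature map:
\[\exp(-\gamma \|x - x'\|_2^2) = \exp(-\gamma \|x\|_2^2)\, \exp(-\gamma \|x'\|_2^2)\, \exp(2\gamma x^\top x') = \exp(-\gamma \|x\|_2^2)\, \exp(-\gamma \|x'\|_2^2) \sum_{j=0}^{\infty} \frac{(2\gamma x^\top x')^j}{j!},\]
i.e., an inner product of feature vectors that contain polynomial terms of every order.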
library(MASS)
library(ggplot2)
set.seed(32611)
N<-100
d1<-mvrnorm(N, c(2,-2), matrix(c(2, 1, 1, 2), 2, 2))
d2<-mvrnorm(N, c(-2,2), matrix(c(2, 1, 1, 2), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("-1", N))
data_train <- data.frame(label=factor(label), x=d[,1], y=d[,2])
names(data_train)<-c("label", "x", "y")
p <- ggplot(data_train) + aes(x=x,y=y,col=label) + geom_point()
p + geom_abline(slope=1,intercept=(max(d1[,2] - d1[,1]) + min(d2[,2] - d2[,1]))/2, col="black")
\[\min_{w} \frac{1}{2} \|w\|_2^2 + C\|\zeta\|_2^2\] subject to \(y_i (w^\top x_i ) \ge 1 - \zeta_i\) and \(\zeta_i \ge 0\), where the slack variables \(\zeta_i\) allow points to violate the margin and \(C\) controls how heavily violations are penalized.
library(e1071)
library(ggplot2)
asvm <- svm(label ~ . ,data=data_train, kernel = "linear")
avec <- seq(-6,6,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x=z[,1],y=z[,2])
predict_svm <- predict(asvm, z)
z_svm <- cbind(region=as.factor(predict_svm), z)
ggplot(z_svm) + aes(x=x,y=y,col=region) + geom_point(alpha = 0.4) + geom_point(data = data_train, aes(x=x,y=y,col=label)) +
labs(title = "SVM boundary")
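In e1071::svm the penalty on margin violations is controlled by the cost argument (default cost = 1); a minimal sketch of refitting with a larger cost (asvm_largeC is an illustrative name, and the resulting boundary would differ somewhat from the plot above):
asvm_largeC <- svm(label ~ . ,data=data_train, kernel = "linear", cost = 100) ## heavier penalty on margin violations
table(predict(asvm_largeC, data_train), truth=data_train$label)               ## training-set confusion matrix
In practice cost can itself be chosen by cross validation (e.g., with e1071::tune.svm), in the same way the lasso \(\lambda\) was chosen earlier.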
Classifier | Package | Function |
---|---|---|
Logistic regression | stats | glm with parameter family=binomial() |
Linear and quadratic discriminant analysis | MASS | lda, qda |
DLDA and DQDA | sma | stat.diag.da |
KNN classification | class | knn |
CART | rpart | rpart |
Random forest | randomForest | randomForest |
Support vector machines | e1071 | svm |