Biostatistical Computing, PHC 6068

Simulation studies 2

Zhiguang Huo (Caleb)

Wednesday November 1, 2017

Outline

- Compare machine learning classifiers
- Central limit theorem
- Multiple testing, Bonferroni correction, false discovery rate
- Validate a distribution as a mixture
- Wald test, score test and likelihood ratio test

Package setup

library(MASS)
library(rpart)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin

Compare machine learning classifiers

set.seed(32611)
N <- 100
## training data: two classes drawn from bivariate normals with different means
d1 <- mvrnorm(N, c(1, -1), matrix(c(2, 1, 1, 2), 2, 2))
d2 <- mvrnorm(N, c(-1, 1), matrix(c(2, 1, 1, 2), 2, 2))
d <- rbind(d1, d2)
label <- c(rep("1", N), rep("2", N))
data_train <- data.frame(label = label, x = d[, 1], y = d[, 2])
names(data_train) <- c("label", "x", "y")

ggplot(data_train) + aes(x=x,y=y,col=label) + geom_point()

CART

acart <- rpart(label ~ ., data = data_train)

## predict over a dense grid to visualize the decision boundary
avec <- seq(-4, 4, 0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x = z[, 1], y = z[, 2])
predict_cart <- predict(acart, z, type = "vector")

z_cart <- cbind(region=as.factor(predict_cart), z)
ggplot(z_cart) + aes(x=x,y=y,col=region) + geom_point() + 
  labs(title = "CART boundary")

Random forest

aRF <- randomForest(label ~ . ,data=data_train)

avec <- seq(-4,4,0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x=z[,1],y=z[,2])
predict_RF <- predict(aRF, z)

z_RF <- cbind(region=as.factor(predict_RF), z)
ggplot(z_RF) + aes(x=x,y=y,col=region) + geom_point() + 
  labs(title = "Random forest boundary")

Logistic regression

## glm with a factor response treats the first level ("1") as 0 and the second ("2") as 1
aLogistic <- glm(label ~ ., data = data_train, family = binomial())

avec <- seq(-4, 4, 0.1)
z <- expand.grid(avec, avec)
z <- data.frame(x = z[, 1], y = z[, 2])
predict_Logistic <- ifelse(predict(aLogistic, z, type = "response") > 0.5, 2, 1)

z_Logistic <- cbind(region=as.factor(predict_Logistic), z)
ggplot(z_Logistic) + aes(x=x,y=y,col=region) + geom_point() + 
  labs(title = "logistic regression boundary")

Compare which classifier works best in terms of accuracy

set.seed(32611)
N_test <- 10000
## a large independent test set from the same two distributions
d1_test <- mvrnorm(N_test, c(1, -1), matrix(c(2, 1, 1, 2), 2, 2))
d2_test <- mvrnorm(N_test, c(-1, 1), matrix(c(2, 1, 1, 2), 2, 2))
d_test <- rbind(d1_test, d2_test)
label_test <- c(rep("1", N_test), rep("2", N_test))
data_test <- data.frame(label = label_test, x = d_test[, 1], y = d_test[, 2])
names(data_test) <- c("label", "x", "y")

ggplot(data_test) + aes(x=x,y=y,col=label) + geom_point(alpha = 0.7)

Prediction accuracy for CART

prediction_test_cart <- predict(acart, data_test, type="vector")
table_test_cart <- table(prediction_test_cart, data_test$label)
table_test_cart
##                     
## prediction_test_cart    1    2
##                    1 8681 1239
##                    2 1319 8761
accuracy_test_cart <- sum(diag(table_test_cart))/sum(table_test_cart)
accuracy_test_cart
## [1] 0.8721

Prediction accuracy for random forest

prediction_test_RF <- predict(aRF, data_test)
table_test_RF <- table(prediction_test_RF, data_test$label)
table_test_RF
##                   
## prediction_test_RF    1    2
##                  1 9053  994
##                  2  947 9006
accuracy_test_RF <- sum(diag(table_test_RF))/sum(table_test_RF)
accuracy_test_RF
## [1] 0.90295

Prediction accuracy for logistic regression

prediction_test_logistic <- ifelse(predict(aLogistic, data_test, type="response")>0.5,2,1)
table_test_logistic <- table(prediction_test_logistic, data_test$label)
table_test_logistic
##                         
## prediction_test_logistic    1    2
##                        1 9113  743
##                        2  887 9257
accuracy_test_logistic <- sum(diag(table_test_logistic))/sum(table_test_logistic)
accuracy_test_logistic
## [1] 0.9185

Central limit theorem

Suppose \(\{X_1, X_2, \ldots, X_n\}\) is a sequence of i.i.d. random variables with \(E(X_i) = \mu\) and \(Var(X_i) = \sigma^2 < \infty\). Then as \(n\) approaches infinity, the following random variable converges in distribution to a standard normal \(N(0,1)\):

\[\frac{\sqrt{n} (\frac{1}{n} \sum_{i=1}^n X_i - \mu)}{\sigma} \rightarrow N(0, 1)\]

Poisson distribution

\[f(k) = \frac{\lambda^k \exp(-\lambda)}{k!}\]

For \(X_i \sim Poisson(\lambda)\), \(E(X_i) = Var(X_i) = \lambda\), so the standardized quantity below uses \(\mu = \lambda\) and \(\sigma = \sqrt{\lambda}\).

n = 10

n <- 10
lambda <- 5

B <- 5000
collection <- numeric(B)
  
for(b in 1:B){
  set.seed(b)
  x <- rpois(n = n, lambda = lambda)
  ## standardized sample mean; by the CLT this is approximately N(0, 1)
  collection[b] <- sqrt(n) * (mean(x) - lambda) / sqrt(lambda)
}

hist(collection)

qqnorm(collection)
abline(a = 0, b = 1, col = 2)

shapiro.test(collection)
## 
##  Shapiro-Wilk normality test
## 
## data:  collection
## W = 0.99736, p-value = 1.192e-07

n = 40

n <- 40
lambda <- 5

B <- 5000
collection <- numeric(B)
  
for(b in 1:B){
  set.seed(b)
  x <- rpois(n = n, lambda = lambda)
  collection[b] <- sqrt(n) * (mean(x) - lambda) / sqrt(lambda)
}

hist(collection)

qqnorm(collection)
abline(a = 0, b = 1, col = 2)

shapiro.test(collection)
## 
##  Shapiro-Wilk normality test
## 
## data:  collection
## W = 0.99916, p-value = 0.01562

n = 100

n <- 100
lambda <- 5

B <- 5000
collection <- numeric(B)
  
for(b in 1:B){
  set.seed(b)
  x <- rpois(n = n, lambda = lambda)
  collection[b] <- sqrt(n) * (mean(x) - lambda) / sqrt(lambda)
}

hist(collection)

qqnorm(collection)
abline(a = 0, b = 1, col = 2)

shapiro.test(collection)
## 
##  Shapiro-Wilk normality test
## 
## data:  collection
## W = 0.99955, p-value = 0.3084

As \(n\) increases, the Shapiro-Wilk p-value grows: normality is strongly rejected at \(n = 10\), borderline at \(n = 40\), and no longer rejected at \(n = 100\), consistent with the CLT.

Multiple testing

Suppose we perform 20 independent hypothesis tests, each at level \(\alpha = 0.05\), and all null hypotheses are true. Then

\(\begin{aligned} P(\mbox{at least one significant result}) &= 1 - P(\mbox{no significant result}) \\ & = 1 - (1 - 0.05)^{20} \\ & \approx 0.64 \\ \end{aligned}\)
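A quick arithmetic check in R:

1 - (1 - 0.05)^20
## [1] 0.6415141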

Bonferroni correction

With the Bonferroni correction, each test is instead performed at level \(\alpha / 20 = 0.0025\):

\(\begin{aligned} P(\mbox{at least one significant result}) &= 1 - P(\mbox{no significant result}) \\ & = 1 - (1 - 0.0025)^{20} \\ & \approx 0.0488 \\ \end{aligned}\)
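And at the corrected per-test level:

1 - (1 - 0.05/20)^20
## [1] 0.04883012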

Simulation experiment

B <- 1000
n <- 20
alpha <- 0.05

count <- 0
for(b in 1:B){
  set.seed(b)
  ## n p-values under the global null are Uniform(0, 1)
  ap <- runif(n)
  
  ## reject the family if any p-value passes the Bonferroni threshold alpha/n
  if(any(ap < alpha/n)){
    count <- count + 1
  }
}
count/B
## [1] 0.045

The empirical family-wise error rate 0.045 is close to the theoretical value of about 0.0488 derived above.
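The same decision rule can be written with p.adjust, since the Bonferroni adjustment multiplies each p-value by n; the snippet below is a sketch, equivalent to thresholding at alpha/n:

set.seed(1)
ap <- runif(n)
## identical decision to any(ap < alpha/n)
any(p.adjust(ap, "bonferroni") < alpha)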

False discovery rate

Simulation setting

B1 <- 9000 ## number of null features
B2 <- 1000 ## number of true signals
B <- B1 + B2
n <- 10

SigLabel <- c(rep(F, B1), rep(T, B2))

pval <- numeric(B)

for(b in 1:B){
  set.seed(b)
  if(b <= B1){
    x <- rnorm(n)           ## null: mean 0
  } else {
    x <- rnorm(n, mean = 2) ## signal: mean 2
  }
  
  pval[b] <- t.test(x = x)$p.value
}

False discovery rate example

qvalue <- p.adjust(pval, "BH")
test_fdr <- qvalue < 0.05
table_fdr <- table(SigLabel, test_fdr)
table_fdr
##         test_fdr
## SigLabel FALSE TRUE
##    FALSE  8951   49
##    TRUE     27  973
## empirical false discovery proportion: false discoveries / all discoveries
fdr_fdr <- sum(test_fdr & !SigLabel) / sum(test_fdr)
fdr_fdr
## [1] 0.04794521
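For intuition, here is a minimal sketch of what p.adjust(pval, "BH") computes: sort the p-values, scale the i-th smallest by \(m/i\), then enforce monotonicity from the largest down.

## manual Benjamini-Hochberg adjustment; should match p.adjust(pval, "BH")
m <- length(pval)
ord <- order(pval)
q_manual <- numeric(m)
q_manual[ord] <- pmin(1, rev(cummin(rev(pval[ord] * m / seq_len(m)))))
all.equal(q_manual, p.adjust(pval, "BH"))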

Comparison - raw p-value

test_raw <- pval < 0.05
table_raw <- table(SigLabel, test_raw)
table_raw
##         test_raw
## SigLabel FALSE TRUE
##    FALSE  8562  438
##    TRUE      0 1000
## false discovery proportion and false omission rate at the raw threshold
fdr_raw <- sum(test_raw & !SigLabel) / sum(test_raw)
fnr_raw <- sum(!test_raw & SigLabel) / sum(!test_raw)
fdr_raw
## [1] 0.3045897
fnr_raw
## [1] 0

Comparison - Bonferroni correction

test_bonferroni <- pval < 0.05/length(pval)
table_bonferroni <- table(SigLabel, test_bonferroni)
table_bonferroni
##         test_bonferroni
## SigLabel FALSE TRUE
##    FALSE  9000    0
##    TRUE    862  138
fdr_bonferroni <- sum(test_bonferroni & !SigLabel) / sum(test_bonferroni)
fnr_bonferroni <- sum(!test_bonferroni & SigLabel) / sum(!test_bonferroni)
fdr_bonferroni
## [1] 0
fnr_bonferroni
## [1] 0.08740621

Comparison - false discovery rate

qvalue <- p.adjust(pval, "BH")
test_fdr <- qvalue < 0.05
table_fdr <- table(SigLabel, test_fdr)
table_fdr
##         test_fdr
## SigLabel FALSE TRUE
##    FALSE  8951   49
##    TRUE     27  973
fdr_fdr <- sum(test_fdr & !SigLabel) / sum(test_fdr)
fnr_fdr <- sum(!test_fdr & SigLabel) / sum(!test_fdr)
fdr_fdr
## [1] 0.04794521
fnr_fdr
## [1] 0.003007351

BH keeps the false discovery proportion near the nominal 5% while missing far fewer signals than Bonferroni; the raw threshold misses nothing here but lets the false discovery proportion climb to about 30%.

Validate a distribution as a mixture of other distributions

If \(\mu \sim N(\mu_0, \sigma_0^2)\) and \(Y \mid \mu \sim N(\mu, \sigma^2)\), then marginally

\[Y \sim N(\mu_0, \sigma^2 + \sigma_0^2)\]

Simulations (try increasing n)

library(ggplot2)
mu0 <- 2
sigma <- 1
sigma0 <- 3

## using derived formula
n <- 1000
y1 <- rnorm(n, mean = mu0, sd = sqrt(sigma^2 + sigma0^2))

## using mixture approach
mu <- rnorm(n, mean = mu0, sd = sigma0)
y2 <- rnorm(n, mean = mu, sd = sigma)

df <- data.frame(x = c(y1, y2),
                 g = gl(2, n))
ggplot(df, aes(x, colour = g)) + geom_density()

Compare Wald test, score test and likelihood ratio test (in one dimension)

Poisson density:

\[f(y; \lambda) = \frac{\exp(-\lambda) \lambda^y}{y!}\]

Likelihood function:

\[L(\lambda; y_1, \ldots, y_n) = \prod_{i=1}^n \frac{\exp(-\lambda) \lambda^{y_i}}{y_i!}\]

Log likelihood function:

\(\begin{aligned} l(\lambda; y_1, \ldots, y_n) & = \sum_{i=1}^n \log \frac{\exp(-\lambda) \lambda^{y_i}}{y_i!} \\ & \propto \sum_i y_i \log \lambda - n \lambda \\ \end{aligned}\)

Score function (normalized by \(n\)):

\(\begin{aligned} s(\lambda) & = \frac{1}{n}\frac{\partial l(\lambda)}{\partial \lambda} \\ & = \frac{1}{n} \frac{\sum_i y_i}{\lambda} - 1 \\ \end{aligned}\)

Fisher information (per observation):

\(\begin{aligned} I(\lambda) & = - E\left(\frac{1}{n}\frac{\partial^2 l(\lambda)}{\partial \lambda^2}\right) \\ & = E\left(\frac{1}{n} \frac{\sum_i y_i}{\lambda^2}\right) \\ & = \frac{1}{\lambda} \\ \end{aligned}\)
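As a numerical sanity check on the derivation, \(n \cdot Var(s(\lambda))\) should be close to \(I(\lambda) = 1/\lambda\); a small sketch (the choices \(n = 50\) and \(B = 10000\) are arbitrary):

set.seed(32611)
lambda <- 5
n <- 50
B <- 10000

## normalized score s(lambda) = ybar / lambda - 1 for each simulated sample
s <- replicate(B, mean(rpois(n, lambda)) / lambda - 1)

n * var(s) ## should be close to 1/lambda = 0.2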

Compare Wald test, score test and likelihood ratio test (in one dimension)

Wald test:

\[n (\hat{\lambda} - \lambda_0)^2 I(\hat{\lambda}) \sim \chi^2(1)\]

Score test:

\[n s(\lambda_0)^2 I(\lambda_0)^{-1} \sim \chi^2(1)\]

Likelihood ratio test:

\[- 2 \log \left\{ \frac{\sup_{\lambda = \lambda_0}L(\lambda)}{\sup_{\lambda \in \Lambda}L(\lambda)} \right\} \sim \chi^2(1)\]

Compare Wald test, score test and likelihood ratio test

Simulation: check whether these tests achieve the advertised size \(\alpha = 0.05\) when the data are generated under the null \(\lambda = \lambda_0\).

Simulation – Wald test

lambda0 <- 5
n <- 20 ## increase
B <- 1000
alpha <- 0.05

count <- 0
for(b in 1:B){
  set.seed(b)
  y <- rpois(n, lambda = lambda0)
  lambda_hat <- mean(y)
  ## Wald statistic: n (lambda_hat - lambda0)^2 I(lambda_hat), with I(lambda) = 1/lambda
  T_wald <- n * (lambda_hat - lambda0)^2 * (1/lambda_hat)
  if(pchisq(T_wald, df = 1, lower.tail = F) < alpha){
    count <- count + 1
  }
}
count / B
## [1] 0.062

Simulation – Score test

lambda0 <- 5
n <- 20 ## increase
B <- 1000
alpha <- 0.05

count <- 0
for(b in 1:B){
  set.seed(b)
  y <- rpois(n, lambda = lambda0)
  lambda_hat <- mean(y)
  ## normalized score at lambda0; I(lambda0)^{-1} = lambda0
  s <- lambda_hat/lambda0 - 1
  T_score <- n * s^2 * lambda0
  if(pchisq(T_score, df = 1, lower.tail = F) < alpha){
    count <- count + 1
  }
}
count / B
## [1] 0.066

Simulation – likelihood ratio test

lambda0 <- 5
n <- 20 ## increase
B <- 1000
alpha <- 0.05

count <- 0
for(b in 1:B){
  set.seed(b)
  y <- rpois(n, lambda = lambda0)
  lambda_hat <- mean(y)
  ## log likelihood (up to a constant) at the MLE and at lambda0
  l <- n * (lambda_hat * log(lambda_hat) - lambda_hat)
  l0 <- n * (lambda_hat * log(lambda0) - lambda0)
  T_lr <- -2*(l0 - l)
  if(pchisq(T_lr, df = 1, lower.tail = F) < alpha){
    count <- count + 1
  }
}
count / B
## [1] 0.063

Compare Wald test, score test and likelihood ratio test – using a large sample

lambda0 <- 5
n <- 20 ## increase
B <- 1000
alpha <- 0.05

count_wald <- 0
count_score <- 0
count_lr <- 0
for(b in 1:B){
  set.seed(b)
  y <- rpois(n, lambda = lambda0)
  lambda_hat <- mean(y)
  s <- lambda_hat/lambda0 - 1
  l <- n * (lambda_hat * log(lambda_hat) - lambda_hat)
  l0 <- n * (lambda_hat * log(lambda0) - lambda0)

  T_wald <- n * (lambda_hat - lambda0)^2 * (1/lambda_hat)
  T_score <- n * s^2 * lambda0
  T_lr <- -2*(l0 - l)

  if(pchisq(T_wald, df = 1, lower.tail = F) < alpha) count_wald <- count_wald + 1
  if(pchisq(T_score, df = 1, lower.tail = F) < alpha) count_score <- count_score + 1
  if(pchisq(T_lr, df = 1, lower.tail = F) < alpha) count_lr <- count_lr + 1
}

count_wald / B
## [1] 0.062
count_score / B
## [1] 0.066
count_lr / B
## [1] 0.063

Will be on HW

Compare the power of the Wald test, score test, and likelihood ratio test.
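One possible skeleton (a sketch only; the alternative \(\lambda_1 = 6\) is an arbitrary choice): generate data under \(\lambda_1 \neq \lambda_0\), apply each test at level \(\alpha\), and record how often it rejects.

lambda0 <- 5
lambda1 <- 6 ## hypothetical alternative; vary this and n
n <- 20
B <- 1000
alpha <- 0.05

reject_wald <- reject_score <- reject_lr <- 0
for(b in 1:B){
  set.seed(b)
  y <- rpois(n, lambda = lambda1) ## data generated under the alternative
  lambda_hat <- mean(y)
  s <- lambda_hat/lambda0 - 1
  l <- n * (lambda_hat * log(lambda_hat) - lambda_hat)
  l0 <- n * (lambda_hat * log(lambda0) - lambda0)

  T_wald <- n * (lambda_hat - lambda0)^2 * (1/lambda_hat)
  T_score <- n * s^2 * lambda0
  T_lr <- -2*(l0 - l)

  if(pchisq(T_wald, df = 1, lower.tail = F) < alpha) reject_wald <- reject_wald + 1
  if(pchisq(T_score, df = 1, lower.tail = F) < alpha) reject_score <- reject_score + 1
  if(pchisq(T_lr, df = 1, lower.tail = F) < alpha) reject_lr <- reject_lr + 1
}

c(wald = reject_wald, score = reject_score, lr = reject_lr) / B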

knitr::purl("simulation2.rmd", output = "simulation2.R ", documentation = 2)
## processing file: simulation2.rmd
## output file: simulation2.R
## [1] "simulation2.R "