Zhiguang Huo (Caleb)
Wednesday November 6, 2017
Given samples \(X_1, \ldots, X_n\) from a distribution, how do we estimate other quantities, such as the variance or the squared mean? Similarly, appeal to the weak law of large numbers and plug in sample estimates.
Although the Bootstrap method is nonparametric, it can be used for inference about parameters in both parametric and nonparametric models.
For example, for the sample mean \(T_n = \bar{X}_n\), \[V_F (T_n) = \sigma^2 / n,\] and \(\sigma^2\) can be estimated by the sample variance:
\[\hat{\sigma}^2 = \frac{1}{n - 1}\sum_{i=1}^n (X_i - \bar{X})^2\]
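As a quick check, R's built-in var() computes exactly this \(1/(n-1)\) estimator (a minimal sketch with illustrative simulated data):

set.seed(32611)
x <- rnorm(50)
c(var(x), sum((x - mean(x))^2) / (length(x) - 1))  # the two agree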
Assume the distribution \(N(\mu,\sigma^2)\) is fully known. Then we can estimate the variance of a statistic (here, the sample median) by direct simulation:
mu <- 1          # true mean
sigma <- 1       # true sd
n <- 100         # sample size
B <- 1000        # number of simulated datasets
Ts <- numeric(B)
for(b in 1:B){
  set.seed(b)
  ax <- rnorm(n, mu, sigma)  # draw a fresh sample from the known distribution
  Ts[b] <- median(ax)        # statistic of interest: the sample median
}
varTest <- var(Ts)           # Monte Carlo estimate of the variance of the median
print(varTest)
## [1] 0.014573
Glivenko-Cantelli theorem: \[\sup_x | \hat{F}_n(x) - F(x)| \rightarrow 0 \quad \text{almost surely, as } n \rightarrow \infty\]
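A quick empirical check of this convergence (a minimal sketch with simulated standard normal data; the D statistic of ks.test is exactly this supremum):

set.seed(32611)
# sup_x |F_n(x) - F(x)| shrinks as n grows
sapply(c(100, 1000, 10000), function(n) ks.test(rnorm(n), pnorm)$statistic)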
Dvoretzky-Kiefer-Wolfowitz inequality: for any \(\varepsilon > 0\), \[P(\sup_x | \hat{F}_n(x) - F(x)| > \varepsilon) \le 2\exp(-2n\varepsilon^2)\]
library(ggplot2)
n <- 1000
df <- data.frame(x = rnorm(n, 0, 1))
base <- ggplot(df, aes(x)) + stat_ecdf()  # empirical CDF of the sample
base + stat_function(fun = pnorm, colour = "red") + xlim(c(-3, 3))  # overlay the true CDF
## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
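The DKW inequality also yields a \(1 - \alpha\) confidence band \(\hat{F}_n(x) \pm \varepsilon\) with \(\varepsilon = \sqrt{\log(2/\alpha)/(2n)}\). A minimal sketch, reusing n and df from the chunk above:

alpha <- 0.05
eps <- sqrt(log(2/alpha) / (2 * n))  # DKW band half-width
Fhat <- ecdf(df$x)                   # empirical CDF as a function
xs <- seq(-3, 3, length.out = 200)
# check whether the true CDF lies inside the (truncated) band on this grid
all(pnorm(xs) >= pmax(Fhat(xs) - eps, 0) & pnorm(xs) <= pmin(Fhat(xs) + eps, 1))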
How do we sample from the empirical distribution \(\hat{F}_n\)? Since \(\hat{F}_n\) puts mass \(1/n\) on each observation, sampling from it amounts to drawing from the observed data with replacement:
mu <- 1
sigma <- 1
n <- 100
set.seed(32611)
X <- rnorm(n, mu, sigma)     # the observed sample
B <- 1000                    # number of bootstrap resamples
Ts <- numeric(B)
for(b in 1:B){
  set.seed(b)
  ax <- sample(X, n, replace = TRUE)  # resample n points from the empirical CDF
  Ts[b] <- median(ax)
}
varTest <- var(Ts)           # bootstrap estimate of the variance of the median
print(varTest)
## [1] 0.01379818
Notation: since in general we don't know the distribution \(F\), we will calculate using the empirical CDF \(\hat{F}_n\) instead.
To summarize, the parametric analogue plugs the estimate \(\hat{\mu} = \bar{X}\) into the assumed model \(N(\mu, \sigma^2)\) (with \(\sigma\) known) and simulates from it:
mu <- 1
sigma <- 1       # assumed known
n <- 100
set.seed(32611)
X <- rnorm(n, mu, sigma)
muhat <- mean(X)             # plug-in estimate of mu
B <- 1000
Ts <- numeric(B)
for(b in 1:B){
  set.seed(b)
  ax <- rnorm(n, muhat, sigma)  # sample from the fitted parametric model
  Ts[b] <- median(ax)
}
varTest <- var(Ts)
print(varTest)
## [1] 0.014573
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
var_hat1 <- 4 * mean(X)^3 / n   # delta-method (plug-in) estimate of var(Xbar^2)
print(var_hat1)
## [1] 4.97006
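This plug-in estimate comes from the delta method: with \(g(\lambda) = \lambda^2\) and \(\mathrm{var}(\bar{X}) = \lambda/n\) for Poisson data, \[\mathrm{var}(g(\bar{X})) \approx g'(\lambda)^2 \, \mathrm{var}(\bar{X}) = (2\lambda)^2 \frac{\lambda}{n} = \frac{4\lambda^3}{n},\] and we substitute \(\hat{\lambda} = \bar{X}\).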
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
B <- 1000
TB <- numeric(B)
for(b in 1:B){
  set.seed(b)
  aX <- sample(X, n, replace = TRUE)  # nonparametric bootstrap resample
  TB[b] <- mean(aX)^2
}
var_hat2 <- var(TB)          # nonparametric bootstrap estimate of var(Xbar^2)
print(var_hat2)
## [1] 4.389935
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
lambdaHat <- mean(X)         # MLE of lambda
B <- 1000
TB <- numeric(B)
for(b in 1:B){
  set.seed(b)
  aX <- rpois(n, lambdaHat)  # parametric bootstrap: sample from the fitted model
  TB[b] <- mean(aX)^2
}
var_hat3 <- var(TB)          # parametric bootstrap estimate of var(Xbar^2)
print(var_hat3)
## [1] 5.261807
library(boot)
# statistic for boot(): despite the name, returns the squared mean of the resample
myMean <- function(data, indices){
  d <- data[indices]   # indices select the bootstrap resample
  mean(d)^2
}
## non-parametric bootstrap
boot_nonpara <- boot(data = X, statistic = myMean, R = B)
var(boot_nonpara$t)
## [,1]
## [1,] 4.208818
## parametric bootstrap: ran.gen(data, mle) simulates a dataset from the fitted model
genPois <- function(data, lambda){
  rpois(length(data), lambda)
}
boot_para <- boot(data = X, statistic = myMean, R = B,
                  sim = "parametric", ran.gen = genPois, mle = mean(X))
var(boot_para$t)
## [,1]
## [1,] 5.264366
We can define the 95% confidence interval using (if B = 10,000) \[[\hat{\theta}_n^{*(250)}, \hat{\theta}_n^{*(9750)}],\] where \(\hat{\theta}_n^{*(b)}\) denotes the \(b\)-th smallest of the \(B\) bootstrap replicates.
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
B <- 1000
TB <- numeric(B)
for(b in 1:B){
  set.seed(b)
  aX <- sample(X, n, replace = TRUE)
  TB[b] <- mean(aX)^2
}
quantile(TB, c(0.025, 0.975))   # percentile bootstrap CI
## 2.5% 97.5%
## 21.0681 29.2681
\[E(\bar{X}^2) = (E\bar{X})^2 + \mathrm{var}(\bar{X}) = \lambda^2 + \lambda/n,\] since \(\mathrm{var}(\bar{X}) = \lambda/n\) for Poisson data. We use this as the true value when checking coverage below.
lambda <- 5
n <- 100
truth <- lambda^2 + lambda/n   # true value of E(Xbar^2)
B <- 1000
Repeats <- 100
counts <- 0
plot(c(0, 100), c(0, Repeats), type = "n", xlab = "boot CI", ylab = "repeats index")
abline(v = truth, col = 2)
for(r in 1:Repeats){
  set.seed(r)
  X <- rpois(n, lambda)        # a fresh dataset for each repeat
  TB <- numeric(B)
  for(b in 1:B){
    set.seed(b)
    aX <- sample(X, n, replace = TRUE)
    TB[b] <- mean(aX)^2
  }
  segments(quantile(TB, 0.025), r, quantile(TB, 0.975), r)  # draw this repeat's CI
  if(quantile(TB, 0.025) < truth & truth < quantile(TB, 0.975)){
    counts <- counts + 1       # the CI covers the truth
  }
}
counts/Repeats                 # empirical coverage (nominally 0.95)
## [1] 0.93
library(boot)
# statistic: returns the squared mean of the bootstrap resample
myMean <- function(data, indices){
  d <- data[indices]
  mean(d)^2
}
boot_nonpara <- boot(data = X, statistic = myMean, R = B)
boot.ci(boot_nonpara, type = "perc")   # percentile CI via the boot package
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_nonpara, type = "perc")
##
## Intervals :
## Level Percentile
## 95% (22.85, 31.02 )
## Calculations and Intervals on Original Scale
?boot.ci
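For reference, boot.ci can report several interval types at once (the studentized type "stud" would additionally require bootstrap variances):

boot.ci(boot_nonpara, type = c("norm", "basic", "perc", "bca"))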
We can define the \(1 - \alpha\) confidence interval using \[[\hat{\theta}_n - q_{1 - \alpha/2}\hat{\sigma}_B, \ \hat{\theta}_n - q_{\alpha/2}\hat{\sigma}_B],\] where \(\hat{\theta}_n\) is the estimator from the original sample, \(\hat{\sigma}_B\) is the bootstrap se, and \(q_\beta\) is the \(\beta\) quantile of the studentized bootstrap statistics \(t^{(b)} = (\hat{\theta}_n^{*(b)} - \hat{\theta}_n)/\hat{\sigma}_n^{*(b)}\).
If \(\hat{\sigma}_n^{*(b)}\) is hard to estimate, we need another round of bootstrap to estimate it.
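A minimal sketch of that double (nested) bootstrap, reusing the Poisson sample X and n from the chunks above; the replication counts B_out and B_in are illustrative choices:

B_out <- 200   # outer bootstrap replications (kept small for speed)
B_in <- 50     # inner replications to estimate the se on each resample
TB2 <- numeric(B_out)
seB2 <- numeric(B_out)
for(b in 1:B_out){
  set.seed(b)
  aX <- sample(X, n, replace = TRUE)   # outer resample
  TB2[b] <- mean(aX)^2
  inner <- numeric(B_in)
  for(b2 in 1:B_in){
    inner[b2] <- mean(sample(aX, n, replace = TRUE))^2  # inner resample of aX
  }
  seB2[b] <- sd(inner)   # bootstrap se of the statistic on this resample
}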
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
B <- 1000
TB <- numeric(B)
seB <- numeric(B)
tB <- numeric(B)
lambdaHat <- mean(X)
That <- lambdaHat^2 + lambdaHat/n       # estimate from the original sample
for(b in 1:B){
  set.seed(b)
  aX <- sample(X, n, replace = TRUE)
  TB[b] <- mean(aX)^2
  seB[b] <- sqrt(4 * lambdaHat^3 / n)   # delta-method se (a fully studentized
                                        # bootstrap would recompute it from aX)
  tB[b] <- (TB[b] - That) / seB[b]      # studentized bootstrap statistic
}
se_boot <- sd(TB)                       # bootstrap se of the estimator
CI_l <- That - quantile(tB, 0.975) * se_boot
CI_h <- That - quantile(tB, 0.025) * se_boot
print(c(CI_l, CI_h))
## 97.5% 2.5%
## 20.89173 28.59832
lambda <- 5
n <- 100
truth <- lambda^2 + lambda/n
B <- 1000
Repeats <- 100
counts <- 0
plot(c(0, 100), c(0, Repeats), type = "n", xlab = "boot CI", ylab = "repeats index")
abline(v = truth, col = 2)
for(r in 1:Repeats){
  set.seed(r)
  X <- rpois(n, lambda)
  TB <- numeric(B)
  seB <- numeric(B)
  tB <- numeric(B)
  lambdaHat <- mean(X)
  That <- lambdaHat^2 + lambdaHat/n
  for(b in 1:B){
    set.seed(b)
    aX <- sample(X, n, replace = TRUE)
    TB[b] <- mean(aX)^2
    seB[b] <- sqrt(4 * lambdaHat^3 / n)  # delta-method se
    tB[b] <- (TB[b] - That) / seB[b]
  }
  se_boot <- sd(TB)
  CI_l <- That - quantile(tB, 0.975) * se_boot
  CI_h <- That - quantile(tB, 0.025) * se_boot
  segments(CI_l, r, CI_h, r)
  if(CI_l < truth & truth < CI_h){
    counts <- counts + 1
  }
}
counts/Repeats                 # empirical coverage
counts/Repeats
## [1] 0.91
We can define the \(1 - \alpha\) confidence interval using \[[\hat{\theta}_n - q_{1 - \alpha/2}, \ \hat{\theta}_n - q_{\alpha/2}],\] where \(\hat{\theta}_n\) is the estimator from the original sample and \(q_\beta\) is the \(\beta\) quantile of the centered bootstrap statistics \(\hat{\theta}_n^{*(b)} - \hat{\theta}_n\). No \(\hat{\sigma}_B\) is required.
lambda <- 5
n <- 100
truth <- lambda^2 + lambda/n
B <- 1000
Repeats <- 100
counts <- 0
plot(c(0, 100), c(0, Repeats), type = "n", xlab = "boot CI", ylab = "repeats index")
abline(v = truth, col = 2)
for(r in 1:Repeats){
  set.seed(r)
  X <- rpois(n, lambda)
  TB <- numeric(B)
  tB <- numeric(B)
  lambdaHat <- mean(X)
  That <- lambdaHat^2 + lambdaHat/n
  for(b in 1:B){
    set.seed(b)
    aX <- sample(X, n, replace = TRUE)
    TB[b] <- mean(aX)^2
    tB[b] <- TB[b] - That      # centered (pivot) statistic, no se needed
  }
  CI_l <- That - quantile(tB, 0.975)
  CI_h <- That - quantile(tB, 0.025)
  segments(CI_l, r, CI_h, r)
  if(CI_l < truth & truth < CI_h){
    counts <- counts + 1
  }
}
counts/Repeats
counts/Repeats
## [1] 0.91
\[[\hat{\theta}_n - Z_{1 - \alpha/2}\hat{\sigma}_B, \ \hat{\theta}_n + Z_{1 - \alpha/2}\hat{\sigma}_B],\] where \(Z_{1 - \alpha/2} = \Phi^{-1}(1-\alpha/2)\) and \(\Phi\) is the CDF of the standard normal distribution, \(\hat{\theta}_n\) is the estimator from the original sample, and \(\hat{\sigma}_B\) is the bootstrap se.
lambda <- 5
n <- 100
truth <- lambda^2 + lambda/n
B <- 1000
Repeats <- 100
counts <- 0
plot(c(0, 100), c(0, Repeats), type = "n", xlab = "boot CI", ylab = "repeats index")
abline(v = truth, col = 2)
for(r in 1:Repeats){
  set.seed(r)
  X <- rpois(n, lambda)
  lambdaHat <- mean(X)
  That <- lambdaHat^2 + lambdaHat/n
  TB <- numeric(B)
  for(b in 1:B){
    set.seed(b)
    aX <- sample(X, n, replace = TRUE)
    TB[b] <- mean(aX)^2
  }
  ci_l <- That - 1.96 * sd(TB)   # normal approximation with bootstrap se
  ci_u <- That + 1.96 * sd(TB)
  segments(ci_l, r, ci_u, r)
  if(ci_l < truth & truth < ci_u){
    counts <- counts + 1
  }
}
counts/Repeats
## [1] 0.93
Procedure | Theoretical guarantee | Fast | In R package boot? |
---|---|---|---|
Percentiles | No | Yes | Yes |
Pivotal intervals | Yes | No | Yes |
Pivotal intervals (simplified, no se) | Yes | Yes | No |
Normal approximation | Yes | Yes | Yes |
Problem setting: we observe values \(Z_1, \ldots, Z_N\) with group labels \(L_1, \ldots, L_N \in \{1, 2\}\) (\(n\) samples in group 1 and \(m\) in group 2, so \(N = n + m\)). The test statistic is the absolute difference of the two group means: \[T = \left| \frac {\sum_{i = 1}^N Z_i I(L_i=1)} {\sum_{i = 1}^N I(L_i=1)} - \frac {\sum_{i = 1}^N Z_i I(L_i=2)} {\sum_{i = 1}^N I(L_i=2)} \right|.\] Therefore the test statistic \(T = g(L,Z)\) is a function of \(L\) and \(Z\).
Under \(H_0\), all labelings are equally likely, so the permutation p-value is \[p = \frac{1}{N!}\sum_{\pi} I\big(g(L_\pi, Z) \ge T\big),\] where \(L_\pi\) is a permutation of the labels and the sum is over all \(N!\) permutations.
Sometimes summing over all possible permutations is infeasible, but it suffices to use a random sample of permutations.
Permutation test procedure:
1. Compute the observed statistic \(T = g(L, Z)\).
2. For \(b = 1, \ldots, B\): randomly permute the labels and compute \(T^{(b)} = g(L_{\pi_b}, Z)\).
3. Report the p-value \(\frac{1}{B}\sum_{b=1}^B I(T^{(b)} \ge T)\).
If we observe \(X_1, \ldots, X_n\) and \(Y_1, \ldots, Y_m\), how can we test whether the means of \(X\) and \(Y\) differ?
n <- 100
m <- 100
set.seed(32611)
x <- rnorm(n, 0, 1)
y <- rnorm(m, 0, 2)   # same mean, different variance: H0 on the means is true
adataFrame <- data.frame(data = c(x, y), label = gl(2, n))
Tobs <- with(adataFrame, mean(data[label == 1]) - mean(data[label == 2]))
B <- 1000
TB <- numeric(B)
for(b in 1:B){
  set.seed(b)
  bdataFrame <- adataFrame
  bdataFrame$label <- sample(bdataFrame$label)   # permute the group labels
  TB[b] <- with(bdataFrame, mean(data[label == 1]) - mean(data[label == 2]))
}
sum(TB >= Tobs)/B    # one-sided permutation p-value
## [1] 0.383
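The statistic \(T\) defined above uses an absolute value; a two-sided version of the same test (a sketch reusing adataFrame and B from the chunk above):

Tobs_abs <- with(adataFrame, abs(mean(data[label == 1]) - mean(data[label == 2])))
TB_abs <- numeric(B)
for(b in 1:B){
  set.seed(b)
  permLabel <- sample(adataFrame$label)   # permute the group labels
  TB_abs[b] <- abs(mean(adataFrame$data[permLabel == 1]) -
                   mean(adataFrame$data[permLabel == 2]))
}
mean(TB_abs >= Tobs_abs)   # two-sided permutation p-value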
n <- 100
m <- 100
R <- 300     # number of simulation repeats
B <- 300     # number of permutations per repeat
alpha <- 0.05
counts <- 0
for(r in 1:R){
  set.seed(r)
  x <- rnorm(n, 0, 1)
  y <- rnorm(m, 0, 2)   # H0 on the means is true
  adataFrame <- data.frame(data = c(x, y), label = gl(2, n))
  Tobs <- with(adataFrame, mean(data[label == 1]) - mean(data[label == 2]))
  TB <- numeric(B)
  for(b in 1:B){
    set.seed(b * r)
    bdataFrame <- adataFrame
    bdataFrame$label <- sample(bdataFrame$label)
    TB[b] <- with(bdataFrame, mean(data[label == 1]) - mean(data[label == 2]))
  }
  if(sum(TB >= Tobs)/B > alpha){
    counts <- counts + 1   # H0 not rejected
  }
}
counts/R    # fraction of repeats not rejected; should be close to 1 - alpha
## [1] 0.9366667
Consider \(H_0: \mu_X = \mu_Y\) versus \(H_A: \mu_X > \mu_Y\)
n <- 100
m <- 100
R <- 300
B <- 300
alpha <- 0.05
counts_ttest <- 0
counts_perm <- 0
for(r in 1:R){
  set.seed(r)
  x <- rnorm(n, 1, 3)   # H0 is false: the means differ by 1
  y <- rnorm(m, 0, 3)
  if(t.test(x, y, alternative = "greater")$p.value < alpha){
    counts_ttest <- counts_ttest + 1
  }
  adataFrame <- data.frame(data = c(x, y), label = gl(2, n))
  Tobs <- with(adataFrame, mean(data[label == 1]) - mean(data[label == 2]))
  TB <- numeric(B)
  for(b in 1:B){
    set.seed(b * r)
    bdataFrame <- adataFrame
    bdataFrame$label <- sample(bdataFrame$label)
    TB[b] <- with(bdataFrame, mean(data[label == 1]) - mean(data[label == 2]))
  }
  if(sum(TB >= Tobs)/B < alpha){
    counts_perm <- counts_perm + 1   # permutation test rejects
  }
}
counts_ttest/R   # power of the one-sided t-test
## [1] 0.7933333
counts_perm/R    # power of the permutation test
## [1] 0.78
knitr::purl("bootstrap.rmd", output = "bootstrap.R", documentation = 2)
## processing file: bootstrap.rmd
## output file: bootstrap.R
## [1] "bootstrap.R "