Zhiguang Huo (Caleb)
Wednesday November 14, 2018
International Prize in Statistics awarded to Bradley Efron for his contributions to the bootstrap (announced 11/12/2018)
\widehat{\operatorname{Var}}(\hat{\mu}) = \frac{1}{n}\hat{\sigma}^2 = \frac{1}{n(n-1)}\sum_{i=1}^{n}(X_i - \bar{X})^2
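For the sample mean, this variance estimate has a closed form and can be computed directly from a single sample. A minimal sketch (settings chosen to match the simulations below):

mu <- 1
sigma <- 1
n <- 100
set.seed(32611)
X <- rnorm(n, mu, sigma)
# plug-in estimate of Var(mu_hat) = sigma_hat^2 / n
var_muhat <- sum((X - mean(X))^2) / (n * (n - 1))
print(var_muhat)  # should be close to sigma^2 / n = 0.01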
Monte Carlo method
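The sample median has no such simple closed-form variance, but if we can draw from the true distribution N(\mu, \sigma^2), we can estimate its variance by simulation: generate B independent datasets, compute the median of each, and take the sample variance of the B medians.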
mu <- 1
sigma <- 1
n <- 100
B <- 1000
Ts <- numeric(B)
for(b in 1:B){
  set.seed(b)
  ax <- rnorm(n, mu, sigma)
  Ts[b] <- median(ax)
}
varTest <- var(Ts)
print(varTest)
## [1] 0.014573
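As a check (not on the original slide): asymptotic theory gives \operatorname{Var}(\mathrm{median}) \approx \pi\sigma^2/(2n) for normal data, since the asymptotic variance of the sample median is 1/(4nf(m)^2) with f the density at the true median m.

pi * sigma^2 / (2 * n)
## [1] 0.01570796

The Monte Carlo estimate above (0.014573) is close to this value.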
library(ggplot2)
n <- 1000
df <- data.frame(x = c(rnorm(n, 0, 1)))
base <- ggplot(df, aes(x)) + stat_ecdf()
base + stat_function(fun = pnorm, colour = "red") + xlim(c(-3,3))
## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
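The warning is expected: xlim(c(-3, 3)) drops the few simulated points that fall outside [-3, 3] before plotting.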
Glivenko-Cantelli Theorem: \sup_x |F_n(x) - F(x)| \to 0 almost surely as n \to \infty
Dvoretzky-Kiefer-Wolfowitz inequality: for any \varepsilon > 0, P(\sup_x | F_n(x) - F(x)| > \varepsilon) \le 2\exp(-2n\varepsilon^2)
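A minimal simulation sketch of the DKW bound (not from the original slides; n, eps, and B are illustrative choices):

# estimate P(sup_x |F_n(x) - F(x)| > eps) for N(0,1) samples
n <- 100
eps <- 0.1
B <- 1000
set.seed(32611)
exceed <- replicate(B, {
  x <- sort(rnorm(n))
  # sup_x |F_n(x) - F(x)| is attained at a jump of F_n:
  # check both sides of each jump
  Dn <- max(abs(seq_len(n)/n - pnorm(x)), abs((seq_len(n) - 1)/n - pnorm(x)))
  Dn > eps
})
mean(exceed)             # empirical exceedance probability
2 * exp(-2 * n * eps^2)  # DKW upper bound, about 0.271

The empirical probability should fall below the bound.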
Instead of drawing samples from the underlying distribution F = N(\mu, \sigma^2), we draw from the empirical distribution F_n:
mu <- 1
sigma <- 1
n <- 100
set.seed(32611)
X <- rnorm(n, mu, sigma)
B <- 1000
Ts <- numeric(B)
for(b in 1:B){
  set.seed(b)
  ax <- sample(X, replace = T)
  Ts[b] <- median(ax)
}
varTest <- var(Ts)
print(varTest)
## [1] 0.02916533
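For this particular sample the bootstrap estimate (0.029) is noticeably larger than the Monte Carlo value obtained from the true distribution (0.0146); the bootstrap sees only the single observed sample X, so its estimate inherits that sample's noise.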
Since in general we don't know the distribution F, we calculate using the empirical CDF F_n instead.
Parametric alternative: estimate \hat{\mu} from the observed sample and draw bootstrap samples from the fitted distribution N(\hat{\mu}, \sigma^2) rather than resampling from F_n:
mu <- 1
sigma <- 1
n <- 100
set.seed(32611)
X <- rnorm(n, mu, sigma)
muhat <- mean(X)
B <- 1000
Ts <- numeric(B)
for(b in 1:B){
  set.seed(b)
  ax <- rnorm(n, muhat, sigma)
  Ts[b] <- median(ax)
}
varTest <- var(Ts)
print(varTest)
## [1] 0.014573
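Next example: X_1, \ldots, X_n \sim \mathrm{Poisson}(\lambda) and the statistic is T = \bar{X}^2. A reasoning step that is implicit in the code below: by the delta method, \operatorname{Var}(\bar{X}^2) \approx (2\lambda)^2 \operatorname{Var}(\bar{X}) = 4\lambda^3/n, estimated by plugging in \bar{X} for \lambda.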
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
var_hat1 <- 4*mean(X)^3/n  # delta-method plug-in estimate of Var(Xbar^2)
print(var_hat1)
## [1] 4.97006
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
B <- 1000
TB <- numeric(B)
for(b in 1:B){
  set.seed(b)
  aX <- sample(X, n, replace = T)
  TB[b] <- (mean(aX))^2
}
var_hat2 <- var(TB)
print(var_hat2)
## [1] 4.389935
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
lambdaHat <- mean(X)
B <- 1000
TB <- numeric(B)
for(b in 1:B){
  set.seed(b)
  aX <- rpois(n, lambdaHat)
  TB[b] <- (mean(aX))^2
}
var_hat3 <- var(TB)
print(var_hat3)
## [1] 5.261807
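All three approaches target the same quantity; with \lambda = 5 and n = 100 the delta-method value at the true parameter is 4\lambda^3/n = 5, and the three estimates (4.97, 4.39, 5.26) scatter around it. The boot package reproduces both bootstrap versions: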
library(boot)
myMean <- function(data, indices){
  d <- data[indices]  # indices: resample indices supplied by boot()
  mean(d)^2           # the statistic: squared sample mean
}
## non-parametric bootstrap
set.seed(32611)
boot_nonpara <- boot(data=X, statistic = myMean, R = B)
var(boot_nonpara$t)
## [,1]
## [1,] 4.311527
## parametric bootstrap
genPois <- function(data, lambda){
  rpois(length(data), lambda)  # generate one parametric bootstrap sample
}
boot_para <- boot(data=X, statistic = myMean, R = B, sim="parametric", ran.gen = genPois, mle = mean(X))
var(boot_para$t)
## [,1]
## [1,] 5.147477
We can define the 95% confidence interval using the quantiles of the bootstrap replicates: if B = 10,000, take [T_n^{(250)}, T_n^{(9750)}], where T_n^{(b)} denotes the b-th smallest of the B bootstrap statistics.
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
B <- 1000
TB <- numeric(B)
for(b in 1:B){
  set.seed(b)
  aX <- sample(X, n, replace = T)
  TB[b] <- (mean(aX))^2
}
quantile(TB, c(0.025, 0.975))
## 2.5% 97.5%
## 21.0681 29.2681
The truth can be computed analytically: E(\bar{X}^2) = (E\bar{X})^2 + \operatorname{Var}(\bar{X}) = \lambda^2 + \lambda/n, since \operatorname{Var}(\bar{X}) = \lambda/n for Poisson data.
lambda <- 5
n <- 100
truth <- lambda^2 + lambda/n
B <- 1000
Repeats <- 100
counts <- 0
plot(c(0,100),c(0,Repeats), type="n", xlab="boot CI", ylab="repeats index")
abline(v = truth, col=2)
for(r in 1:Repeats){
  set.seed(r)
  X <- rpois(n, lambda)
  TB <- numeric(B)
  for(b in 1:B){
    set.seed(b)
    aX <- sample(X, n, replace = T)
    TB[b] <- (mean(aX))^2
  }
  segments(quantile(TB, c(0.025)), r, quantile(TB, c(0.975)), r)
  if(quantile(TB, c(0.025)) < truth & truth < quantile(TB, c(0.975))){
    counts <- counts + 1
  }
}
counts/Repeats  # proportion of intervals covering the truth
## [1] 0.93
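The empirical coverage (0.93) is close to the nominal 95% level.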
library(boot)
myMean <- function(data, indices){
  d <- data[indices]
  mean(d)^2
}
boot_nonpara <- boot(data=X, statistic = myMean, R = B)
boot.ci(boot_nonpara, type="perc")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_nonpara, type = "perc")
##
## Intervals :
## Level Percentile
## 95% (22.85, 31.02 )
## Calculations and Intervals on Original Scale
[\hat{T}_n - Z_{1-\alpha/2}\hat{\sigma}_B, \hat{T}_n + Z_{1-\alpha/2}\hat{\sigma}_B], where Z_{1-\alpha/2} = \Phi^{-1}(1 - \alpha/2) and \Phi is the CDF of the standard normal distribution; \hat{T}_n is the estimator from the original sample and \hat{\sigma}_B is the bootstrap standard error.
lambda <- 5
n <- 100
B <- 1000
set.seed(32611)
X <- rpois(n, lambda)
lambdaHat <- mean(X)
That <- lambdaHat^2
TB <- numeric(B)
for(b in 1:B){
  set.seed(b)
  aX <- sample(X, n, replace = T)
  TB[b] <- (mean(aX))^2
}
ci_l <- That - 1.96*sd(TB)
ci_u <- That + 1.96*sd(TB)
c(ci_l, ci_u)
## [1] 20.79347 29.00673
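For this sample the normal-approximation interval (20.79, 29.01) is close to the percentile interval (21.07, 29.27) obtained earlier.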
lambda <- 5
n <- 100
truth <- lambda^2
B <- 1000
Repeats <- 100
counts <- 0
plot(c(0,100),c(0,Repeats), type="n", xlab="boot CI", ylab="repeats index")
abline(v = truth, col=2)
for(r in 1:Repeats){
  set.seed(r)
  X <- rpois(n, lambda)
  lambdaHat <- mean(X)
  That <- lambdaHat^2
  TB <- numeric(B)
  for(b in 1:B){
    set.seed(b)
    aX <- sample(X, n, replace = T)
    TB[b] <- (mean(aX))^2
  }
  ci_l <- That - 1.96*sd(TB)
  ci_u <- That + 1.96*sd(TB)
  segments(ci_l, r, ci_u, r)
  if(ci_l < truth & truth < ci_u){
    counts <- counts + 1
  }
}
counts/Repeats  # proportion of intervals covering the truth
## [1] 0.93
Procedure | Theoretical guarantee | Fast | In R package boot? |
---|---|---|---|
Percentiles | No | Yes | Yes |
Pivotal intervals | Yes | No | Yes |
Pivotal intervals (simplified, no se) | Yes | Yes | No |
Normal approximation | Yes | Yes | Yes |
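As a cross-check of the table, boot.ci can compute several of these intervals in one call. A sketch reusing X, myMean, and B from the chunks above ("basic" is boot's name for the pivotal interval):

library(boot)
set.seed(32611)
boot_nonpara <- boot(data = X, statistic = myMean, R = B)
boot.ci(boot_nonpara, type = c("norm", "basic", "perc"))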