Zhiguang Huo (Caleb)
Monday November 16, 2020
International Prize in Statistics Awarded to Bradley Efron for his contribution to bootstrapping (announced 11/12/2018)
\[\hat{Var}(\hat{\mu}) =\frac{1}{n} \hat{\sigma}^2 = \frac{1}{n(n-1)} \sum_{i=1}^n (X_i - \bar{X})^2\]
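As a quick illustration (a minimal sketch; the sample below is simulated only for demonstration), the plug-in estimate can be computed directly:
## minimal sketch: plug-in variance estimate of the sample mean (simulated data, for illustration only)
set.seed(32611)
X <- rnorm(100, 1, 1)
n <- length(X)
sum((X - mean(X))^2) / (n * (n - 1)) ## equals var(X) / n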
Monte Carlo method
mu <- 1
sigma <- 1
n <- 100
B <- 1000
Ts <- numeric(B)
for(b in 1:B){
set.seed(b)
ax <- rnorm(n, mu, sigma)
Ts[b] <- median(ax)
}
varTest <- var(Ts)
print(varTest)
## [1] 0.014573
library(ggplot2)
n <- 1000
df <- data.frame(x = c(rnorm(n, 0, 1)))
base <- ggplot(df, aes(x)) + stat_ecdf()
base + stat_function(fun = pnorm, colour = "red") + xlim(c(-3,3))
## Warning: Removed 2 rows containing non-finite values (stat_ecdf).
Empirical distribution is close to the underlying distribution
Glivenko-Cantelli Theorem \[\sup_x | F_n(x) - F(x)| \rightarrow 0\] almost surely as \(n \rightarrow \infty\)
Dvoretzky-Kiefer-Wolfowitz inequality, for any \(\varepsilon > 0\) \[P(\sup_x | F_n(x) - F(x)| > \varepsilon) \le 2\exp(-2n\varepsilon^2)\]
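As a quick empirical check (a minimal sketch with simulated N(0, 1) data), the sup-distance shrinks as \(n\) grows, in line with the Glivenko-Cantelli theorem:
## minimal sketch: sup_x |F_n(x) - F(x)| for increasing n (simulated N(0,1) data)
set.seed(32611)
for(n in c(100, 1000, 10000)){
xs <- sort(rnorm(n))
## the sup is attained at the jump points of F_n, so check both one-sided limits there
D <- max(pmax(abs((1:n)/n - pnorm(xs)), abs((0:(n-1))/n - pnorm(xs))))
cat("n =", n, " sup distance =", round(D, 4), "\n")
}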
Instead of drawing samples from the underlying distribution \(F \sim N(\mu, \sigma^2)\), we draw from the empirical distribution \(F_n\)
mu <- 1
sigma <- 1
n <- 100
set.seed(32611)
X <- rnorm(n, mu, sigma)
B <- 1000
Ts <- numeric(B)
for(b in 1:B){
set.seed(b)
ax <- sample(X, replace = T)
Ts[b] <- median(ax)
}
varTest <- var(Ts)
print(varTest)
## [1] 0.02598333
Since, in general, we do not know the distribution \(F\), we calculate using the empirical CDF \(F_n\) instead.
To summarize:
mu <- 1
sigma <- 1
n <- 100
set.seed(32611)
X <- rnorm(n, mu, sigma)
muhat <- mean(X)
B <- 1000
Ts <- numeric(B)
for(b in 1:B){
set.seed(b)
ax <- rnorm(n,muhat,sigma)
Ts[b] <- median(ax)
}
varTest <- var(Ts)
print(varTest)
## [1] 0.014573
What is the variance of \(T_n\)?
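For a Poisson(\(\lambda\)) sample with \(T_n = \bar{X}^2\) (the setting of the code below), the delta method with \(g(\lambda) = \lambda^2\) gives \[Var(T_n) \approx [g'(\lambda)]^2 Var(\bar{X}) = (2\lambda)^2 \frac{\lambda}{n} = \frac{4\lambda^3}{n},\] and plugging in \(\hat{\lambda} = \bar{X}\) yields the estimate \(4\bar{X}^3/n\) used in the first code block below.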
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
var_hat1 <- 4*mean(X)^3/n
print(var_hat1)
## [1] 4.97006
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
B <- 1000
TB <- numeric(B)
for(b in 1:B){
set.seed(b)
aX <- sample(X,n,replace = T)
TB[b] <- (mean(aX))^2
}
var_hat2 <- var(TB)
print(var_hat2)
## [1] 4.44549
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
lambdaHat <- mean(X)
B <- 1000
TB <- numeric(B)
for(b in 1:B){
set.seed(b)
aX <- rpois(n, lambdaHat)
TB[b] <- (mean(aX))^2
}
var_hat3 <- var(TB)
print(var_hat3)
## [1] 5.261807
library(boot)
myMean <- function(data, indices){
d <- data[indices]
mean(d)^2
}
## non parametric bootstrap
set.seed(32611)
boot_nonpara <- boot(data=X, statistic = myMean, R = B)
var(boot_nonpara$t)
## [,1]
## [1,] 4.495394
## parametric bootstrap
genPois <- function(data, lambda){
rpois(length(data), lambda)
}
boot_para <- boot(data=X, statistic = myMean, R = B, sim="parametric", ran.gen = genPois, mle = mean(X))
var(boot_para$t)
## [,1]
## [1,] 5.199379
lambda <- 5
n <- 100
B <- 1000
Ts <- numeric(B)
for(b in 1:B){
set.seed(b)
aX <- rpois(n, lambda)
Ts[b] <- (mean(aX))^2
}
print(var(Ts))
## [1] 5.28793
Assume \(\sigma\) is known and \(\mu\) is the only unknown parameter.
\(\hat{V}_{F_n} (T_n) = \frac{1}{B-1} \sum_{b=1}^B \{T^{(b)}_n - \bar{T}_n \}^2\)
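Note that R's var() already uses the \(1/(B-1)\) denominator, so with Ts denoting the vector of bootstrap statistics (as in the code above), the two forms agree:
## the bootstrap variance estimator written out, and via var(); both divide by (B - 1)
sum((Ts - mean(Ts))^2) / (length(Ts) - 1)
var(Ts)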
Method | Need simulation? | Need underlying parameter |
---|---|---|
Delta method | N | N |
Non-parametric bootstrapping | Y | N |
Parametric bootstrapping | Y | N |
Simulation | Y | Y |
The rest of the procedure is the same for these two methods.
We can define a 95% confidence interval (if B = 10,000) as \[[T_n^{(250)}, T_n^{(9750)}],\] where \(T_n^{(1)} \le \ldots \le T_n^{(B)}\) are the ordered bootstrap statistics.
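Equivalently (a minimal sketch; TB denotes the vector of bootstrap statistics, as in the code below), the interval can be read off the sorted bootstrap statistics:
## minimal sketch: percentile CI from the sorted bootstrap statistics
TB_sorted <- sort(TB)
c(TB_sorted[ceiling(0.025 * length(TB))], TB_sorted[floor(0.975 * length(TB))]) ## close to quantile(TB, c(0.025, 0.975))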
lambda <- 5
n <- 100
set.seed(32611)
X <- rpois(n, lambda)
B <- 1000
TB <- numeric(B)
for(b in 1:B){
set.seed(b)
aX <- sample(X,n,replace = T)
TB[b] <- (mean(aX))^2
}
quantile(TB, c(0.025, 0.975))
## 2.5% 97.5%
## 20.97411 29.16000
\[E(\bar{X}^2) = E(\bar{X})^2 + var(\bar{X}) = \lambda^2 + \lambda/n\]
lambda <- 5
n <- 100
truth <- lambda^2 + lambda/n
B <- 1000
Repeats <- 100
counts <- 0
plot(c(0,100),c(0,Repeats), type="n", xlab="boot CI", ylab="repeats index")
abline(v = truth, col=2)
for(r in 1:Repeats){
set.seed(r)
X <- rpois(n, lambda)
TB <- numeric(B)
for(b in 1:B){
set.seed(b)
aX <- sample(X,n,replace = T)
TB[b] <- (mean(aX))^2
}
segments(quantile(TB, c(0.025)), r, quantile(TB, c(0.975)), r)
if(quantile(TB, c(0.025)) < truth & truth < quantile(TB, c(0.975))){
counts <- counts + 1
}
}
counts/Repeats ## empirical coverage of the 95% bootstrap percentile CI
## [1] 0.93
library(boot)
myMean <- function(data, indices){
d <- data[indices] ## in this example, data is a vector
mean(d)^2
}
boot_nonpara <- boot(data=X, statistic = myMean, R = B)
boot.ci(boot_nonpara, type="perc")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_nonpara, type = "perc")
##
## Intervals :
## Level Percentile
## 95% (22.56, 30.80 )
## Calculations and Intervals on Original Scale
\[[\hat{T}_n - Z_{1 - \alpha/2}\hat{\sigma}_B, \hat{T}_n + Z_{1 - \alpha/2}\hat{\sigma}_B],\] where \(Z_{1-\alpha/2} = \Phi^{-1}(1-\alpha/2)\) and \(\Phi\) is the CDF of the standard Normal distribution.
Here \(\hat{T}_n\) is the estimator from the original sample and \(\hat{\sigma}_B\) is the bootstrap standard error.
lambda <- 5
n <- 100
B <- 1000
set.seed(32611)
X <- rpois(n, lambda)
lambdaHat <- mean(X)
That <- lambdaHat^2
TB <- numeric(B)
for(b in 1:B){
set.seed(b)
aX <- sample(X,n,replace = T)
TB[b] <- (mean(aX))^2
}
ci_l <- That - 1.96*sd(TB)
ci_u <- That + 1.96*sd(TB)
c(ci_l, ci_u)
## [1] 20.76757 29.03263
library(boot)
myMean <- function(data, indices){
d <- data[indices] ## in this example, data is a vector
mean(d)^2
}
boot_nonpara <- boot(data=X, statistic = myMean, R = B)
boot.ci(boot_nonpara, type="norm")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_nonpara, type = "norm")
##
## Intervals :
## Level Normal
## 95% (20.91, 28.95 )
## Calculations and Intervals on Original Scale
lambda <- 5
n <- 100
truth <- lambda^2
B <- 1000
Repeats <- 100
counts <- 0
plot(c(0,100),c(0,Repeats), type="n", xlab="boot CI", ylab="repeats index")
abline(v = truth, col=2)
for(r in 1:Repeats){
set.seed(r)
X <- rpois(n, lambda)
lambdaHat <- mean(X)
That <- lambdaHat^2
TB <- numeric(B)
for(b in 1:B){
set.seed(b)
aX <- sample(X,n,replace = T)
TB[b] <- (mean(aX))^2
}
ci_l <- That - 1.96*sd(TB)
ci_u <- That + 1.96*sd(TB)
segments(ci_l, r, ci_u, r)
if(ci_l < truth & truth < ci_u){
counts <- counts + 1
}
}
counts/Repeats ## empirical coverage of the 95% normal-approximation CI
## [1] 0.93
Procedure | Theoretical guarantee | Fast | In R package boot? |
---|---|---|---|
Percentiles | No | Yes | Yes |
Pivotal intervals | Yes | No | Yes |
Pivotal intervals (simplified, no se) | Yes | Yes | No |
Normal approximation | Yes | Yes | Yes |
##
## Call:
## lm(formula = lpsa ~ ., data = prostate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.76644 -0.35510 -0.00328 0.38087 1.55770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.181561 1.320568 0.137 0.89096
## lcavol 0.564341 0.087833 6.425 6.55e-09 ***
## lweight 0.622020 0.200897 3.096 0.00263 **
## age -0.021248 0.011084 -1.917 0.05848 .
## lbph 0.096713 0.057913 1.670 0.09848 .
## svi 0.761673 0.241176 3.158 0.00218 **
## lcp -0.106051 0.089868 -1.180 0.24115
## gleason 0.049228 0.155341 0.317 0.75207
## pgg45 0.004458 0.004365 1.021 0.31000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6995 on 88 degrees of freedom
## Multiple R-squared: 0.6634, Adjusted R-squared: 0.6328
## F-statistic: 21.68 on 8 and 88 DF, p-value: < 2.2e-16
alm <- lm(lpsa ~ ., data = prostate) ## model fit on the original data (used for the point estimate below)
B <- 1000
coef_lcavol <- rep(NA,B)
for(b in 1:B){
set.seed(b)
bindex <- sample(nrow(prostate), replace = TRUE)
bprostate <- prostate[bindex,]
blm <- lm(lpsa ~ ., data = bprostate)
coef_lcavol[b] <- coef(blm)["lcavol"]
}
se_lcavol <- sd(coef_lcavol)
Z_lcavol <- coef(alm)["lcavol"] / se_lcavol
2 * pnorm(abs(Z_lcavol), lower.tail = FALSE) ## two-sided bootstrap p-value
## lcavol
## 6.96673e-13
library(lars)
## Loaded lars 1.2
x <- as.matrix(prostate[,1:8])
y <- prostate[,9]
lassoFit <- lars(x, y, normalize = FALSE) ## lar for least angle regression
coef(lassoFit, s=10, mode="lambda") ## get beta estimates when lambda = 10
## lcavol lweight age lbph svi lcp
## 0.576840683 0.023404575 -0.005103704 0.076617068 0.000000000 0.000000000
## gleason pgg45
## 0.000000000 0.006724607
B <- 1000
coef_lcavol <- rep(NA,B)
for(b in 1:B){
set.seed(b)
bindex <- sample(nrow(prostate), replace = TRUE)
bprostate <- prostate[bindex,]
bx <- as.matrix(bprostate[,1:8])
by <- bprostate[,9]
blassoFit <- lars(bx, by, normalize = FALSE) ## lar for least angle regression
bcoef <- coef(blassoFit, s=10, mode="lambda") ## get beta estimates when lambda = 10
coef_lcavol[b] <- bcoef["lcavol"]
}
se_lcavol <- sd(coef_lcavol)
Z_lcavol <- coef(lassoFit, s=10, mode="lambda")["lcavol"] / se_lcavol
2 * pnorm(abs(Z_lcavol), lower.tail = FALSE) ## two-sided bootstrap p-value
## lcavol
## 2.213797e-11
Problem setting:
\[T = | \frac {\sum_{i = 1}^NZ_i I(L_i=1)} {\sum_{i = 1}^N I(L_i=1)} - \frac {\sum_{i = 1}^NZ_i I(L_i=2)} {\sum_{i = 1}^N I(L_i=2)} |,\] where \(N = n + m\). Therefore the test statistic \(T = g(L,Z)\) is a function of \(L\) and \(Z\).
The permutation p-value is \[p = \frac{1}{N!} \sum_{\pi} I\big(g(L_\pi, Z) \ge g(L, Z)\big),\] where \(L_\pi\) is a permutation of the labels and the sum is over all \(N!\) permutations.
Sometimes summing over all possible permutations is infeasible, but it suffices to use a random sample of permutations.
Permutation test procedure:
1. Compute the observed test statistic \(T = g(L, Z)\).
2. Permute the labels \(L\) (or draw B random permutations) and recompute the statistic for each permutation, obtaining \(T^{(1)}, \ldots, T^{(B)}\).
3. The p-value is the proportion of permuted statistics at least as extreme as the observed one: \(\frac{1}{B}\sum_{b=1}^B I(T^{(b)} \ge T)\).
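As a direct translation of the statistic \(g(L, Z)\) defined above (a minimal sketch; permStat is an illustrative name, not part of the lecture code):
## minimal sketch: the two-sample permutation statistic g(L, Z)
## the worked examples below use the same idea without the absolute value (one-sided test)
permStat <- function(L, Z){
abs(mean(Z[L == 1]) - mean(Z[L == 2]))
}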
If we observe \(X_1, \ldots, X_n\) and \(Y_1, \ldots, Y_m\), how can we test whether the mean of \(X\) and the mean of \(Y\) differ? Consider the following one-sided test:
n <- 100
m <- 100
set.seed(32611)
x <- rnorm(n,0,1)
y <- rnorm(m,0,2)
adataFrame <- data.frame(data=c(x,y),label=gl(2,n))
T <- with(adataFrame, mean(data[label==2] - data[label==1]))
B <- 1000
TB <- numeric(B)
for(b in 1:B){
set.seed(b)
bdataFrame <- adataFrame
bdataFrame$label <- sample(bdataFrame$label)
TB[b] <- with(bdataFrame, mean(data[label==2] - data[label==1]))
}
pvalue = mean(TB >= T)
pvalue
## [1] 0.647
If we observe \(X_1, \ldots, X_n\) and \(Y_1, \ldots, Y_m\), how can we test whether the mean of \(X\) and the mean of \(Y\) differ?
n <- 100
m <- 100
set.seed(32611)
x <- rnorm(n,0,1)
y <- rnorm(m,0.5,2)
adataFrame <- data.frame(data=c(x,y),label=gl(2,n))
T <- with(adataFrame, mean(data[label==2] - data[label==1]))
B <- 1000
TB <- numeric(B)
for(b in 1:B){
set.seed(b)
bdataFrame <- adataFrame
bdataFrame$label <- sample(bdataFrame$label)
TB[b] <- with(bdataFrame, mean(data[label==2] - data[label==1]))
}
mean(TB >= T)
## [1] 0.031