Introduction to Biostatistical Computing PHC 6937

Basic statistical inference in R

Zhiguang Huo (Caleb)

Wednesday Oct 12th, 2022

Outlines

Random samples from a given pool of numbers

# Draw 2 values (without replacement) from the pool 1..10.
# Each call produces a different random draw.
a <- 1:10
sample(x = a, size = 2)
## [1] 4 6
sample(x = a, size = 2)
## [1] 3 7
sample(x = a, size = 2)
## [1] 10  5

Are they really random? Can we generate the same random numbers (for reproducibility)?

# Setting the random seed makes the pseudo-random draw reproducible:
# the same seed always yields the same draw.
a <- 1:10
set.seed(32611) ## 32611 can be replaced by any number
sample(x = a, size = 2)
## [1] 1 7
set.seed(32611) ## if you keep the same random seed, you will end up with the exact same result
sample(x = a, size = 2)
## [1] 1 7
set.seed(32611) ## if you keep the same random seed, you will end up with the exact same result
sample(x = a, size = 2)
## [1] 1 7

Each time the seed is set, the same sequence follows

# After each set.seed() call, the SAME sequence of draws follows —
# three successive calls give three different draws, but the whole
# sequence repeats exactly after resetting the seed.
set.seed(32611)
sample(1:10,2); sample(1:10,2); sample(1:10,2)
## [1] 1 7
## [1] 3 7
## [1] 8 5
set.seed(32611)
sample(1:10,2); sample(1:10,2); sample(1:10,2)
## [1] 1 7
## [1] 3 7
## [1] 8 5

However, the R version may affect the random numbers generated

# Record the R version alongside the seed: RNG behavior can differ
# across R versions (e.g. the default sample() algorithm changed in
# R 3.6.0), so the same seed may give different draws on another version.
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_4.0.3  magrittr_2.0.1  fastmap_1.1.0   tools_4.0.3    
##  [5] htmltools_0.5.2 yaml_2.2.1      stringi_1.5.3   rmarkdown_2.6  
##  [9] knitr_1.30      stringr_1.4.0   xfun_0.19       digest_0.6.27  
## [13] rlang_0.4.12    evaluate_0.14
set.seed(32611)
sample(1:10,2)
## [1] 1 7

Want to sample from a given pool of numbers with replacement

# sample() without a size argument permutes the whole pool;
# replace = TRUE allows the same element to be drawn more than once.
# Use TRUE rather than the shorthand T (T is an ordinary variable
# that can be reassigned, so it is not safe in scripts).
sample(1:10) ## default is without replacement
##  [1]  3  7  9  5  8  1  2  4  6 10
sample(1:10, replace = TRUE) ## with replacement
##  [1] 3 8 9 7 2 6 5 5 7 3
sample(LETTERS[1:10], replace = TRUE) ## with replacement
##  [1] "G" "C" "A" "J" "G" "I" "I" "B" "D" "H"

Random number generation from normal distribution

For normal distribution:

Random variable simulators in R

Distribution R command
binomial rbinom
Poisson rpois
geometric rgeom
negative binomial rnbinom
uniform runif
exponential rexp
normal rnorm
gamma rgamma
beta rbeta
student t rt
F rf
chi-squared rchisq
Weibull rweibull
log normal rlnorm

Random variable density in R

Distribution R command
binomial dbinom
Poisson dpois
geometric dgeom
negative binomial dnbinom
uniform dunif
exponential dexp
normal dnorm
gamma dgamma
beta dbeta
student t dt
F df
chi-squared dchisq
Weibull dweibull
log normal dlnorm

Random variable distribution tail probability in R

Distribution R command
binomial pbinom
Poisson ppois
geometric pgeom
negative binomial pnbinom
uniform punif
exponential pexp
normal pnorm
gamma pgamma
beta pbeta
student t pt
F pf
chi-squared pchisq
Weibull pweibull
log normal plnorm

Random variable distribution quantile in R

Distribution R command
binomial qbinom
Poisson qpois
geometric qgeom
negative binomial qnbinom
uniform qunif
exponential qexp
normal qnorm
gamma qgamma
beta qbeta
student t qt
F qf
chi-squared qchisq
Weibull qweibull
log normal qlnorm

Normal distribution

\[f(x;\mu,\sigma) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{(x-\mu)^2}{2\sigma^2}}\]

# Overlay three normal densities to show the effect of the mean and sd.
aseq <- seq(-4,4,.01)
plot(aseq,dnorm(aseq, 0, 1),type='l', xlab='x', ylab='Density', lwd=2)  # N(0, 1)
lines(aseq,dnorm(aseq, 1, 1),col=2, lwd=2)  # N(1, 1): shifted mean
lines(aseq,dnorm(aseq,0, 2),col=3, lwd=2)   # N(0, 2): larger sd, flatter curve
legend("topleft",c(expression(paste(mu==0, ", " ,  sigma==1 ,sep=' ')), 
             expression(paste(mu==1, ", " ,  sigma==1 ,sep=' ')), 
             expression(paste(mu==0, ", " ,  sigma==2 ,sep=' '))), 
       col=1:3, lty=c(1,1,1), lwd=c(2,2,2), cex=1, bty='n')
mtext(side=3,line=.5,'Normal distributions',cex=1, font=2)

t distribution

# t densities have heavier tails than the normal; they approach the
# standard normal as the degrees of freedom increase.
aseq <- seq(-4,4,.01)
plot(aseq,dnorm(aseq),type='l', xlab='x', ylab='Density', lwd=2) ## same as dt(aseq, Inf)
lines(aseq,dt(aseq,10),col=2, lwd=2)
lines(aseq,dt(aseq,4),col=3, lwd=2)
lines(aseq,dt(aseq,2),col=4, lwd=2)
legend("topleft",c(expression(normal), expression(paste(df==10,sep=' ')), 
             expression(paste(df==4,sep=' ')), 
             expression(paste(df==2,sep=' '))), 
       col=1:4, lty=c(1,1,1), lwd=c(2,2,2), cex=1, bty='n')
mtext(side=3,line=.5,'t distributions',cex=1, font=2)

Chi-square distribution

# Chi-square densities for several degrees of freedom.
aseq <- seq(0,4,.01)
plot(aseq,dchisq(aseq, df = 1),type='l', xlab='x', ylab='Density', lwd=2, ylim = c(0,1))
lines(aseq,dchisq(aseq,df = 2),col=2, lwd=2)
lines(aseq,dchisq(aseq,df = 5),col=3, lwd=2)
lines(aseq,dchisq(aseq,df = 10),col=4, lwd=2)
## Use lowercase plotmath 'chi' so the legend shows the Greek letter chi;
## uppercase 'Chi' renders as the capital letter, which displays like "X".
legend("topright",c(expression({chi^2}[(1)]), expression({chi^2}[(2)]), 
             expression({chi^2}[(5)]), 
             expression({chi^2}[(10)])
             ), 
       col=1:4, lty=c(1,1,1), lwd=c(2,2,2), cex=1, bty='n')
mtext(side=3,line=.5,'Chi square distributions',cex=1, font=2)

Chi-square distribution

## verification via qq-plot:
## if Z ~ N(0,1), then Z^2 follows a chi-square distribution with df = 1,
## so sorted Z^2 values plotted against sorted chi-square draws
## should fall near the diagonal
set.seed(32608)
n <- 1000
x1 <- rnorm(n)
x2 <- rchisq(n,df = 1)
## the best practice is to use the theoretical distribution
## for simplicity, we can also use the empirical distribution

s1_sorted <- sort(x1^2)
s2_sorted <- sort(x2)

plot(s1_sorted, s2_sorted, xlim = c(0,5), ylim = c(0,5))

Poisson distribution

\[f(k;\lambda) = \frac{\lambda^k e^{-\lambda}}{k!},\] where \(k\) is a non-negative integer.

# Poisson pmf for lambda = 1, 2, 3, 4, drawn as bar plots on a 2x2 grid.
aseq <- seq(0,8,1)
par(mfrow=c(2,2))
for(i in 1:4){
  counts <- dpois(aseq,i)
  names(counts) <- aseq  # label the bars with the k values 0..8
  barplot(counts, xlab='x', ylab='Density', lwd=2, col=i, las = 2,  main=bquote(paste(lambda==.(i), sep=' ')), ylim=c(0, 0.4))
}

Beta distribution

\[f(x;\alpha,\beta) = \frac{\Gamma(\alpha + \beta)}{\Gamma(\alpha) \Gamma(\beta)} x^{\alpha - 1}(1 - x)^{\beta - 1}\]

# Beta densities for several (alpha, beta) pairs; the support is (0, 1),
# so the grid avoids the endpoints where some densities are unbounded.
aseq <- seq(.001,.999,.001)
plot(aseq,dbeta(aseq,.25,.25), type='l', ylim=c(0,6), ylab='Density', xlab='Proportion (p)',lwd=2)
lines(aseq, dbeta(aseq,2,2),lty=2,lwd=2)
lines(aseq, dbeta(aseq,2,5),lty=1,col=2,lwd=2)
lines(aseq, dbeta(aseq,12,2),lty=2,col=2,lwd=2)
lines(aseq, dbeta(aseq,20,.75),lty=1,col='green',lwd=2)
lines(aseq, dbeta(aseq,1,1),lty=2,lwd=2, col=4)  # Beta(1,1) is the uniform distribution
legend(.2,6,c(expression(paste(alpha==.25,', ', beta==.25)), expression(paste(alpha==2,', ',beta==2)), expression(paste(alpha==2,', ', beta==5)), expression(paste(alpha==12,', ',beta==2)), expression(paste(alpha==20,', ',beta==.75)), expression(paste(alpha==1,', ', beta==1))), lty=c(1,2,1,2,1,2), col=c(1,1,2,2,'green',4), cex=1,bty='n',lwd=rep(2,6))
mtext(side=3,line=.5,'Beta distributions',cex=1, font=2)

Beta distribution

samples from Normal distribution

# Compare the empirical density of 1000 normal draws with the
# theoretical N(0,1) density. Use TRUE/FALSE rather than T/F
# (T and F are ordinary variables and can be reassigned).
set.seed(32608)
z <- rnorm(1000)
hist(z,nclass=50, freq = FALSE)  # freq = FALSE: plot densities, not counts
lines(density(z), col=2, lwd=2)  # kernel density estimate (empirical)
curve(dnorm, from = -3, to = 4 ,col=4, lwd=2, add = TRUE)  # theoretical density
legend("topright", c("empirical", "theoretical"), col=c(2,4), lwd=c(2,2))

Check CDF

# Compare the empirical CDF of 1000 normal draws with the
# theoretical N(0,1) CDF.
set.seed(32608)
z <- rnorm(1000)
plot(ecdf(z), ylab="Distribution", main="Empirical distribution",
     lwd=2, col="red")
curve(pnorm, from = -3, to = 3, lwd=2, add = TRUE)  # TRUE, not the reassignable shorthand T
legend("topleft", legend=c("Empirical distribution", "Actual distribution"),
       lwd=2, col=c("red","black"))

p-value

p-value example

quantile: qnorm()

# qnorm() is the inverse CDF (quantile function): the 97.5th percentile
# of N(0,1) is 1.96 — the familiar two-sided cutoff at alpha = 0.05.
qnorm(p = 0.975, mean = 0, sd = 1, lower.tail = TRUE) 
## [1] 1.959964
qnorm(p = 0.025, mean = 0, sd = 1, lower.tail = TRUE) 
## [1] -1.959964

Commonly used statistical tests

Two sample t.test

# Two independent samples of size 10, means 0 and 1; t.test() defaults
# to the Welch test, which does not assume equal variances.
set.seed(32611)
a <- rnorm(10,0,1)
b <- rnorm(10,1,1)
t.test(a,b)
## 
##  Welch Two Sample t-test
## 
## data:  a and b
## t = -3.3236, df = 17.48, p-value = 0.0039
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.2295315 -0.5002657
## sample estimates:
##  mean of x  mean of y 
## -0.1733548  1.1915438

Two sample paired t.test

# Paired t-test on the same data: tests whether the mean within-pair
# difference is zero (equivalent to a one-sample t-test on a - b).
set.seed(32611)
a <- rnorm(10,0,1)
b <- rnorm(10,1,1)
t.test(a,b,paired=TRUE)
## 
##  Paired t-test
## 
## data:  a and b
## t = -2.5027, df = 9, p-value = 0.03371
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.5985937 -0.1312035
## sample estimates:
## mean of the differences 
##               -1.364899

One sample t.test

# One-sample t-test against the default null hypothesis mu = 0.
set.seed(32611)
a <- rnorm(10,0,1)
t.test(a)
## 
##  One Sample t-test
## 
## data:  a
## t = -0.65621, df = 9, p-value = 0.5281
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.7709621  0.4242526
## sample estimates:
##  mean of x 
## -0.1733548

Two sample Wilcoxon Rank Sum test

# Wilcoxon rank sum (Mann-Whitney) test: non-parametric alternative
# to the two-sample t-test, based on ranks rather than raw values.
set.seed(32611)
a <- rnorm(10,0,1)
b <- rnorm(10,1,1)
wilcox.test(a,b)
## 
##  Wilcoxon rank sum exact test
## 
## data:  a and b
## W = 13, p-value = 0.003886
## alternative hypothesis: true location shift is not equal to 0

Two sample paired Wilcoxon Signed Rank test

# With paired = TRUE, wilcox.test() performs the Wilcoxon signed rank
# test: the non-parametric alternative to the paired t-test.
set.seed(32611)
a <- rnorm(10,0,1)
b <- rnorm(10,1,1)
wilcox.test(a,b, paired = TRUE)
## 
##  Wilcoxon signed rank exact test
## 
## data:  a and b
## V = 5, p-value = 0.01953
## alternative hypothesis: true location shift is not equal to 0

Chi square test

# 2x2 contingency table: treatment (rows) vs myocardial infarction (columns).
# matrix() fills column-wise, so c(189, 104) is the "Yes" column and
# c(10845, 10933) is the "No" column.
X <- matrix(c(189, 104, 10845, 10933), nrow=2,
            dimnames=list(Treatment=c("Placebo","Aspirin"), 
                          "Myocardial Infarction"=c("Yes", "No")))

Chi square test

# Pearson chi-square test of independence on the 2x2 table; for 2x2
# tables chisq.test() applies Yates' continuity correction by default.
X
##          Myocardial Infarction
## Treatment Yes    No
##   Placebo 189 10845
##   Aspirin 104 10933
chisq.test(X)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  X
## X-squared = 24.429, df = 1, p-value = 7.71e-07

Fisher’s exact test

# Fisher's exact test on the same table: exact p-value (no large-sample
# approximation), plus an odds-ratio estimate with confidence interval.
X
##          Myocardial Infarction
## Treatment Yes    No
##   Placebo 189 10845
##   Aspirin 104 10933
fisher.test(X)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  X
## p-value = 5.033e-07
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.432396 2.353927
## sample estimates:
## odds ratio 
##   1.831993

correlation test

# Correlation tests on the same two samples:
# Pearson (default) assumes bivariate normality; Spearman uses ranks,
# so it makes no distributional assumption.
set.seed(32611)
a <- rnorm(10,0,1)
b <- rnorm(10,1,1)

cor.test(a,b) ## parametric test: Gaussian assumption
## 
##  Pearson's product-moment correlation
## 
## data:  a and b
## t = -3.4699, df = 8, p-value = 0.008444
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9440264 -0.2841809
## sample estimates:
##        cor 
## -0.7751172
cor.test(a,b, method = "spearman") ## non-parametric test: no distribution assumption
## 
##  Spearman's rank correlation rho
## 
## data:  a and b
## S = 292, p-value = 0.01367
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## -0.769697

KS test

# Two-sample Kolmogorov-Smirnov test: compares the empirical CDFs of
# two samples; D is the maximum vertical distance between them.
set.seed(32611)
x <- rnorm(50,0,1)
y <- rnorm(50,1,1)
ks.test(x,y)
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.38, p-value = 0.001315
## alternative hypothesis: two-sided
z <- runif(50)
ks.test(x,z)  # normal sample vs uniform sample (output not shown here)

Fit linear model in R, lm()

Basic syntax for lm()

lm(formula, data)  # schematic general form: formula like y ~ x, data = a data frame
# Simple linear regression: mpg on number of cylinders.
lmfit <- lm(mpg ~ cyl, data=mtcars)
lmfit
## 
## Call:
## lm(formula = mpg ~ cyl, data = mtcars)
## 
## Coefficients:
## (Intercept)          cyl  
##      37.885       -2.876
summary(lmfit)
## 
## Call:
## lm(formula = mpg ~ cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9814 -2.1185  0.2217  1.0717  7.5186 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.8846     2.0738   18.27  < 2e-16 ***
## cyl          -2.8758     0.3224   -8.92 6.11e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.206 on 30 degrees of freedom
## Multiple R-squared:  0.7262, Adjusted R-squared:  0.7171 
## F-statistic: 79.56 on 1 and 30 DF,  p-value: 6.113e-10
# Multiple regression: add displacement as a second predictor.
lmfit <- lm(mpg ~ cyl + disp, data=mtcars)
summary(lmfit)
## 
## Call:
## lm(formula = mpg ~ cyl + disp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4213 -2.1722 -0.6362  1.1899  7.0516 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.66099    2.54700  13.609 4.02e-14 ***
## cyl         -1.58728    0.71184  -2.230   0.0337 *  
## disp        -0.02058    0.01026  -2.007   0.0542 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.055 on 29 degrees of freedom
## Multiple R-squared:  0.7596, Adjusted R-squared:  0.743 
## F-statistic: 45.81 on 2 and 29 DF,  p-value: 1.058e-09
# The dot on the right-hand side means "all other columns as predictors".
lmfit <- lm(mpg ~ ., data=mtcars)
summary(lmfit)
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07