Biostatistical Computing, PHC 6068

Simulating random variables

Zhiguang Huo (Caleb)

Wednesday September 27, 2017

HW1

HW1 Solution:

Outlines

Random samples from a given pool of numbers

# Draw 2 values at random from the pool 1..10 (without replacement by default).
a <- 1:10
sample(x = a, size = 2)
## [1] 4 2
# No seed is fixed, so repeated calls give different results.
sample(x = a, size = 2)
## [1] 6 9
sample(x = a, size = 2)
## [1] 1 4

Are they really random? Can we generate the same random numbers again (for reproducibility)?

# Fixing the RNG seed makes the "random" draws reproducible:
# the same seed always yields the same sample.
a <- 1:10
set.seed(32611) ## 32611 can be replaced by any number
sample(x = a, size = 2)
## [1] 5 4
set.seed(32611) ## if you keep the same random seed, you will end up with the exact same result
sample(x = a, size = 2)
## [1] 5 4
set.seed(32611) ## if you keep the same random seed, you will end up with the exact same result
sample(x = a, size = 2)
## [1] 5 4

Each time the seed is set, the same sequence follows

# set.seed() restarts the RNG stream: it reproduces the whole SEQUENCE of
# draws that follows, not just the first draw.
set.seed(32611)
sample(1:10,2)
## [1] 5 4
sample(1:10,2)
## [1] 1 5
sample(1:10,2)
## [1] 2 1
# Re-seeding with the same value replays the identical sequence of samples.
set.seed(32611)
sample(1:10,2)
## [1] 5 4
sample(1:10,2)
## [1] 1 5
sample(1:10,2)
## [1] 2 1

Want to sample from a given pool of numbers with replacement

# Sampling with vs. without replacement.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned,
# while TRUE is a reserved word and always safe.
sample(1:10) ## default is without replacement (a random permutation)
##  [1]  9  4  6 10  2  5  8  3  7  1
sample(1:10, replace = TRUE) ## with replacement: values may repeat
##  [1]  7  8  6 10  3  4  9  2 10  4
sample(LETTERS[1:10], replace = TRUE) ## works for any vector, e.g. letters A-J
##  [1] "A" "B" "C" "B" "H" "F" "H" "H" "E" "E"

Random number generation from normal distribution

For normal distribution:

Random variable simulators in R

Distribution R command
binomial rbinom
Poisson rpois
geometric rgeom
negative binomial rnbinom
uniform runif
exponential rexp
normal rnorm
gamma rgamma
beta rbeta
student t rt
F rf
chi-squared rchisq
Weibull rweibull
log normal rlnorm

Random variable density in R

Distribution R command
binomial dbinom
Poisson dpois
geometric dgeom
negative binomial dnbinom
uniform dunif
exponential dexp
normal dnorm
gamma dgamma
beta dbeta
student t dt
F df
chi-squared dchisq
Weibull dweibull
log normal dlnorm

Random variable distribution tail probability in R

Distribution R command
binomial pbinom
Poisson ppois
geometric pgeom
negative binomial pnbinom
uniform punif
exponential pexp
normal pnorm
gamma pgamma
beta pbeta
student t pt
F pf
chi-squared pchisq
Weibull pweibull
log normal plnorm

Random variable distribution quantile in R

Distribution R command
binomial qbinom
Poisson qpois
geometric qgeom
negative binomial qnbinom
uniform qunif
exponential qexp
normal qnorm
gamma qgamma
beta qbeta
student t qt
F qf
chi-squared qchisq
Weibull qweibull
log normal qlnorm

Normal distribution

\[f(x;\mu,\sigma) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{(x-\mu)^2}{2\sigma^2}}\]

# Overlay three normal densities to show the effect of mean (location)
# and standard deviation (spread).
aseq <- seq(-4,4,.01)
# Standard normal N(0, 1) in black.
plot(aseq,dnorm(aseq, 0, 1),type='l', xlab='x', ylab='Density', lwd=2)
# Shifted mean: N(1, 1) in red.
lines(aseq,dnorm(aseq, 1, 1),col=2, lwd=2)
# Larger spread: N(0, 2) in green.
lines(aseq,dnorm(aseq,0, 2),col=3, lwd=2)
legend("topleft",c(expression(paste(mu==0, ", " ,  sigma==1 ,sep=' ')), 
             expression(paste(mu==1, ", " ,  sigma==1 ,sep=' ')), 
             expression(paste(mu==0, ", " ,  sigma==2 ,sep=' '))), 
       col=1:3, lty=c(1,1,1), lwd=c(2,2,2), cex=1, bty='n')
mtext(side=3,line=.5,'Normal distributions',cex=1, font=2)

t distribution

# t densities approach the standard normal as the degrees of freedom grow;
# smaller df gives heavier tails.
aseq <- seq(-4,4,.01)
# Standard normal as the reference curve (black).
plot(aseq,dnorm(aseq),type='l', xlab='x', ylab='Density', lwd=2)
lines(aseq,dt(aseq,10),col=2, lwd=2)
lines(aseq,dt(aseq,4),col=3, lwd=2)
lines(aseq,dt(aseq,2),col=4, lwd=2)
# Four legend entries, so supply length-4 lty/lwd vectors (the previous
# length-3 vectors only worked through silent recycling inside legend()).
legend("topleft",c(expression(normal), expression(paste(df==10,sep=' ')), 
             expression(paste(df==4,sep=' ')), 
             expression(paste(df==2,sep=' '))), 
       col=1:4, lty=rep(1,4), lwd=rep(2,4), cex=1, bty='n')
mtext(side=3,line=.5,'t distributions',cex=1, font=2)

Poisson distribution

\[f(k;\lambda) = \frac{\lambda^k e^{-\lambda}}{k!},\] where \(k\) is non negative integer.

# Poisson probability mass functions for lambda = 1..4, one panel each.
# The Poisson is discrete, so barplot() is used instead of a density curve.
aseq <- seq(0,8,1)
par(mfrow=c(2,2))
for(i in 1:4){
  # P(X = k) for k = 0..8 with rate lambda = i.
  counts <- dpois(aseq,i)
  names(counts) <- aseq
  barplot(counts, xlab='x', ylab='Density', lwd=2, col=i, las = 2,  main=bquote(paste(lambda==.(i), sep=' ')), ylim=c(0, 0.4))
}

Beta distribution

\[f(x;\alpha,\beta) = \frac{\Gamma(\alpha + \beta)}{\Gamma(\alpha) \Gamma(\beta)} x^{\alpha - 1}(1 - x)^{\beta - 1}\]

# Beta densities on (0, 1) for several (alpha, beta) pairs, showing U-shaped,
# symmetric, skewed, and flat (alpha = beta = 1, i.e. uniform) cases.
# Endpoints 0 and 1 are avoided since the density can be infinite there.
aseq <- seq(.001,.999,.001)
plot(aseq,dbeta(aseq,.25,.25), type='l', ylim=c(0,6), ylab='Density', xlab='Proportion (p)',lwd=2)
lines(aseq, dbeta(aseq,2,2),lty=2,lwd=2)
lines(aseq, dbeta(aseq,2,5),lty=1,col=2,lwd=2)
lines(aseq, dbeta(aseq,12,2),lty=2,col=2,lwd=2)
lines(aseq, dbeta(aseq,20,.75),lty=1,col='green',lwd=2)
lines(aseq, dbeta(aseq,1,1),lty=2,lwd=2, col=4)
legend(.2,6,c(expression(paste(alpha==.25,', ', beta==.25)), expression(paste(alpha==2,', ',beta==2)), expression(paste(alpha==2,', ', beta==5)), expression(paste(alpha==12,', ',beta==2)), expression(paste(alpha==20,', ',beta==.75)), expression(paste(alpha==1,', ', beta==1))), lty=c(1,2,1,2,1,2), col=c(1,1,2,2,'green',4), cex=1,bty='n',lwd=rep(2,6))
mtext(side=3,line=.5,'Beta distributions',cex=1, font=2)

Gamma distribution

\[f(x;k,\theta) = \frac{1}{\Gamma(k) \theta^k} x^{k-1}e^{-\frac{x}{\theta}}\]

# Gamma densities for several shape/rate pairs.
# Note dgamma(x, a, b) uses shape a and RATE b (rate = 1/theta in the
# scale parameterization shown in the formula above).
aseq <- seq(0,7,.01)
plot(aseq,dgamma(aseq,1,1),type='l', xlab='x', ylab='Density', lwd=2)
lines(aseq,dgamma(aseq,2,1),col=4, lwd=2)
lines(aseq,dgamma(aseq,4,4),col=2, lwd=2)
legend(3,1,c(expression(paste(alpha==1,', ',beta==1,sep=' ')), expression(paste(alpha==2,', ',beta==1,sep=' ')), expression(paste(alpha==4,', ', beta==4,sep=' '))), col=c(1,4,2), lty=c(1,1,1), lwd=c(2,2,2), cex=1, bty='n')
mtext(side=3,line=.5,'Gamma distributions',cex=1, font=2)

log normal distribution

A positive random variable \(X\) is log-normally distributed if the logarithm of X is normally distributed, \[\ln(X) \sim N(\mu, \sigma^2)\]

# Log-normal densities: dlnorm(x, mu, sigma) where mu and sigma are the
# mean and sd of log(X), NOT of X itself.
aseq <- seq(0,7,.01)
plot(aseq,dlnorm(aseq,.1,2),type='l', xlab='x', ylab='Density', lwd=2)
lines(aseq,dlnorm(aseq,2,1),col=4, lwd=2)
lines(aseq,dlnorm(aseq,0,1),col=2, lwd=2)
legend(3,1.2,c(expression(paste(mu==0.1,', ',sigma==2,sep=' ')), expression(paste(mu==2,', ',sigma==1,sep=' ')), expression(paste(mu==0,', ',sigma==1,sep=' '))), col=c(1,4,2), lty=c(1,1,1), lwd=c(2,2,2), cex=1,bty='n')
mtext(side=3,line=.5,'Lognormal distributions',cex=1, font=2)

samples from Normal distribution

# Compare the empirical density of 1000 N(0, 1) draws against the
# theoretical density curve.
set.seed(32608)
z <- rnorm(1000)
# freq = FALSE puts the histogram on the density scale so the curves overlay
# correctly (use TRUE/FALSE, not the reassignable aliases T/F).
hist(z, nclass = 50, freq = FALSE)
lines(density(z), col = 2, lwd = 2)                        # kernel density estimate
curve(dnorm, from = -3, to = 4, col = 4, lwd = 2, add = TRUE)  # exact N(0,1) density
# Fixed legend label typo: "theoritical" -> "theoretical".
legend("topright", c("empirical", "theoretical"), col = c(2, 4), lwd = c(2, 2))

Check CDF

# Compare the empirical CDF of 1000 N(0, 1) draws against the true CDF pnorm.
set.seed(32608)
z <- rnorm(1000)
plot(ecdf(z), ylab="Distribution", main="Empirical distribution",
     lwd=2, col="red")
# add = TRUE overlays the theoretical CDF on the ECDF plot
# (TRUE rather than the reassignable alias T).
curve(pnorm, from = -3, to = 3, lwd=2, add = TRUE)
legend("topleft", legend=c("Empirical distribution", "Actual distribution"),
       lwd=2, col=c("red","black"))

In class exercise: Verify weak law of large number

The weak law of large numbers states that the sample average converges in probability to the expected value.

\[\frac{1}{n} \sum_{i=1}^n X_i \rightarrow \mathbb{E}(X)\]

# In-class exercise: for each sample size in n, draw that many samples and
# compare the sample mean with the expected value E(X).
n <- 10^seq(2,8)
set.seed(32608)
## your code

p-value

p-value example

quantile: qnorm()

# 97.5% quantile of N(0, 1): the familiar 1.96 cutoff used for two-sided
# 95% confidence intervals and p-values.
qnorm(p = 0.975, mean = 0, sd = 1, lower.tail = TRUE) 
## [1] 1.959964

Multi-variate normal distribution

\[X \sim N( \begin{pmatrix} \mu_1 \\ \mu_2 \end{pmatrix}, \begin{pmatrix} \sigma_{11} & \sigma_{12} \\ \sigma_{21} & \sigma_{22} \end{pmatrix} )\]

# Draw 1000 bivariate normal samples with MASS::mvrnorm and verify that the
# sample covariance matrix is close to the specified Sigma.
mu <- c(0,0)
Sigma <- matrix(c(10,3,3,2),2,2)
Sigma
##      [,1] [,2]
## [1,]   10    3
## [2,]    3    2
var(MASS::mvrnorm(n = 1000, mu, Sigma))
##           [,1]     [,2]
## [1,] 10.144428 2.880464
## [2,]  2.880464 1.859495

Multi-variate normal distribution simulation

# Scatterplots of bivariate normal samples under six covariance structures,
# illustrating variance scaling and positive/negative correlation.
# The six copy-pasted matrix/plot pairs are replaced by a list and a loop.
mu <- c(0,0)
Sigmas <- list(
  matrix(c(1, 0, 0, 1), 2, 2),       # independent, unit variances
  matrix(c(1, 0, 0, 5), 2, 2),       # larger variance in y
  matrix(c(5, 0, 0, 1), 2, 2),       # larger variance in x
  matrix(c(1, 0.8, 0.8, 1), 2, 2),   # strong positive correlation
  matrix(c(1, 1, 1, 1), 2, 2),       # perfect correlation (degenerate)
  matrix(c(1, -0.8, -0.8, 1), 2, 2)  # strong negative correlation
)
arange <- c(-6,6)
par(mfrow=c(2,3))
for (Sigma_i in Sigmas) {
  plot(MASS::mvrnorm(n = 1000, mu, Sigma_i), xlim=arange, ylim=arange)
}

Truncated distribution

# The truncdist package provides d/r/p/q functions for distributions
# truncated to an interval [a, b]; it pulls in stats4 and evd.
library(truncdist)
## Loading required package: stats4
## Loading required package: evd
# dtrunc
# rtrunc
# ptrunc
# qtrunc

generate samples from truncated normal distribution

# 1000 draws from a standard normal truncated to [1, Inf): only values >= 1.
x <- rtrunc( 1000, spec="norm", a=1, b=Inf )
hist(x, nclass = 50)

density of truncated normal distribution

# Density of the standard normal truncated to [1, Inf); zero below x = 1.
truncf <- function(x) dtrunc(x, spec="norm", a=1, b=Inf )
curve(truncf, from = -6, to = 6)

What kinds of distributions (spec) does this function support?

# spec names the base distribution family ("norm", "beta", "gamma", ...);
# extra parameters of that family are passed through by name.
dtrunc(0.5, spec="norm", a=-2, b=Inf )
## [1] 0.3602613
dtrunc(0.5, spec="beta", shape1=2, shape2 = 2, a=0.3, b=Inf )
## [1] 1.913265
dtrunc(0.5, spec="gamma", shape=2, rate = 2, a=0.3, b=Inf )
## [1] 0.8379001

Another way to generate random samples

The CDF of a continuous random variable, evaluated at that variable, follows UNIF(0,1)

Let \(Z = F_X(X)\), where \(F_X\) is the CDF of \(X\). Then \[F_Z(x) = P(F_X(X) \le x) = P(X \le F_X^{-1}(x)) = F_X(F_X^{-1}(x)) = x\]

For comparison, if \(U\) is a uniform random variable taking values in \([0, 1]\), \[F_U(x) = \int_{\mathbb{R}} f_U(u) \, du = \int_0^x du = x\]

Since the two CDFs agree, \(Z \sim UNIF(0, 1)\)

Beta distribution

Beta distribution (two approaches)

# Two ways to draw from Beta(3, 1):
#   1. directly via rbeta
#   2. inverse-CDF sampling: the Beta(3, 1) CDF is F(x) = x^3, so if
#      U ~ UNIF(0, 1) then F^{-1}(U) = U^(1/3) ~ Beta(3, 1).
set.seed(32611)
n <- 10000
x1 <- rbeta(n, 3, 1)   # direct sampling
x2_0 <- runif(n)       # uniform draws
# Use parentheses for grouping the exponent; the original ^{1/3} used braces,
# which R treats as a block expression -- legal but misleading.
x2 <- x2_0^(1/3)

par(mfrow=c(1,2))
hist(x1, nclass=50)
hist(x2, nclass=50)

Simulate two clusters, apply Kmeans algorithm and hclust algorithm (In class exercise)

Compare direct sampling and inverse CDF sampling (In class exercise)

  1. directly sample n=1000 samples from N(1,2)
  2. use inverse CDF method to sample n=1000 from N(1,2) (hint: consider qnorm function)
  3. Compare the theoretical CDF, the empirical CDF from 1, and the empirical CDF from 2.

Generate R code

knitr::purl("rvb.Rmd", output = "rvb.R ", documentation = 2)
## 
## 
## processing file: rvb.Rmd
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |.                                                                |   2%
  |                                                                       
  |..                                                               |   4%
  |                                                                       
  |....                                                             |   6%
  |                                                                       
  |.....                                                            |   8%
  |                                                                       
  |......                                                           |  10%
  |                                                                       
  |........                                                         |  12%
  |                                                                       
  |.........                                                        |  13%
  |                                                                       
  |..........                                                       |  15%
  |                                                                       
  |...........                                                      |  17%
  |                                                                       
  |............                                                     |  19%
  |                                                                       
  |..............                                                   |  21%
  |                                                                       
  |...............                                                  |  23%
  |                                                                       
  |................                                                 |  25%
  |                                                                       
  |..................                                               |  27%
  |                                                                       
  |...................                                              |  29%
  |                                                                       
  |....................                                             |  31%
  |                                                                       
  |.....................                                            |  33%
  |                                                                       
  |......................                                           |  35%
  |                                                                       
  |........................                                         |  37%
  |                                                                       
  |.........................                                        |  38%
  |                                                                       
  |..........................                                       |  40%
  |                                                                       
  |............................                                     |  42%
  |                                                                       
  |.............................                                    |  44%
  |                                                                       
  |..............................                                   |  46%
  |                                                                       
  |...............................                                  |  48%
  |                                                                       
  |................................                                 |  50%
  |                                                                       
  |..................................                               |  52%
  |                                                                       
  |...................................                              |  54%
  |                                                                       
  |....................................                             |  56%
  |                                                                       
  |......................................                           |  58%
  |                                                                       
  |.......................................                          |  60%
  |                                                                       
  |........................................                         |  62%
  |                                                                       
  |.........................................                        |  63%
  |                                                                       
  |..........................................                       |  65%
  |                                                                       
  |............................................                     |  67%
  |                                                                       
  |.............................................                    |  69%
  |                                                                       
  |..............................................                   |  71%
  |                                                                       
  |................................................                 |  73%
  |                                                                       
  |.................................................                |  75%
  |                                                                       
  |..................................................               |  77%
  |                                                                       
  |...................................................              |  79%
  |                                                                       
  |....................................................             |  81%
  |                                                                       
  |......................................................           |  83%
  |                                                                       
  |.......................................................          |  85%
  |                                                                       
  |........................................................         |  87%
  |                                                                       
  |..........................................................       |  88%
  |                                                                       
  |...........................................................      |  90%
  |                                                                       
  |............................................................     |  92%
  |                                                                       
  |.............................................................    |  94%
  |                                                                       
  |..............................................................   |  96%
  |                                                                       
  |................................................................ |  98%
  |                                                                       
  |.................................................................| 100%
## output file: rvb.R
## [1] "rvb.R "