Programming basics for Biostatistics 6099

Vectorized calculation

Zhiguang Huo (Caleb)

Thursday September 7, 2023

Why do we need vectorized calculation (motivating example 1)

version A, calculation by loop

a <- 1:1000000
### version A, loop
start <- Sys.time()
meanA <- 0
for(i in seq_along(a)){
  meanA <- meanA + a[i]/length(a)
}
end <- Sys.time() 
end - start
## Time difference of 0.09918714 secs
meanA
## [1] 500000.5

version B, vectorized calculation

start <- Sys.time()
mean(a)
## [1] 500000.5
end <- Sys.time() 
end - start
## Time difference of 0.002562046 secs

Why do we need vectorized calculation (motivating example 2)

version A, calculation by loop

a <- 1:1000000
b <- 1000000:1
### version A, loop
start <- Sys.time()
result <- numeric(length(a)) ## create a vector with length length(a) and all elements 0
for(i in seq_along(a)){
  result[i] <- a[i] + b[i]
}
end <- Sys.time() 
end - start
## Time difference of 0.04570603 secs

version B, vectorized calculation

start <- Sys.time()
result <- a + b
end <- Sys.time() 
end - start
## Time difference of 0.004063129 secs

Simple examples of vectorized calculation

a <- c(1.1,3.4,9.5)
b <- c(9,2,0.8)
a + b ## vector addition
## [1] 10.1  5.4 10.3
a * b ## vector multiplication
## [1] 9.9 6.8 7.6
a ^ b ## 1.1^9, 3.4^2, 9.5^0.8
## [1]  2.357948 11.560000  6.055903

Simple examples of vectorized calculation 2

a <- 1:8
a + 2
## [1]  3  4  5  6  7  8  9 10
a + c(0,1) ## if vector a is integer multiple of another vector
## [1] 1 3 3 5 5 7 7 9
a + c(1,2,3) ## warming message 
## Warning in a + c(1, 2, 3): longer object length is not a multiple of shorter
## object length
## [1]  2  4  6  5  7  9  8 10

Other scientific calculation

a <- seq(1,3,1)
sin(a)
## [1] 0.8414710 0.9092974 0.1411200
tan(a)
## [1]  1.5574077 -2.1850399 -0.1425465
log(a,base = 10)
## [1] 0.0000000 0.3010300 0.4771213
log10(a)
## [1] 0.0000000 0.3010300 0.4771213
exp(a)
## [1]  2.718282  7.389056 20.085537

ifelse

a <- seq(1,8,1)

## by loop
res <- character(length(a))
for(i in 1:length(a)){
  if(a[i] %% 2 == 0){
    res[i] <- "even"
  } else{
    res[i] <- "odd"
  }
}
res
## [1] "odd"  "even" "odd"  "even" "odd"  "even" "odd"  "even"
## by vecterized calculation
ifelse(a %% 2 ==0, "even", "odd")
## [1] "odd"  "even" "odd"  "even" "odd"  "even" "odd"  "even"

vectorized input for plot

plot(x = 1:10, y = 1:10, col=1:10)

plot(x = 1:10, y = 1:10, col=1)

Initialize a vector

n <- 5
a_int <- integer(n)
a_int
## [1] 0 0 0 0 0
b_int <- rep(0, n)
b_int
## [1] 0 0 0 0 0
c_int <- replicate(5,0)
c_int
## [1] 0 0 0 0 0

Initialize a vector, other types

a_double <- double(n)
a_double
## [1] 0 0 0 0 0
a_char <- character(n)
a_char
## [1] "" "" "" "" ""
a_logical <- logical(n)
a_logical
## [1] FALSE FALSE FALSE FALSE FALSE

A sequence

a <- seq(from=10, to=100, by=2)
a
##  [1]  10  12  14  16  18  20  22  24  26  28  30  32  34  36  38  40  42  44  46
## [20]  48  50  52  54  56  58  60  62  64  66  68  70  72  74  76  78  80  82  84
## [39]  86  88  90  92  94  96  98 100
b <- seq(from=10, to=100, length=10)
b
##  [1]  10  20  30  40  50  60  70  80  90 100
c <- seq(from=10, to=100, along.with=1:10)
c
##  [1]  10  20  30  40  50  60  70  80  90 100
seq_len(10)
##  [1]  1  2  3  4  5  6  7  8  9 10
seq_along(b)
##  [1]  1  2  3  4  5  6  7  8  9 10

Random number generator

set.seed(32611) ## set a seed number such that the random numbers will keep the same
rnorm(n = 5, mean = 0, sd = 1)
## [1] -0.1023726 -1.8869039 -1.0586563  1.0227599  0.3982516
set.seed(32611)
rnorm(n = 5, mean = 0:4, sd = 1)
## [1] -0.1023726 -0.8869039  0.9413437  4.0227599  4.3982516

Note that with the same random seed, different random numbers can be generated on different R versions.

sessionInfo() ## check R version
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.33     R6_2.5.1          fastmap_1.1.1     xfun_0.39        
##  [5] cachem_1.0.8      knitr_1.43        htmltools_0.5.5   rmarkdown_2.23   
##  [9] cli_3.6.1         sass_0.4.7        jquerylib_0.1.4   compiler_4.3.1   
## [13] highr_0.10        rstudioapi_0.15.0 tools_4.3.1       evaluate_0.21    
## [17] bslib_0.5.0       yaml_2.3.7        rlang_1.1.1       jsonlite_1.8.7
set.seed(32611)
sample(1:10,3)
## [1] 1 7 3
set.seed(32611)
sample(1:10)
##  [1]  1  7  3  4  5  6 10  2  8  9
sample(1:10, replace = T)
##  [1] 3 8 9 7 2 6 5 5 7 3
set.seed(32611)
runif(n = 4)
## [1] 0.45923048 0.37486058 0.02958663 0.51775341

replicate

replicate(4,list())
## [[1]]
## list()
## 
## [[2]]
## list()
## 
## [[3]]
## list()
## 
## [[4]]
## list()
set.seed(32611)
replicate(5, runif(sample(1:5,1)))
## [[1]]
## [1] 0.3748606
## 
## [[2]]
## [1] 0.51775341 0.14487818 0.02609954
## 
## [[3]]
## [1] 0.6547776 0.5071504 0.2504906 0.9807351 0.4424658
## 
## [[4]]
## [1] 0.5409666 0.5041831 0.6529512 0.7552580 0.5850660
## 
## [[5]]
## [1] 0.2715686 0.3032975

Matrix calculation

a <- matrix(1:6,nrow=3,ncol=2)
a ## matrix will be filled by column by default. 
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
b <- matrix(6:1, nrow=3, ncol=2)
a * b ## similar to vector, matrix algebra will be done element-wise.
##      [,1] [,2]
## [1,]    6   12
## [2,]   10   10
## [3,]   12    6
a + c(1,2,3) ## if a matrix add a vector, add by column
##      [,1] [,2]
## [1,]    2    5
## [2,]    4    7
## [3,]    6    9
a + 3 ## if a matrix add one number, add this number to each element of the matrix
##      [,1] [,2]
## [1,]    4    7
## [2,]    5    8
## [3,]    6    9

apply function: works on margins of a matrix (1)

a <- matrix(1:6,nrow=3,ncol=2)
a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
apply(a,1,function(x){sum(x)}) ## same as below
## [1] 5 7 9
apply(a,1,sum) ## for each row, equivalent to rowSums
## [1] 5 7 9
rowSums(a)
## [1] 5 7 9
apply(a,1,var) ## for each row, calculate the variance
## [1] 4.5 4.5 4.5
apply(a,1,function(x) x^2) ## can also use an anonymous function, note that R will always fill the result by column.
##      [,1] [,2] [,3]
## [1,]    1    4    9
## [2,]   16   25   36

apply function: works on margins of a matrix (2)

apply(a,2,sum) ## for each column, equivalent to colSums
## [1]  6 15
colSums(a)
## [1]  6 15
apply(a,2,mean) ## for each column, equivalent to colMeans
## [1] 2 5
apply(a,2,function(x) min(x)) ## equivalent to apply(a,2,min)
## [1] 1 4
apply(a,2,min)
## [1] 1 4

Sweep

set.seed(32611)
(x <- matrix(rnorm(6,0,4),nrow=2))
##            [,1]      [,2]      [,3]
## [1,] -0.4094903 -4.234625  1.593006
## [2,] -7.5476157  4.091040 -2.691786
x1 <- sweep(x,1,apply(x,1,min),'-')
x1
##          [,1]     [,2]     [,3]
## [1,] 3.825135  0.00000 5.827631
## [2,] 0.000000 11.63866 4.855829
x2 <- sweep(x1,1,apply(x1,1,max),'/')
x2
##          [,1] [,2]      [,3]
## [1,] 0.656379    0 1.0000000
## [2,] 0.000000    1 0.4172157

lapply()

l <- list(c(1:10), list(a='a'), c("dog","cat","gator"))
unlist(lapply(l, function(x) x[1])) ## x refers to each element of the list
##           a       
##   "1"   "a" "dog"
unlist(lapply(l, length))
## [1] 10  1  3
lapply2 <- function(x, f, ...){
  out <- vector("list", length(x))
  for(i in seq_along(x)){
    out[[i]] <- f(x[[i]], ...)
  }
  out
}
unlist(lapply2(l, length))
## [1] 10  1  3

lapply usages

alist <- list(a = 25, b = 100, c = 64)
lapply(alist, function(x) sqrt(x))
## $a
## [1] 5
## 
## $b
## [1] 10
## 
## $c
## [1] 8
lapply(seq_along(alist), function(x) sqrt(alist[[x]]))
## [[1]]
## [1] 5
## 
## [[2]]
## [1] 10
## 
## [[3]]
## [1] 8
lapply(names(alist), function(x) sqrt(alist[[x]]))
## [[1]]
## [1] 5
## 
## [[2]]
## [1] 10
## 
## [[3]]
## [1] 8

lapply on data.frame

aframe <- data.frame(col1=1:3,col2=4:6)
lapply(aframe, mean) ## as a list
## $col1
## [1] 2
## 
## $col2
## [1] 5
apply(aframe, 2, mean) ## as a matrix
## col1 col2 
##    2    5

lapply on a list of functions

compute_mean <- list(
  base = function(x) mean(x),
  sum = function(x) sum(x)/length(x),
  mannual = function(x){
    total <- 0; n <- length(x)
    for(i in seq_along(x)) total <- total + x[i]/n
    total
  }
)
set.seed(32611); x <- runif(1e6)
lapply(compute_mean,function(f) system.time(f(x))) ## f refers to each element of the list
## $base
##    user  system elapsed 
##   0.002   0.000   0.002 
## 
## $sum
##    user  system elapsed 
##   0.002   0.000   0.001 
## 
## $mannual
##    user  system elapsed 
##   0.020   0.000   0.021

sapply()

aframe <- data.frame(col1=1:3,col2=4:6)
sapply(aframe, sum) ## 
## col1 col2 
##    6   15
alist <- list(col1=1:3,col2=c("a","b"))
sapply(alist, unique) ## if not the same type, will coerce to a list
## $col1
## [1] 1 2 3
## 
## $col2
## [1] "a" "b"

vapply()

aframe <- data.frame(col1=1:3,col2=4:6)
vapply(aframe, sum, numeric(1)) 
## col1 col2 
##    6   15

tapply

pulse <- round(rnorm(22,70,10/3) + rep(c(0,5), c(10,12)))
groups <- rep(c("A", "B"), c(10, 12))
tapply(X = pulse, INDEX = groups, FUN = length)
##  A  B 
## 10 12
tapply(pulse, groups, mean)
##        A        B 
## 70.20000 74.91667
tapply2 <- function(x, group, f, ..., simplify = TRUE){
  pieces <- split(x, group)
  sapply(pieces, f, simplify=simplify)
}
tapply2(pulse, groups, length)
##  A  B 
## 10 12

Multiple inputs (Map)

xs <- replicate(3, runif(4),simplify=FALSE) ## simplify = TRUE (default) will convert a list to matrix whenever possible
ws <- replicate(3, rnorm(4, 1) + 1,simplify=FALSE)
xs
## [[1]]
## [1] 0.08032161 0.79005156 0.60823794 0.43907623
## 
## [[2]]
## [1] 0.2121942 0.7209280 0.3716162 0.7708587
## 
## [[3]]
## [1] 0.7943670 0.3145729 0.4271521 0.3151103
ws
## [[1]]
## [1] 2.6430216 0.2654494 2.0656753 2.0416840
## 
## [[2]]
## [1] 1.378534 1.673015 1.873283 3.820249
## 
## [[3]]
## [1] 2.394193 1.196721 2.730337 1.249842
unlist(lapply(seq_along(xs), function(i){
  weighted.mean(xs[[i]], ws[[i]])
}))
## [1] 0.3670111 0.5877195 0.5069851
Map(weighted.mean,xs,ws)
## [[1]]
## [1] 0.3670111
## 
## [[2]]
## [1] 0.5877195
## 
## [[3]]
## [1] 0.5069851
Map(function(x,w) weighted.mean(x, w, na.rm=TRUE),xs, ws)
## [[1]]
## [1] 0.3670111
## 
## [[2]]
## [1] 0.5877195
## 
## [[3]]
## [1] 0.5069851
Map(weighted.mean,xs, ws, na.rm=TRUE)
## [[1]]
## [1] 0.3670111
## 
## [[2]]
## [1] 0.5877195
## 
## [[3]]
## [1] 0.5069851

approach 3

mapply(weighted.mean,xs,ws)
## [1] 0.3670111 0.5877195 0.5069851
mapply(function(x,y) weighted.mean(x,y), xs,ws)
## [1] 0.3670111 0.5877195 0.5069851

Reduce

set.seed(32611)
l <- replicate(4, sample(1:10, 15, replace = T), simplify = FALSE)
str(l)
## List of 4
##  $ : int [1:15] 1 7 3 7 8 5 5 1 2 6 ...
##  $ : int [1:15] 6 5 5 7 3 7 3 1 10 7 ...
##  $ : int [1:15] 5 1 10 6 4 5 10 7 8 1 ...
##  $ : int [1:15] 1 1 4 4 2 7 3 10 4 2 ...
intersect(intersect(intersect(l[[1]], l[[2]]),
  l[[3]]), l[[4]])
## [1] 1 7 5 2
Reduce(intersect,l)
## [1] 1 7 5 2

Reduce 2

a <- c(1:10)
Reduce('+', a)
## [1] 55
b = as.list(letters[1:10])
Reduce("paste0", b)
## [1] "abcdefghij"
d <- as.list(letters[1:10])
Reduce("c",d)
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"

Outer

outer(1:3, 1:5, "*")
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    2    3    4    5
## [2,]    2    4    6    8   10
## [3,]    3    6    9   12   15
1:3 %o% 1:5
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    2    3    4    5
## [2,]    2    4    6    8   10
## [3,]    3    6    9   12   15

Outer 2

outer(1:3, 1:5, "paste")
##      [,1]  [,2]  [,3]  [,4]  [,5] 
## [1,] "1 1" "1 2" "1 3" "1 4" "1 5"
## [2,] "2 1" "2 2" "2 3" "2 4" "2 5"
## [3,] "3 1" "3 2" "3 3" "3 4" "3 5"
outer(1:3, 1:5, ">")
##       [,1]  [,2]  [,3]  [,4]  [,5]
## [1,] FALSE FALSE FALSE FALSE FALSE
## [2,]  TRUE FALSE FALSE FALSE FALSE
## [3,]  TRUE  TRUE FALSE FALSE FALSE
outer(1:3, 1:5, "^")
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    1    1    1    1
## [2,]    2    4    8   16   32
## [3,]    3    9   27   81  243

Vectorize a function

f <- function(x, y) c(x, y)
vf <- Vectorize(f, vectorize.args = c("x", "y"), SIMPLIFY = FALSE)
f(1:3, 1:3)
## [1] 1 2 3 1 2 3
vf(1:3, 1:3)
## [[1]]
## [1] 1 1
## 
## [[2]]
## [1] 2 2
## 
## [[3]]
## [1] 3 3
combn(4,2) ## all combinations of "choose 2 numbers from 1:4"
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    1    1    2    2    3
## [2,]    2    3    4    3    4    4
## However, you cannot do combn(c(4,5),c(2,3)), how to vectorize this function
combnV <- Vectorize(function(x, m) combn(x, m),
                    vectorize.args = c("x", "m"))

combnV(c(4,5),c(2,3))
## [[1]]
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    1    1    2    2    3
## [2,]    2    3    4    3    4    4
## 
## [[2]]
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]    1    1    1    1    1    1    2    2    2     3
## [2,]    2    2    2    3    3    4    3    3    4     4
## [3,]    3    4    5    4    5    5    4    5    5     5

Balance between efficiency and simplicity of your code