Biostatistical Computing, PHC 6068

Vectorized calculation

Zhiguang Huo (Caleb)

Wednesday September 5, 2018

Why do we need vectorized calculation (motivating example 1)

version A, calculation by loop

a <- 1:1000000
### version A, loop
start <- Sys.time()
meanA <- 0
for(i in seq_along(a)){
  meanA <- meanA + a[i]/length(a)
}
end <- Sys.time() 
end - start
## Time difference of 0.320843 secs
meanA
## [1] 500000.5

version B, vectorized calculation

start <- Sys.time()
mean(a)
## [1] 500000.5
end <- Sys.time() 
end - start
## Time difference of 0.00848484 secs

Why do we need vectorized calculation (motivating example 2)

version A, calculation by loop

a <- 1:1000000
b <- 1000000:1
### version A, loop
start <- Sys.time()
result <- numeric(length(a)) ## create a vector with length length(a) and all elements 0
for(i in seq_along(a)){
  result[i] <- a[i] + b[i]
}
end <- Sys.time() 
end - start
## Time difference of 0.162504 secs

version B, vectorized calculation

start <- Sys.time()
result <- a + b
end <- Sys.time() 
end - start
## Time difference of 0.01691008 secs

Simple examples of vectorized calculation

a <- c(1.1,3.4,9.5)
b <- c(9,2,0.8)
a + b ## vector addition
## [1] 10.1  5.4 10.3
a * b ## vector multiplication
## [1] 9.9 6.8 7.6
a ^ b ## 1.1^9, 3.4^2, 9.5^0.8
## [1]  2.357948 11.560000  6.055903

Simple examples of vectorized calculation 2

a <- 1:8
a + 2
## [1]  3  4  5  6  7  8  9 10
a + c(0,1) ## if vector a is integer multiple of another vector
## [1] 1 3 3 5 5 7 7 9
a + c(1,2,3) ## warming message 
## Warning in a + c(1, 2, 3): longer object length is not a multiple of
## shorter object length
## [1]  2  4  6  5  7  9  8 10

Other scientific calculation

a <- seq(1,3,1)
sin(a)
## [1] 0.8414710 0.9092974 0.1411200
tan(a)
## [1]  1.5574077 -2.1850399 -0.1425465
log(a,base = 10)
## [1] 0.0000000 0.3010300 0.4771213
log10(a)
## [1] 0.0000000 0.3010300 0.4771213
exp(a)
## [1]  2.718282  7.389056 20.085537

ifelse

a <- seq(1,8,1)

## by loop
res <- character(length(a))
for(i in 1:length(a)){
  if(a[i] %% 2 == 0){
    res[i] <- "even"
  } else{
    res[i] <- "odd"
  }
}
res
## [1] "odd"  "even" "odd"  "even" "odd"  "even" "odd"  "even"
## by vecterized calculation
ifelse(a %% 2 ==0, "even", "odd")
## [1] "odd"  "even" "odd"  "even" "odd"  "even" "odd"  "even"

vectorized input for plot

plot(x = 1:10, y = 1:10, col=1:10)

plot(x = 1:10, y = 1:10, col=1)

Initialize a vector

n <- 5
a_int <- integer(n)
a_int
## [1] 0 0 0 0 0
b_int <- rep(0, n)
b_int
## [1] 0 0 0 0 0
c_int <- replicate(5,0)
c_int
## [1] 0 0 0 0 0

Initialize a vector, other types

a_double <- double(n)
a_double
## [1] 0 0 0 0 0
a_char <- character(n)
a_char
## [1] "" "" "" "" ""
a_logical <- logical(n)
a_logical
## [1] FALSE FALSE FALSE FALSE FALSE

A sequence

a <- seq(from=10, to=100, by=2)
a
##  [1]  10  12  14  16  18  20  22  24  26  28  30  32  34  36  38  40  42
## [18]  44  46  48  50  52  54  56  58  60  62  64  66  68  70  72  74  76
## [35]  78  80  82  84  86  88  90  92  94  96  98 100
b <- seq(from=10, to=100, length=10)
b
##  [1]  10  20  30  40  50  60  70  80  90 100
c <- seq(from=10, to=100, along.with=1:10)
c
##  [1]  10  20  30  40  50  60  70  80  90 100

Random number generator

set.seed(32611) ## set a seed number such that the random numbers will keep the same
rnorm(n = 5, mean = 0, sd = 1)
## [1] -0.1023726 -1.8869039 -1.0586563  1.0227599  0.3982516
set.seed(32611)
rnorm(n = 5, mean = 0:4, sd = 1)
## [1] -0.1023726 -0.8869039  0.9413437  4.0227599  4.3982516
set.seed(32611)
sample(1:10,3)
## [1] 5 4 1
set.seed(32611)
sample(1:10)
##  [1]  5  4  1  9  8  6  7  2  3 10
sample(1:10, replace = T)
##  [1]  3 10  5  9  6  6  7  8  6 10
set.seed(32611)
runif(n = 4)
## [1] 0.45923048 0.37486058 0.02958663 0.51775341

replicate

replicate(4,list())
## [[1]]
## list()
## 
## [[2]]
## list()
## 
## [[3]]
## list()
## 
## [[4]]
## list()
set.seed(32611)
replicate(5, runif(sample(1:5,1)))
## [[1]]
## [1] 0.37486058 0.02958663 0.51775341
## 
## [[2]]
## [1] 0.02609954
## 
## [[3]]
## [1] 0.4031372 0.6547776 0.5071504 0.2504906 0.9807351
## 
## [[4]]
## [1] 0.8903292 0.5409666 0.5041831
## 
## [[5]]
## [1] 0.7552580 0.5850660 0.9150680 0.2715686

Matrix calculation

a <- matrix(1:6,nrow=3,ncol=2)
a ## matrix will be filled by column by default. 
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
b <- matrix(6:1, nrow=3, ncol=2)
a * b ## similar to vector, matrix algebra will be done element-wise.
##      [,1] [,2]
## [1,]    6   12
## [2,]   10   10
## [3,]   12    6
a + c(1,2,3) ## if a matrix add a vector, add by column
##      [,1] [,2]
## [1,]    2    5
## [2,]    4    7
## [3,]    6    9
a + 3 ## if a matrix add one number, add this number to each element of the matrix
##      [,1] [,2]
## [1,]    4    7
## [2,]    5    8
## [3,]    6    9

apply function: works on margins of a matrix (1)

a <- matrix(1:6,nrow=3,ncol=2)
a
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
apply(a,1,sum) ## for each row, equivalent to rowSums
## [1] 5 7 9
apply(a,1,var) ## for each row, calculate the variance
## [1] 4.5 4.5 4.5
apply(a,1,function(x) x^2) ## can also use an anonymous function, note that R will always fill the result by column.
##      [,1] [,2] [,3]
## [1,]    1    4    9
## [2,]   16   25   36

apply function: works on margins of a matrix (2)

apply(a,2,sum) ## for each column, equivalent to colSums
## [1]  6 15
colSums(a)
## [1]  6 15
apply(a,2,mean) ## for each column, equivalent to colMeans
## [1] 2 5
apply(a,2,function(x) min(x)) ## equivalent to apply(a,2,min)
## [1] 1 4
apply(a,2,min)
## [1] 1 4

Sweep

set.seed(32611)
(x <- matrix(rnorm(6,0,4),nrow=2))
##            [,1]      [,2]      [,3]
## [1,] -0.4094903 -4.234625  1.593006
## [2,] -7.5476157  4.091040 -2.691786
x1 <- sweep(x,1,apply(x,1,min),'-')
x1
##          [,1]     [,2]     [,3]
## [1,] 3.825135  0.00000 5.827631
## [2,] 0.000000 11.63866 4.855829
x2 <- sweep(x1,1,apply(x1,1,max),'/')
x2
##          [,1] [,2]      [,3]
## [1,] 0.656379    0 1.0000000
## [2,] 0.000000    1 0.4172157

lapply()

l <- list(c(1:10), list(a='a'), c("dog","cat","gator"))
unlist(lapply(l, length))
## [1] 10  1  3
lapply2 <- function(x, f, ...){
  out <- vector("list", length(x))
  for(i in seq_along(x)){
    out[[i]] <- f(x[[i]], ...)
  }
  out
}
unlist(lapply2(l, length))
## [1] 10  1  3

lapply usages

alist <- list(a = 25, b = 100, c = 64)
lapply(alist, function(x) sqrt(x))
## $a
## [1] 5
## 
## $b
## [1] 10
## 
## $c
## [1] 8
lapply(seq_along(alist), function(x) sqrt(alist[[x]]))
## [[1]]
## [1] 5
## 
## [[2]]
## [1] 10
## 
## [[3]]
## [1] 8
lapply(names(alist), function(x) sqrt(alist[[x]]))
## [[1]]
## [1] 5
## 
## [[2]]
## [1] 10
## 
## [[3]]
## [1] 8

lapply on data.frame

aframe <- data.frame(col1=1:3,col2=4:6)
lapply(aframe, mean) ## as a list
## $col1
## [1] 2
## 
## $col2
## [1] 5
apply(aframe, 2, mean) ## as a matrix
## col1 col2 
##    2    5

lapply on a list of functions

compute_mean <- list(
  base = function(x) mean(x),
  sum = function(x) sum(x)/length(x),
  mannual = function(x){
    total <- 0; n <- length(x)
    for(i in seq_along(x)) total <- total + x[i]/n
    total
  }
)
set.seed(32611); x <- runif(1e6)
lapply(compute_mean,function(f) system.time(f(x)))
## $base
##    user  system elapsed 
##   0.003   0.000   0.003 
## 
## $sum
##    user  system elapsed 
##   0.002   0.000   0.002 
## 
## $mannual
##    user  system elapsed 
##   0.073   0.000   0.075

sapply()

aframe <- data.frame(col1=1:3,col2=4:6)
sapply(aframe, sum) ## 
## col1 col2 
##    6   15
alist <- list(col1=1:3,col2=c("a","b"))
sapply(alist, unique) ## if not the same type, will coerce to a list
## $col1
## [1] 1 2 3
## 
## $col2
## [1] "a" "b"

vapply()

aframe <- data.frame(col1=1:3,col2=4:6)
vapply(aframe, sum, numeric(1)) 
## col1 col2 
##    6   15

aggregate

head(state.x77)
##            Population Income Illiteracy Life Exp Murder HS Grad Frost
## Alabama          3615   3624        2.1    69.05   15.1    41.3    20
## Alaska            365   6315        1.5    69.31   11.3    66.7   152
## Arizona          2212   4530        1.8    70.55    7.8    58.1    15
## Arkansas         2110   3378        1.9    70.66   10.1    39.9    65
## California      21198   5114        1.1    71.71   10.3    62.6    20
## Colorado         2541   4884        0.7    72.06    6.8    63.9   166
##              Area
## Alabama     50708
## Alaska     566432
## Arizona    113417
## Arkansas    51945
## California 156361
## Colorado   103766
head(state.region)
## [1] South West  West  South West  West 
## Levels: Northeast South North Central West
aggregate(state.x77, list(Region = state.region), mean)
##          Region Population   Income Illiteracy Life Exp    Murder  HS Grad
## 1     Northeast   5495.111 4570.222   1.000000 71.26444  4.722222 53.96667
## 2         South   4208.125 4011.938   1.737500 69.70625 10.581250 44.34375
## 3 North Central   4803.000 4611.083   0.700000 71.76667  5.275000 54.51667
## 4          West   2915.308 4702.615   1.023077 71.23462  7.215385 62.00000
##      Frost      Area
## 1 132.7778  18141.00
## 2  64.6250  54605.12
## 3 138.8333  62652.00
## 4 102.1538 134463.00

tapply

pulse <- round(rnorm(22,70,10/3) + rep(c(0,5), c(10,12)))
groups <- rep(c("A", "B"), c(10, 12))
tapply(pulse, groups, length)
##  A  B 
## 10 12
tapply(pulse, groups, mean)
##        A        B 
## 70.20000 74.91667
tapply2 <- function(x, group, f, ..., simplify = TRUE){
  pieces <- split(x, group)
  sapply(pieces, f, simplify=simplify)
}
tapply2(pulse, groups, length)
##  A  B 
## 10 12

Multiple inputs (Map)

xs <- replicate(3, runif(4),simplify=FALSE) ## simplify = TRUE (default) will convert a list to matrix whenever possible
ws <- replicate(3, rnorm(4, 1) + 1,simplify=FALSE)
xs
## [[1]]
## [1] 0.08032161 0.79005156 0.60823794 0.43907623
## 
## [[2]]
## [1] 0.2121942 0.7209280 0.3716162 0.7708587
## 
## [[3]]
## [1] 0.7943670 0.3145729 0.4271521 0.3151103
ws
## [[1]]
## [1] 2.6430216 0.2654494 2.0656753 2.0416840
## 
## [[2]]
## [1] 1.378534 1.673015 1.873283 3.820249
## 
## [[3]]
## [1] 2.394193 1.196721 2.730337 1.249842
unlist(lapply(seq_along(xs), function(i){
  weighted.mean(xs[[i]], ws[[i]])
}))
## [1] 0.3670111 0.5877195 0.5069851
Map(weighted.mean,xs,ws)
## [[1]]
## [1] 0.3670111
## 
## [[2]]
## [1] 0.5877195
## 
## [[3]]
## [1] 0.5069851
Map(function(x,w) weighted.mean(x, w, na.rm=TRUE),xs, ws)
## [[1]]
## [1] 0.3670111
## 
## [[2]]
## [1] 0.5877195
## 
## [[3]]
## [1] 0.5069851
Map(weighted.mean,xs, ws, na.rm=TRUE)
## [[1]]
## [1] 0.3670111
## 
## [[2]]
## [1] 0.5877195
## 
## [[3]]
## [1] 0.5069851

approach 3

mapply(weighted.mean,xs,ws)
## [1] 0.3670111 0.5877195 0.5069851
mapply(function(x,y) weighted.mean(x,y), xs,ws)
## [1] 0.3670111 0.5877195 0.5069851

Other functional programming

Reduce

set.seed(32611)
l <- replicate(4, sample(1:10, 15, replace = T), simplify = FALSE)
str(l)
## List of 4
##  $ : int [1:15] 5 4 1 6 2 1 9 5 7 6 ...
##  $ : int [1:15] 6 7 8 6 10 3 4 9 2 10 ...
##  $ : int [1:15] 8 6 8 8 5 5 3 6 9 8 ...
##  $ : int [1:15] 5 6 7 2 5 9 4 10 3 8 ...
intersect(intersect(intersect(l[[1]], l[[2]]),
  l[[3]]), l[[4]])
## [1] 1 6 2 9 7 3
Reduce(intersect,l)
## [1] 1 6 2 9 7 3

Reduce 2

a <- c(1:10)
Reduce('+', a)
## [1] 55
b = as.list(letters[1:10])
Reduce("paste0", b)
## [1] "abcdefghij"
d <- as.list(letters[1:10])
Reduce("c",d)
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"

Filter

alist = list(a="a", b=1, c=FALSE, d=list(1:10), e="e", f = 2)
Filter(is.character, alist)
## $a
## [1] "a"
## 
## $e
## [1] "e"

Find

alist = list(a="a", b=1, c=FALSE, d=list(1:10), e="e", f = 2)
Find(is.character, alist)
## [1] "a"
Find(is.numeric, alist)
## [1] 1

Position

alist = list(a="a", b=1, c=FALSE, d=list(1:10), e="e", f = 2)
Position(is.character, alist)
## [1] 1
Position(is.numeric, alist)
## [1] 2

Outer

outer(1:3, 1:5, "*")
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    2    3    4    5
## [2,]    2    4    6    8   10
## [3,]    3    6    9   12   15
1:3 %o% 1:5
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    2    3    4    5
## [2,]    2    4    6    8   10
## [3,]    3    6    9   12   15

Outer 2

outer(1:3, 1:5, "paste")
##      [,1]  [,2]  [,3]  [,4]  [,5] 
## [1,] "1 1" "1 2" "1 3" "1 4" "1 5"
## [2,] "2 1" "2 2" "2 3" "2 4" "2 5"
## [3,] "3 1" "3 2" "3 3" "3 4" "3 5"
outer(1:3, 1:5, ">")
##       [,1]  [,2]  [,3]  [,4]  [,5]
## [1,] FALSE FALSE FALSE FALSE FALSE
## [2,]  TRUE FALSE FALSE FALSE FALSE
## [3,]  TRUE  TRUE FALSE FALSE FALSE
outer(1:3, 1:5, "^")
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    1    1    1    1
## [2,]    2    4    8   16   32
## [3,]    3    9   27   81  243

Vectorize a function

f <- function(x, y) c(x, y)
vf <- Vectorize(f, vectorize.args = c("x", "y"), SIMPLIFY = FALSE)
f(1:3, 1:3)
## [1] 1 2 3 1 2 3
vf(1:3, 1:3)
## [[1]]
## [1] 1 1
## 
## [[2]]
## [1] 2 2
## 
## [[3]]
## [1] 3 3
combn(4,2) ## all combinations of "choose 2 numbers from 1:4"
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    1    1    2    2    3
## [2,]    2    3    4    3    4    4
## However, you cannot do combn(c(4,5),c(2,3)), how to vectorize this function
combnV <- Vectorize(function(x, m) combn(x, m),
                    vectorize.args = c("x", "m"))

combnV(c(4,5),c(2,3))
## [[1]]
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    1    1    2    2    3
## [2,]    2    3    4    3    4    4
## 
## [[2]]
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]    1    1    1    1    1    1    2    2    2     3
## [2,]    2    2    2    3    3    4    3    3    4     4
## [3,]    3    4    5    4    5    5    4    5    5     5

Balance between efficiency and simplicity of your code