Introduction to Biostatistical Computing PHC 6937

Basics about R

Zhiguang Huo (Caleb)

Monday August 29, 2022

Outline

Data types

Data types (examples)

typeof(1.1)
## [1] "double"
typeof("a")
## [1] "character"
typeof(TRUE)
## [1] "logical"
typeof(4)
## [1] "double"
typeof(4L)
## [1] "integer"

logical

if(T){
  print(TRUE)
}
## [1] TRUE
typeof(FALSE)
## [1] "logical"
is.logical(T)
## [1] TRUE

integer

aseq <- 1:10 ## <- assigns a value
typeof(aseq)
## [1] "integer"
aint = 6L
is.integer(aint)
## [1] TRUE
is.integer(6)
## [1] FALSE

Assign values

a = 1.6; print(a) ## assign 1.6 to a 
## [1] 1.6
b <- 1.6; print(b) ## assign 1.6 to b
## [1] 1.6
1.6 -> d; print(d) ## assign 1.6 to d
## [1] 1.6

Difference between = and <-

double

typeof(1.4142)
## [1] "double"
is.double(pi)
## [1] TRUE
is.double(0L)
## [1] FALSE
is.double(0L + 1.5)
## [1] TRUE

character

acharacter <- "I like Biostatistical computing"
typeof(acharacter)
## [1] "character"
bcharacter <- 'You like Biostatistical computing'
is.character(bcharacter)
## [1] TRUE
ccharacter <- "He doesn't like Biostatistical computing"
print(ccharacter)
## [1] "He doesn't like Biostatistical computing"

1d Vector

dbl_var <- c(1, 23.1, 4.2)
int_var <- c(1L, 11L, 6L)
log_var <- c(TRUE, FALSE, T, F)
chr_var <- c("I", "like Biostatistical computing")

Commonly used vector functions

Functions Meaning
length(x) Number of elements in x
unique(x) Unique elements of x
sort(x) Sort the elements of x
rev(x) Reverse the order of x
names(x) Name the elements of x
which(x) Indices of x that are TRUE
which.max(x) Index of the maximum element of x
which.min(x) Index of the minimum element of x
append(x, values) Insert (append) values into the vector x
match(x, table) Index of the first match in table for each element of x
union(x, y) Union of x and y
intersect(x, y) Intersection of x and y
setdiff(x, y) Elements of x that are not in y
setequal(x, y) Do x and y contain the same elements

Example of Vector Functions

x <- c(1,6,9,2,2,5,10)
length(x)
unique(x)
length(unique(x))
sort(x)
rev(x)
which(x==2)
which.max(x)
which.min(x)
append(x,0)
match(9,x)
y <- c(1,2,3)
union(x,y)
intersect(x,y)
setdiff(x,y)
setequal(x,y)

Statistical Vector Functions

Functions Meaning
sum(x) Sum of x
prod(x) Product of x
cumsum(x) Cumulative sum of x
cumprod(x) Cumulative product of x
min(x) Minimum element of x
max(x) Maximum element of x
pmin(x, y) Pairwise minimum of x and y
pmax(x, y) Pairwise maximum of x and y
mean(x) Mean of x
median(x) Median of x
var(x) Variance of x
sd(x) Standard deviation of x
cov(x, y) Covariance of x and y
cor(x, y) Correlation of x and y
range(x) Range of x
quantile(x) Quantiles of x for given probabilities
summary(x) Numerical summary of x

Example of Statistical Vector Functions

avec <- c(5,2,9,3)
max(avec)
## [1] 9
which.max(avec)
## [1] 3
mean(avec)
## [1] 4.75
range(avec)
## [1] 2 9

Coercion

typeof(c("a", 1))
## [1] "character"
x <- c(FALSE, FALSE, TRUE)
as.numeric(x)
## [1] 0 0 1
as.character(x)
## [1] "FALSE" "FALSE" "TRUE"
typeof(c(1.2,1L))
## [1] "double"

How to get help

Missing value

typeof(NA)
## [1] "logical"
typeof(NA_integer_)
## [1] "integer"
typeof(NA_real_)
## [1] "double"
typeof(NA_character_)
## [1] "character"

list

x <- list(1:3, "a", c(TRUE, FALSE, TRUE), list(2.3, 5.9))
str(x)
## List of 4
##  $ : int [1:3] 1 2 3
##  $ : chr "a"
##  $ : logi [1:3] TRUE FALSE TRUE
##  $ :List of 2
##   ..$ : num 2.3
##   ..$ : num 5.9
x[[1]]
## [1] 1 2 3

list

x <- list(p1 = 1:3, p2 = "a", p3 = c(TRUE, FALSE, TRUE), p4 = list(2.3, 5.9))
str(x)
## List of 4
##  $ p1: int [1:3] 1 2 3
##  $ p2: chr "a"
##  $ p3: logi [1:3] TRUE FALSE TRUE
##  $ p4:List of 2
##   ..$ : num 2.3
##   ..$ : num 5.9
x[[1]]
## [1] 1 2 3
x$p1
## [1] 1 2 3

Data structure

Homogeneous Heterogeneous
1d Atomic vector List
2d Matrix Data frame
nd Array

Attributes

y <- 1:10
attr(y, "my_attribute") <- "This is a vector"
attr(y, "my_attribute")
## [1] "This is a vector"
attributes(y)
## $my_attribute
## [1] "This is a vector"

Attributes 2

y <- c(a=1,2:10)
names(y)
##  [1] "a" ""  ""  ""  ""  ""  ""  ""  ""  ""
names(y)[2] <- 'b'
dim(y)
## NULL
dim(y) <- c(2,5)
print(y)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
class(y)
## [1] "matrix" "array"

factor

x <- factor(c("a", "b", "b", 'a'))
x
## [1] a b b a
## Levels: a b
class(x)
## [1] "factor"
levels(x)
## [1] "a" "b"

factors 2

Factors are very useful when some classes (levels) have no observations in the data

sex_char <- c("m", "m", "m")
sex_factor <- factor(sex_char, levels=c("m", "f"))
table(sex_char)
## sex_char
## m 
## 3
table(sex_factor)
## sex_factor
## m f 
## 3 0

Matrices

a <- matrix(1:6, ncol=3, nrow=2, dimnames = list(c("row1", "row2"),
                               c("C.1", "C.2", "C.3")))
a
##      C.1 C.2 C.3
## row1   1   3   5
## row2   2   4   6
colnames(a)
## [1] "C.1" "C.2" "C.3"
rownames(a)
## [1] "row1" "row2"
ncol(a)
## [1] 3
nrow(a)
## [1] 2

Matrices

c <- 1:6
dim(c) <- c(3,2)
c
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
dim(c) <- c(2,3)
c
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
rownames(c) <- c("row1", "row2")
colnames(c) <- c("C.1", "C.2", "C.3")
c
##      C.1 C.2 C.3
## row1   1   3   5
## row2   2   4   6

Data frames

df <- data.frame(x=1:3, y=c("a","b","c"),z=0)
str(df)
## 'data.frame':    3 obs. of  3 variables:
##  $ x: int  1 2 3
##  $ y: chr  "a" "b" "c"
##  $ z: num  0 0 0
cat(names(df), "same as", colnames(df))
## x y z same as x y z
cat(length(df), "same as", ncol(df))
## 3 same as 3

String manipulation

sentenses <- "R is a great statistical software.\n\nWe use R in the Biostatistical computing class!"
sentenses
## [1] "R is a great statistical software.\n\nWe use R in the Biostatistical computing class!"
cat(sentenses)
## R is a great statistical software.
## 
## We use R in the Biostatistical computing class!

Convert to upper case or lower case

achar <- "this is a dog."
print(achar)
## [1] "this is a dog."
print(toupper(achar))
## [1] "THIS IS A DOG."
print(tolower("WWW.UFL.EDU"))
## [1] "www.ufl.edu"

Length of a string

achar <- "this is a dog."
nchar(achar)
## [1] 14
length(achar)
## [1] 1

Vectorized nchar

chars <- c("a dog", "a cat", "a gator")
nchar(chars)
## [1] 5 5 7
length(chars)
## [1] 3

Obtaining a substring

chars <- "this is a dog"
substring(chars,1,1)
## [1] "t"
substring(chars,11,13)
## [1] "dog"
substring(chars,11,13) <- "cat"
print(chars)
## [1] "this is a cat"

strsplit

strsplit("this is a dog", split=" ")
## [[1]]
## [1] "this" "is"   "a"    "dog"
strsplit("this is a dog", split="")
## [[1]]
##  [1] "t" "h" "i" "s" " " "i" "s" " " "a" " " "d" "o" "g"
strsplit(c("this is a dog", "this is a cat", "this is a gator"), split=" ")
## [[1]]
## [1] "this" "is"   "a"    "dog" 
## 
## [[2]]
## [1] "this" "is"   "a"    "cat" 
## 
## [[3]]
## [1] "this"  "is"    "a"     "gator"

paste

paste('this','is','a','dog', sep=" ")
## [1] "this is a dog"
paste0('this','is','a','dog')
## [1] "thisisadog"
avec <- c('this','is','a','dog')
nchar(avec)
## [1] 4 2 1 3
paste(c('this','is','a','dog'), collapse = " ")
## [1] "this is a dog"

Substitution

achar <- "this is a dog"
gsub(pattern = "dog",replacement="cat",x=achar) ## pattern, replacement, x
## [1] "this is a cat"
gsub("dog","cat",achar) ## pattern, replacement, x
## [1] "this is a cat"
chars <- c("this is a dog", "this is a cat", "this is a gator")
gsub("this","that",chars) ## pattern, replacement, x
## [1] "that is a dog"   "that is a cat"   "that is a gator"

Regular expression

chars <- c("this is a dog", "this is a cat", "this is a gator")
grep("gator", chars)
## [1] 3
grep("this", chars)
## [1] 1 2 3
grep("gator", chars, value = T)
## [1] "this is a gator"
grep("gator", chars, invert = T)
## [1] 1 2

Regular expression 2

chars <- c("this is a dog", "this is a cat", "this is a gator")
grep("dog|cat", chars)
## [1] 1 2

Regular expression 3

chars <- c("abc", "ab", "bc")
grep("b", chars)
## [1] 1 2 3
grep("^b", chars)
## [1] 3
chars <- c("abc", "ab", "bc")
grep("ab", chars)
## [1] 1 2
grep("ab$", chars)
## [1] 2

Metacharacters

chars <- c("this is a dog", "this is a cat", "this is a gator")
grep("[bced]", chars)
## [1] 1 2
chars <- c("this is a dog", "this is a cat", "this is a gator")
grep("[b-d]", chars)
## [1] 1 2
grep("[0-9]", chars)
## integer(0)

Metacharacters 2

Substitution using metacharacters

chars <- c("this is a dog", "this is a cat", "this is a   gator")
gsub(pattern = "[aeiou]",replacement="Z",x=chars) ## pattern is vowel, replacement is Z
## [1] "thZs Zs Z dZg"     "thZs Zs Z cZt"     "thZs Zs Z   gZtZr"
gsub("[0-9]","#","a1b2") ## pattern, replacement, x
## [1] "a#b#"

Loop

for(i in 1:10){
  cat(i," ")
}
## 1  2  3  4  5  6  7  8  9  10
i <- 1
while(i <= 10){
  cat(i," ")
  i <- i + 1
}
## 1  2  3  4  5  6  7  8  9  10

Loop 2

for(i in 1:10){
  if(i%%2==0){
    next
  }
  cat(i," ")
}
## 1  3  5  7  9
for(i in seq_len(10)){
  if(i%%2==0){
    next
  }
  cat(i," ")
}
## 1  3  5  7  9
for(i in 1:10){
  if(i%%2==0){
    break
  }
  cat(i," ")
}
## 1

Loop 3

x <- list(1:3, "a", c(TRUE, FALSE, TRUE), list(2.3, 5.9))
for(i in 1:length(x)){ ## 1:length(x) is not recommended
  ax <- x[[i]]
  print(ax)
}
## [1] 1 2 3
## [1] "a"
## [1]  TRUE FALSE  TRUE
## [[1]]
## [1] 2.3
## 
## [[2]]
## [1] 5.9
for(i in seq_along(x)){ ## seq_along(x) gives the same indices as 1:length(x), but is also safe when x is empty
  ax <- x[[i]]
  print(ax)
}
## [1] 1 2 3
## [1] "a"
## [1]  TRUE FALSE  TRUE
## [[1]]
## [1] 2.3
## 
## [[2]]
## [1] 5.9
for(ax in x){ ## direct sub-element
  print(ax)
}
## [1] 1 2 3
## [1] "a"
## [1]  TRUE FALSE  TRUE
## [[1]]
## [1] 2.3
## 
## [[2]]
## [1] 5.9

Subsetting – Atomic vectors 1

example: x <- c(2.1, 4.2, 3.3, 5.4). How can we obtain a subset of this vector?

### subsetting
## atomic vectors
x <- c(2.1, 4.2, 3.3, 5.4)

# Positive integer
x[c(3,1)]
## [1] 3.3 2.1
x[order(x)] ## equivalent to sort(x)
## [1] 2.1 3.3 4.2 5.4
x[c(1,1,1)]
## [1] 2.1 2.1 2.1

Subsetting – Atomic vectors 2

example: x <- c(2.1, 4.2, 3.3, 5.4). How can we obtain a subset of this vector?

# negative integer
x[-c(1, 3)]
## [1] 4.2 5.4
x[-grep(4.2, x)]
## [1] 2.1 3.3 5.4

Subsetting – Atomic vectors 3

example: x <- c(2.1, 4.2, 3.3, 5.4). How can we obtain a subset of this vector?

x[c(TRUE, TRUE, FALSE, FALSE)]
## [1] 2.1 4.2
x > 3
## [1] FALSE  TRUE  TRUE  TRUE
x[x > 3]
## [1] 4.2 3.3 5.4
x[c(TRUE, TRUE, NA, FALSE)]
## [1] 2.1 4.2  NA

Subsetting – Atomic vectors 4

example: x <- c(2.1, 4.2, 3.3, 5.4). How can we obtain a subset of this vector?

x[]
## [1] 2.1 4.2 3.3 5.4
x[0]
## numeric(0)

Subsetting – Atomic vectors 5

example: x <- c(2.1, 4.2, 3.3, 5.4). How can we obtain a subset of this vector?

x <- c(a=2.1, b=4.2, c=3.3, d=5.4)

x["a"]
##   a 
## 2.1
x[letters[2:3]]
##   b   c 
## 4.2 3.3

Subsetting – Matrices and arrays

a <- matrix(1:9, nrow=3)
colnames(a) <- c("A","B","C")
a
##      A B C
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
a[1:2,]
##      A B C
## [1,] 1 4 7
## [2,] 2 5 8
a[c(T,F,T), c("B","A")]
##      B A
## [1,] 4 1
## [2,] 6 3
a[,-2]
##      A C
## [1,] 1 7
## [2,] 2 8
## [3,] 3 9

Subsetting – Data frame

df <- data.frame(x=1:2, y=2:1, z=letters[1:2])
df[df$x==2,]
##   x y z
## 2 2 1 b
df[c("x","z")] # like a list
##   x z
## 1 1 a
## 2 2 b
df[,c("x","z")] # like a matrix
##   x z
## 1 1 a
## 2 2 b

Subsetting – simplifying vs preserving

Functions simplifying preserving
List x[[1]] x[1]
Vector x[[1]] x[1]
Factor x[1:2, drop=T] x[1:2]
Data frame x[,1] or x[[1]] x[, 1, drop=F] or x[1]

Subsetting – simplifying vs preserving

x <- list(a=111, b=222)


x[[1]] ## simplified, result is numeric
## [1] 111
typeof(x[[1]])
## [1] "double"
x[1] ## preserved, result is a list
## $a
## [1] 111
typeof(x[1])
## [1] "list"

Matching

grades <- c(1,2,2,3,1)
info <- data.frame(grade=3:1, desc=c("Excellent", "Good", "Poor"), fail=c(F,F,T))
id <- match(grades, info$grade)
id
## [1] 3 2 2 1 3
info[id,]
##     grade      desc  fail
## 3       1      Poor  TRUE
## 2       2      Good FALSE
## 2.1     2      Good FALSE
## 1       3 Excellent FALSE
## 3.1     1      Poor  TRUE

File I/O

Read in txt/csv files

setwd("~/Desktop") ## set your working directory
burnData <- read.csv("burn.csv", row.names = 1)
burnData <- read.csv("https://caleb-huo.github.io/teaching/data/Burn/burn.csv", row.names = 1)
burnData <- read.csv("burn.csv", row.names = 1, nrows = 2)
burnData <- read.csv("burn.csv", row.names = 1, skip = 3)

Delimiter

read.csv()
read.table()
read.delim(, sep=";")

Also pay attention to the arguments such as header, row.names

Save txt/csv files

write.csv(burnData, file = "myBurnData.csv")
write.table(burnData, file = "myBurnData.txt")
write.table(burnData, file = "myBurnData.txt", append = TRUE)

Read in line by line

fileNameFull <- 'https://caleb-huo.github.io/teaching/data/Burn/burn.csv'
con  <- file(fileNameFull, open = "r")

while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
    aline = strsplit(oneLine, ",")[[1]]
    print(aline)
} 
close(con) ## remember to close files

Save R objects part 1 (save)

If your result takes a long time to compute, how can you save it so that you do not have to re-run the computation in the future?

a <- 1:4
b <- 2:5
ans <- a * b
result <- list(a=a, b=b, ans=ans)
save(result,file="myResult.rdata")
load("myResult.rdata")
result2 <- get(load("myResult.rdata"))

Save R objects part 2 (saveRDS)

If your result takes a long time to compute, how can you save it so that you do not have to re-run the computation in the future?

a <- 1:4
b <- 2:5
ans <- a * b
result <- list(a=a, b=b, ans=ans)
saveRDS(result,file="myResult.rdata")
result2 <- readRDS("myResult.rdata")

Date

Dates are represented as the number of days since 1970-01-01, with negative values for earlier dates.

mydates <- as.Date(c("2017-09-11", "2012-12-17", "1970-01-01"))
# number of days in between 
days <- mydates[1] - mydates[2]
days
## Time difference of 1729 days
as.numeric(mydates)
## [1] 17420 15691     0
Sys.Date( ) ## returns today's date.
## [1] "2022-08-29"
date() ## returns the current date and time.
## [1] "Mon Aug 29 10:46:08 2022"

Date format

Symbol Meaning Example
%d day as a number (01-31) 01-31
%a abbreviated weekday Fri
%A unabbreviated weekday Friday
%m month as a number (01-12) 01-12
%b abbreviated month Oct
%B unabbreviated month October
%y 2-digit year 22
%Y 4-digit year 2022
today <- Sys.Date()
format(today, format="%B %d %Y")
## [1] "August 29 2022"

Date Conversion

# convert date info in format 'mm/dd/yyyy'
strDates <- c("01/05/1995", "08/16/1995")
dates <- as.Date(strDates, "%m/%d/%Y")
as.Date(c("2007-06-22", "2004-02-13"))
## [1] "2007-06-22" "2004-02-13"
as.character(dates)
## [1] "1995-01-05" "1995-08-16"
as.numeric(dates)
## [1] 9135 9358

References