load these packages individually

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(stringr)
library(ggplot2)

alternatively, you can do library(tidyverse) to include all of them

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ purrr     1.0.1
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Read in data (readr)

data source: sleepstudy.csv (also available in R lme4 package)
Original way to read in data

## URL of the sleepstudy CSV (scheme is followed by exactly two slashes)
asleepfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.csv"
## base R reader: the result is a plain data.frame
data0 <- read.csv(asleepfile)

Use read_csv
- Much faster than read.csv, especially for large datasets

data1 <- read_csv(asleepfile)

## Rows: 180 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Reaction, Days, Subject
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect the data

read.csv
- You may want to use head, otherwise it will print out everything

head(data0)

##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308
## 4 321.4398    3     308
## 5 356.8519    4     308
## 6 414.6901    5     308

class(data0)

## [1] "data.frame"

read_csv
- smartly print out the first few rows

data1

## # A tibble: 180 × 3
##    Reaction  Days Subject
##       <dbl> <dbl>   <dbl>
##  1     250.     0     308
##  2     259.     1     308
##  3     251.     2     308
##  4     321.     3     308
##  5     357.     4     308
##  6     415.     5     308
##  7     382.     6     308
##  8     290.     7     308
##  9     431.     8     308
## 10     466.     9     308
## # ℹ 170 more rows

class(data1)

## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

read_delim

Similar to read_csv, we can use read_delim, which is more general
- read_csv assumes the delimiter is ","
- for read_delim, you need to specify the delimiter
  - ",": comma delimited (usually for .csv)
  - "\t": tab delimited (usually for .txt)
  - " ": space delimited

data2 <- read_delim("sleepstudy.csv", delim=",")

## Rows: 180 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Reaction, Days, Subject
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Read excel

readxl package
readxl is installed with the tidyverse, but it is not a core package: load it explicitly with library(readxl)
read_excel() reads from a local path, so download the dataset first

library(readxl)
## bsleepfile <- "https:///Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.xlsx"
data0 <- read_excel("sleepstudy.xlsx")
data0

xlsx package

library(xlsx)
## bsleepfile <- "https:///Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.xlsx"
data0 <- read.xlsx("sleepstudy.xlsx", sheetIndex = 1)
data0

Write excel

library(xlsx)
## bsleepfile <- "https:///Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.xlsx"
data_iris <- iris
data_cars <- cars
write.xlsx(data_iris, file = "mydata.xlsx", sheetName="iris")
write.xlsx(data_cars, file = "mydata.xlsx", sheetName="cars", append = TRUE)

Read SAS, SPSS, and Stata files.

haven package
haven is installed with the tidyverse, but it is not a core package: load it explicitly with library(haven)

library(haven)
## URL of the SAS data file (scheme is followed by exactly two slashes)
salesfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sales.sas7bdat"
## read_sas() returns a tibble
data0 <- read_sas(salesfile)
data0

## # A tibble: 36 × 3
##     YEAR     P     S
##    <dbl> <dbl> <dbl>
##  1  1950  12.9  182.
##  2  1951  11.9  245 
##  3  1952  10.7  250.
##  4  1953  11.3  266.
##  5  1954  11.2  248.
##  6  1955  15.1  278.
##  7  1956  16.2  307.
##  8  1957  15.4  320 
##  9  1958  12.7  305.
## 10  1959  16.3  338 
## # ℹ 26 more rows

Inspection on data1

data1_sub <- data1[1:3,]
data1_sub$Reaction

## [1] 249.5600 258.7047 250.8006

as.matrix(data1_sub)

##      Reaction Days Subject
## [1,] 249.5600    0     308
## [2,] 258.7047    1     308
## [3,] 250.8006    2     308

as.data.frame(data1_sub)

##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308

dplyr

dplyr provides a set of efficient tools for data manipulation in R
The key components of dplyr were written in Rcpp, which is very fast and efficient.
naming convention
- precursor of dplyr was plyr
- plyr comes from various “apply” functions in R
- d is for dataframe

dplyr

%>%: pipe
select(): select columns
filter(): filter rows by logical variable
pull(): obtain a specific column
slice(): subset rows by index
arrange(): re-order or arrange rows
mutate(): create new columns
mutate_at(): apply a function to selected columns, replacing them in place (superseded by mutate(across()))
rename(): rename
summarise(): summarise values
group_by(): allows for group operations in the “split-apply-combine” concept
merge data.frame
- inner_join()
- left_join()
- right_join()
- full_join()
- anti_join()

select

select(data1, Days, Subject)

## # A tibble: 180 × 2
##     Days Subject
##    <dbl>   <dbl>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ℹ 170 more rows

Use pipe

data1 %>% select(Days, Subject)

## # A tibble: 180 × 2
##     Days Subject
##    <dbl>   <dbl>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ℹ 170 more rows

pipe

%>%
data1 %>% select(Days, Subject)
- the result before %>% is piped into the first argument of the function after %>%
data1 %>% select(., Days, Subject)
shortcut:
- cmd + shift + M: MAC
- ctrl + shift + M: Windows

exp(1)

## [1] 2.718282

1 %>% exp()

## [1] 2.718282

1 %>% exp() %>% log ## () can be omitted if the data is the only argument

## [1] 1

Do not select

data1 %>% select(-Reaction)

## # A tibble: 180 × 2
##     Days Subject
##    <dbl>   <dbl>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ℹ 170 more rows

filter

data1 %>% 
  select(Days, Subject) %>%
  filter(Subject == 308)

## # A tibble: 10 × 2
##     Days Subject
##    <dbl>   <dbl>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308

data1 %>% 
  filter(Reaction >= 300) %>%
  select(Days, Subject) %>%
  filter(Subject == 308)

## # A tibble: 6 × 2
##    Days Subject
##   <dbl>   <dbl>
## 1     3     308
## 2     4     308
## 3     5     308
## 4     6     308
## 5     8     308
## 6     9     308

multiple filtering criteria

data1 %>% 
  filter(Reaction >= 300, Subject == 308)

## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     321.     3     308
## 2     357.     4     308
## 3     415.     5     308
## 4     382.     6     308
## 5     431.     8     308
## 6     466.     9     308

data1 %>% 
  filter(Reaction >= 300 & Subject == 308)

## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     321.     3     308
## 2     357.     4     308
## 3     415.     5     308
## 4     382.     6     308
## 5     431.     8     308
## 6     466.     9     308

pull

(data1 %>% 
  filter(Reaction >= 300, Subject == 308))$Reaction

## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535

data1 %>% 
  filter(Reaction >= 300, Subject == 308) %>%
  pull(Reaction)

## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535

slice

data1 %>% 
  slice(1:8)

## # A tibble: 8 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     250.     0     308
## 2     259.     1     308
## 3     251.     2     308
## 4     321.     3     308
## 5     357.     4     308
## 6     415.     5     308
## 7     382.     6     308
## 8     290.     7     308

data1 %>% 
  head(n=8)

## # A tibble: 8 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     250.     0     308
## 2     259.     1     308
## 3     251.     2     308
## 4     321.     3     308
## 5     357.     4     308
## 6     415.     5     308
## 7     382.     6     308
## 8     290.     7     308

which(data1$Reaction >= 300 & data1$Subject == 308)

## [1]  4  5  6  7  9 10

with(data1, which(Reaction >= 300 & Subject == 308)) ## works for data.frame

## [1]  4  5  6  7  9 10

data1 %>% 
  slice(which(Reaction >= 300 & Subject == 308))

## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     321.     3     308
## 2     357.     4     308
## 3     415.     5     308
## 4     382.     6     308
## 5     431.     8     308
## 6     466.     9     308

arrange

arrange() is similar to sort() and order()
ascending order by default

data1 %>% arrange(Reaction) %>% head

## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     194.     1     310
## 2     199.     0     310
## 3     203.     2     309
## 4     205.     3     309
## 5     205.     1     309
## 6     208.     4     309

data1 %>% 
  arrange(Days, Reaction) %>% 
  head

## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     199.     0     310
## 2     222.     0     352
## 3     223.     0     309
## 4     225.     0     370
## 5     235.     0     332
## 6     236.     0     349

data1 %>% 
  arrange(Days, Reaction) %>% 
  head %>%
  colSums ## pipe also work for other functions

## Reaction     Days  Subject 
## 1339.693    0.000 2022.000

data1 %>% 
  arrange(desc(Days), Reaction) %>% 
  head

## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     237.     9     335
## 2     237.     9     309
## 3     248.     9     310
## 4     254.     9     332
## 5     348.     9     351
## 6     352.     9     349

mutate

data1 %>% 
  mutate(Reaction_binary = Reaction<250) %>%
  head

## # A tibble: 6 × 4
##   Reaction  Days Subject Reaction_binary
##      <dbl> <dbl>   <dbl> <lgl>          
## 1     250.     0     308 TRUE           
## 2     259.     1     308 FALSE          
## 3     251.     2     308 FALSE          
## 4     321.     3     308 FALSE          
## 5     357.     4     308 FALSE          
## 6     415.     5     308 FALSE

data1 %>% 
  mutate(Reaction_binary = Reaction<250,
        Reaction_sec = Reaction/1000) %>% 
  head

## # A tibble: 6 × 5
##   Reaction  Days Subject Reaction_binary Reaction_sec
##      <dbl> <dbl>   <dbl> <lgl>                  <dbl>
## 1     250.     0     308 TRUE                   0.250
## 2     259.     1     308 FALSE                  0.259
## 3     251.     2     308 FALSE                  0.251
## 4     321.     3     308 FALSE                  0.321
## 5     357.     4     308 FALSE                  0.357
## 6     415.     5     308 FALSE                  0.415

mutate at (apply a function to one or several columns)

data1 %>% 
  head

## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     250.     0     308
## 2     259.     1     308
## 3     251.     2     308
## 4     321.     3     308
## 5     357.     4     308
## 6     415.     5     308

## Apply log() to the Reaction and Subject columns, replacing them in place.
## mutate(across()) supersedes the scoped verb mutate_at(); all_of() selects
## the columns named in the character vector.
data1 %>% 
  head %>%
  mutate(across(all_of(c("Reaction", "Subject")), log))

## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     5.52     0    5.73
## 2     5.56     1    5.73
## 3     5.52     2    5.73
## 4     5.77     3    5.73
## 5     5.88     4    5.73
## 6     6.03     5    5.73

rename

data1 %>% 
  rename(ID = Subject)

## # A tibble: 180 × 3
##    Reaction  Days    ID
##       <dbl> <dbl> <dbl>
##  1     250.     0   308
##  2     259.     1   308
##  3     251.     2   308
##  4     321.     3   308
##  5     357.     4   308
##  6     415.     5   308
##  7     382.     6   308
##  8     290.     7   308
##  9     431.     8   308
## 10     466.     9   308
## # ℹ 170 more rows

summarise

data1 %>% 
    summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())

## # A tibble: 1 × 4
##   avg_reaction min_reaction max_reaction total
##          <dbl>        <dbl>        <dbl> <int>
## 1         299.         194.         466.   180

adata <- data1 %>% 
    summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())

group_by

seems to be more powerful than tapply

tt <- data1 %>% 
      group_by(Subject) %>%
      summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())
tt %>% head

## # A tibble: 6 × 5
##   Subject avg_reaction min_reaction max_reaction total
##     <dbl>        <dbl>        <dbl>        <dbl> <int>
## 1     308         342.         250.         466.    10
## 2     309         215.         203.         237.    10
## 3     310         231.         194.         261.    10
## 4     330         303.         280.         354.    10
## 5     331         309.         285          372.    10
## 6     332         307.         235.         454.    10

select (2)

Select a range

tt %>% 
  head %>% 
  select(avg_reaction:max_reaction)

## # A tibble: 6 × 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.

contains

tt %>% 
  head %>% 
  select(contains("reaction"))

## # A tibble: 6 × 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.

more options for select()

starts_with() = Select columns that start with a character string
ends_with() = Select columns that end with a character string
contains() = Select columns that contain a character string
matches() = Select columns that match a regular expression
one_of() = Select columns names that are from a group of names

tt %>% 
  head %>% 
  select(ends_with("reaction"))

## # A tibble: 6 × 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.

tt %>% 
  head %>% 
  select(starts_with("m"))

## # A tibble: 6 × 2
##   min_reaction max_reaction
##          <dbl>        <dbl>
## 1         250.         466.
## 2         203.         237.
## 3         194.         261.
## 4         280.         354.
## 5         285          372.
## 6         235.         454.

Select by variables that are contained in a character vector

all_of() is for strict selection.

avar <- c("Reaction", "Days")
data1 %>% select(all_of(avar))

## # A tibble: 180 × 2
##    Reaction  Days
##       <dbl> <dbl>
##  1     250.     0
##  2     259.     1
##  3     251.     2
##  4     321.     3
##  5     357.     4
##  6     415.     5
##  7     382.     6
##  8     290.     7
##  9     431.     8
## 10     466.     9
## # ℹ 170 more rows

any_of() doesn’t check for missing variables.

bvar <- c("Reaction", "Days", "Months")
data1 %>% select(any_of(bvar))

## # A tibble: 180 × 2
##    Reaction  Days
##       <dbl> <dbl>
##  1     250.     0
##  2     259.     1
##  3     251.     2
##  4     321.     3
##  5     357.     4
##  6     415.     5
##  7     382.     6
##  8     290.     7
##  9     431.     8
## 10     466.     9
## # ℹ 170 more rows

create a tibble

tibble is similar to a dataframe, but tibble is better designed

atibble <- tibble(A = 4:6, B = c("A", "B", "C"))
atibble

## # A tibble: 3 × 2
##       A B    
##   <int> <chr>
## 1     4 A    
## 2     5 B    
## 3     6 C

adataframe <- data.frame(A = 4:6, B = c("A", "B", "C"))
as_tibble(adataframe)

## # A tibble: 3 × 2
##       A B    
##   <int> <chr>
## 1     4 A    
## 2     5 B    
## 3     6 C

add an index column

btibble <- atibble %>% rowid_to_column()
btibble

## # A tibble: 3 × 3
##   rowid     A B    
##   <int> <int> <chr>
## 1     1     4 A    
## 2     2     5 B    
## 3     3     6 C

the data for merge

superheroes <- "
    name, alignment, gender,         publisher
 Magneto,       bad,   male,            Marvel
   Storm,      good, female,            Marvel
Mystique,       bad, female,            Marvel
  Batman,      good,   male,                DC
   Joker,       bad,   male,                DC
Catwoman,       bad, female,                DC
 Hellboy,      good,   male, Dark Horse Comics
"
superheroes <- read_csv(superheroes, skip = 1)

## Rows: 7 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, alignment, gender, publisher
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

publishers <- "
  publisher, yr_founded
         DC,       1934
     Marvel,       1939
      Image,       1992
"
publishers <- read_csv(publishers, skip = 1)

## Rows: 3 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): publisher
## dbl (1): yr_founded
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

inner_join

inner_join(x, y): Return all rows from x where there are matching values in y, and all columns from x and y.
If there are multiple matches between x and y, all combinations of the matches are returned.

inner_join(superheroes, publishers)

## Joining with `by = join_by(publisher)`

## # A tibble: 6 × 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <dbl>
## 1 Magneto  bad       male   Marvel          1939
## 2 Storm    good      female Marvel          1939
## 3 Mystique bad       female Marvel          1939
## 4 Batman   good      male   DC              1934
## 5 Joker    bad       male   DC              1934
## 6 Catwoman bad       female DC              1934

inner_join(publishers, superheroes)

## Joining with `by = join_by(publisher)`

## # A tibble: 6 × 5
##   publisher yr_founded name     alignment gender
##   <chr>          <dbl> <chr>    <chr>     <chr> 
## 1 DC              1934 Batman   good      male  
## 2 DC              1934 Joker    bad       male  
## 3 DC              1934 Catwoman bad       female
## 4 Marvel          1939 Magneto  bad       male  
## 5 Marvel          1939 Storm    good      female
## 6 Marvel          1939 Mystique bad       female

left_join

left_join(x, y): Return all rows from x, and all columns from x and y.
If there are multiple matches between x and y, all combinations of the matches are returned.

left_join(superheroes, publishers)

## Joining with `by = join_by(publisher)`

## # A tibble: 7 × 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <dbl>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA

left_join(publishers, superheroes)

## Joining with `by = join_by(publisher)`

## # A tibble: 7 × 5
##   publisher yr_founded name     alignment gender
##   <chr>          <dbl> <chr>    <chr>     <chr> 
## 1 DC              1934 Batman   good      male  
## 2 DC              1934 Joker    bad       male  
## 3 DC              1934 Catwoman bad       female
## 4 Marvel          1939 Magneto  bad       male  
## 5 Marvel          1939 Storm    good      female
## 6 Marvel          1939 Mystique bad       female
## 7 Image           1992 <NA>     <NA>      <NA>

right_join

right_join(x, y): Return all rows from y, and all columns from x and y.
If there are multiple matches between x and y, all combinations of the matches are returned.

right_join(superheroes, publishers)

## Joining with `by = join_by(publisher)`

## # A tibble: 7 × 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <dbl>
## 1 Magneto  bad       male   Marvel          1939
## 2 Storm    good      female Marvel          1939
## 3 Mystique bad       female Marvel          1939
## 4 Batman   good      male   DC              1934
## 5 Joker    bad       male   DC              1934
## 6 Catwoman bad       female DC              1934
## 7 <NA>     <NA>      <NA>   Image           1992

right_join(publishers, superheroes)

## Joining with `by = join_by(publisher)`

## # A tibble: 7 × 5
##   publisher         yr_founded name     alignment gender
##   <chr>                  <dbl> <chr>    <chr>     <chr> 
## 1 DC                      1934 Batman   good      male  
## 2 DC                      1934 Joker    bad       male  
## 3 DC                      1934 Catwoman bad       female
## 4 Marvel                  1939 Magneto  bad       male  
## 5 Marvel                  1939 Storm    good      female
## 6 Marvel                  1939 Mystique bad       female
## 7 Dark Horse Comics         NA Hellboy  good      male

anti_join

anti_join(x, y): Return all rows from x where there are not matching values in y, keeping just columns from x.

anti_join(superheroes, publishers)

## Joining with `by = join_by(publisher)`

## # A tibble: 1 × 4
##   name    alignment gender publisher        
##   <chr>   <chr>     <chr>  <chr>            
## 1 Hellboy good      male   Dark Horse Comics

anti_join(publishers, superheroes)

## Joining with `by = join_by(publisher)`

## # A tibble: 1 × 2
##   publisher yr_founded
##   <chr>          <dbl>
## 1 Image           1992

full_join

full_join(x, y): Return all rows and all columns from both x and y.
Where there are not matching values, returns NA for the one missing.

full_join(superheroes, publishers)

## Joining with `by = join_by(publisher)`

## # A tibble: 8 × 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <dbl>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA
## 8 <NA>     <NA>      <NA>   Image                   1992

full_join(publishers, superheroes)

## Joining with `by = join_by(publisher)`

## # A tibble: 8 × 5
##   publisher         yr_founded name     alignment gender
##   <chr>                  <dbl> <chr>    <chr>     <chr> 
## 1 DC                      1934 Batman   good      male  
## 2 DC                      1934 Joker    bad       male  
## 3 DC                      1934 Catwoman bad       female
## 4 Marvel                  1939 Magneto  bad       male  
## 5 Marvel                  1939 Storm    good      female
## 6 Marvel                  1939 Mystique bad       female
## 7 Image                   1992 <NA>     <NA>      <NA>  
## 8 Dark Horse Comics         NA Hellboy  good      male

tidyr

gather
separate
spread
unite

spread

Function:
- spread(data, key, value, fill = NA)
Same as:
- data %>% spread(key, value, fill = NA)
Arguments:
- data: data frame
- key: column whose values become the names of the new columns
- value: column whose values fill the new columns
- fill: If there isn’t a value for every combination of the other variables and the key column, this value will be substituted

spread example

## Long -> wide: each distinct Days value becomes its own column,
## filled with the matching Reaction value (one row per Subject).
## NOTE: spread() is superseded in tidyr; pivot_wider() is the recommended replacement.
data1_wide <- data1 %>% spread(Days, Reaction)
head(data1_wide)

## # A tibble: 6 × 11
##   Subject   `0`   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`
##     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     308  250.  259.  251.  321.  357.  415.  382.  290.  431.  466.
## 2     309  223.  205.  203.  205.  208.  216.  214.  218.  224.  237.
## 3     310  199.  194.  234.  233.  229.  220.  235.  256.  261.  248.
## 4     330  322.  300.  284.  285.  286.  298.  280.  318.  305.  354.
## 5     331  288.  285   302.  320.  316.  293.  290.  335.  294.  372.
## 6     332  235.  243.  273.  310.  317.  310.  454.  347.  330.  254.

gather

Function:
- gather(data, key, value, …, na.rm = FALSE)
Same as:
- data %>% gather(key, value, …, na.rm = FALSE)
Arguments:
- data: data frame
- key: column name representing new variable
- value: column name representing variable values
- …: names of columns to gather (or not gather)
- na.rm: option to remove observations with missing values (represented by NAs)

gather example

## Wide -> long: collapse the day columns `0` through `9` into key/value pairs;
## ddays holds the former column names, rreaction the cell values.
## NOTE: gather() is superseded in tidyr; pivot_longer() is the recommended replacement.
data1_long <- data1_wide %>% gather(ddays, rreaction, "0":"9")
head(data1_long)

## # A tibble: 6 × 3
##   Subject ddays rreaction
##     <dbl> <chr>     <dbl>
## 1     308 0          250.
## 2     309 0          223.
## 3     310 0          199.
## 4     330 0          322.
## 5     331 0          288.
## 6     332 0          235.

unite

Function:
- unite(data, col, …, sep = "_", remove = TRUE)
Same as:
- data %>% unite(col, …, sep = "_", remove = TRUE)
Arguments:
- data: data frame
- col: column name of new “merged” column
- …: names of columns to merge
- sep: separator to use between merged values
- remove: if TRUE, remove input column from output data frame

unite example

data1_unite<- data1 %>% unite(Subject_Days, Subject, Days, sep="_")
head(data1_unite)

## # A tibble: 6 × 2
##   Reaction Subject_Days
##      <dbl> <chr>       
## 1     250. 308_0       
## 2     259. 308_1       
## 3     251. 308_2       
## 4     321. 308_3       
## 5     357. 308_4       
## 6     415. 308_5

data1_unite<- data1 %>% unite(Subject_Days, Subject, Days, sep="_", remove = FALSE) ## if you want to keep a copy of the original data
head(data1_unite)

separate

Function:
- separate(data, col, into, sep = "[^[:alnum:]]+", remove = TRUE)
Same as:
- data %>% separate(col, into, sep = "[^[:alnum:]]+", remove = TRUE)
Arguments:
- data: data frame
- col: column name representing current variable
- into: names of variables representing new variables
- sep: how to separate current variable (char, num, or symbol)
- remove: if TRUE, remove input column from output data frame

separate example

data1_separate<- data1_unite %>% separate(Subject_Days, c("subjects", "days"), sep="_")
head(data1_separate)

## # A tibble: 6 × 3
##   Reaction subjects days 
##      <dbl> <chr>    <chr>
## 1     250. 308      0    
## 2     259. 308      1    
## 3     251. 308      2    
## 4     321. 308      3    
## 5     357. 308      4    
## 6     415. 308      5

data1_separate<- data1_unite %>% separate(Subject_Days, c("subjects", "days"), sep="_", remove = FALSE)
head(data1_separate)

stringr

stringr package contains a set of commonly used string manipulation functions.

Detect Matches
Subset Strings
Manage lengths
Mutate Strings
Join and split

stringr cheatsheet: - https://github.com/rstudio/cheatsheets/blob/master/strings.pdf

Detect Matches (1)

str_detect: return a logical vector to indicate match position

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_detect(colorVec, "e") ## contains e

## [1]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE

str_detect(colorVec, "e$") ## ends with e

## [1] FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE

str_detect(colorVec, "^e") ## starts with e

## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Detect Matches (2)

str_which: return an index vector to indicate match position

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_which(colorVec, "e") ## contains e

## [1] 1 2 3 4 6 7 8

str_which(colorVec, "e$") ## ends with e

## [1] 2 6 7 8

str_which(colorVec, "^e") ## starts with e

## integer(0)

Detect Matches (3)

str_count: count frequency of match

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_count(colorVec, "e") ## contains e

## [1] 1 1 2 1 0 1 1 1

str_count(colorVec, "e$") ## ends with e

## [1] 0 1 0 0 0 1 1 1

str_count(colorVec, "^e") ## starts with e

## [1] 0 0 0 0 0 0 0 0

Subset Strings (1)

str_sub(string, start = 1L, end = -1L): subset of a string

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_sub(colorVec,2,2)

## [1] "e" "l" "r" "e" "l" "r" "u" "h"

str_sub(colorVec,start = 1, end=-2)

## [1] "re"    "blu"   "gree"  "yello" "blac"  "orang" "purpl" "whit"

str_sub(colorVec,end=-2)

## [1] "re"    "blu"   "gree"  "yello" "blac"  "orang" "purpl" "whit"

str_sub(colorVec,start = 2, end = -1)

## [1] "ed"    "lue"   "reen"  "ellow" "lack"  "range" "urple" "hite"

str_sub(colorVec,start = 2)

## [1] "ed"    "lue"   "reen"  "ellow" "lack"  "range" "urple" "hite"

Subset Strings (2)

str_subset(string, pattern), return the matched string

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_subset(colorVec,"e")

## [1] "red"    "blue"   "green"  "yellow" "orange" "purple" "white"

str_subset(colorVec,"e$")

## [1] "blue"   "orange" "purple" "white"

str_subset(colorVec,"^e")

## character(0)

Subset Strings (3)

str_extract(string, pattern), extract matching patterns from a string

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_extract(colorVec,"e")

## [1] "e" "e" "e" "e" NA  "e" "e" "e"

str_extract(colorVec,"[aeiou]") ## the first match

## [1] "e" "u" "e" "e" "a" "o" "u" "i"

str_extract_all(colorVec,"[aeiou]") ## all matches

## [[1]]
## [1] "e"
## 
## [[2]]
## [1] "u" "e"
## 
## [[3]]
## [1] "e" "e"
## 
## [[4]]
## [1] "e" "o"
## 
## [[5]]
## [1] "a"
## 
## [[6]]
## [1] "o" "a" "e"
## 
## [[7]]
## [1] "u" "e"
## 
## [[8]]
## [1] "i" "e"

Manage lengths

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_length(colorVec)

## [1] 3 4 5 6 5 6 6 5

matrix(str_pad(colorVec, width = 7),ncol=1)

##      [,1]     
## [1,] "    red"
## [2,] "   blue"
## [3,] "  green"
## [4,] " yellow"
## [5,] "  black"
## [6,] " orange"
## [7,] " purple"
## [8,] "  white"

Mutate Strings

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_sub(colorVec,1,1) <- "Z" ## will change the original string vector
colorVec

## [1] "Zed"    "Zlue"   "Zreen"  "Zellow" "Zlack"  "Zrange" "Zurple" "Zhite"

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_replace(colorVec, "e", "E")

## [1] "rEd"    "bluE"   "grEen"  "yEllow" "black"  "orangE" "purplE" "whitE"

str_replace_all(colorVec, "e", "E")

## [1] "rEd"    "bluE"   "grEEn"  "yEllow" "black"  "orangE" "purplE" "whitE"

str_to_lower(colorVec)

## [1] "red"    "blue"   "green"  "yellow" "black"  "orange" "purple" "white"

str_to_upper(colorVec)

## [1] "RED"    "BLUE"   "GREEN"  "YELLOW" "BLACK"  "ORANGE" "PURPLE" "WHITE"

str_to_title(colorVec) ## title case: capitalize the first letter of each word

## [1] "Red"    "Blue"   "Green"  "Yellow" "Black"  "Orange" "Purple" "White"

Join and split

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")


str_c(colorVec, seq_along(colorVec))

## [1] "red1"    "blue2"   "green3"  "yellow4" "black5"  "orange6" "purple7"
## [8] "white8"

str_c(colorVec, collapse = "::")

## [1] "red::blue::green::yellow::black::orange::purple::white"

Order Strings

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_order(colorVec) ## same as order(colorVec)

## [1] 5 2 3 6 7 1 8 4

str_sort(colorVec) ## same as sort(colorVec)

## [1] "black"  "blue"   "green"  "orange" "purple" "red"    "white"  "yellow"

Programming basics for Biostatistics 6099

Data manipulation (Tidyverse)

Outline

Read in data (readr)

Inspect the data

read_delim

Read excel

Write excel

Read SAS, SPSS, and Stata files.

Inspection on data1

dplyr

dplyr

select

pipe

filter

pull

slice

arrange

mutate

mutate at (apply a function to one or several columns)

rename

summarise

group_by

select (2)

more options for select()

Select by variables that are contained in a character vector

create a tibble

merge data.frame

the data for merge

inner_join

left_join

right_join

anti_join

full_join

tidyr

spread

spread example

gather

gather example

unite

unite example

separate

separate example

stringr

Detect Matches (1)

Detect Matches (2)

Detect Matches (3)

Subset Strings (1)

Subset Strings (2)

Subset Strings (3)

Manage lengths

Mutate Strings

Join and split

Order Strings

references