library(readr)
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.1

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)
## alternatively, you can do library(tidyverse) to include all of them

Read in data (readr)

data source: sleepstudy.csv (also available in R lme4 package)
Original way to read in data

asleepfile <- "https://raw.githubusercontent.com/Caleb-Huo/Caleb-Huo.github.io/master/teaching/data/sleep/sleepstudy.csv"
data0 <- read.csv(asleepfile)

Use read_csv
- Much faster than read.csv, especially for large datasets

data1 <- read_csv(asleepfile)

## Parsed with column specification:
## cols(
##   Reaction = col_double(),
##   Days = col_integer(),
##   Subject = col_integer()
## )

Inspect the data

read.csv
- You may want to use head, otherwise it will print out everyting

head(data0)

##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308
## 4 321.4398    3     308
## 5 356.8519    4     308
## 6 414.6901    5     308

class(data0)

## [1] "data.frame"

read_csv
- smartly print out the first few rows

data1

## # A tibble: 180 x 3
##    Reaction  Days Subject
##       <dbl> <int>   <int>
##  1     250.     0     308
##  2     259.     1     308
##  3     251.     2     308
##  4     321.     3     308
##  5     357.     4     308
##  6     415.     5     308
##  7     382.     6     308
##  8     290.     7     308
##  9     431.     8     308
## 10     466.     9     308
## # ... with 170 more rows

class(data1)

## [1] "tbl_df"     "tbl"        "data.frame"

read_delim

Similar to read_csv, we can use read_delim, which is more general
- read_csv assumes the delimiter is “,”
- for read_delim, you need to specify the delimiter
  - “,”: comma delimited (usually for .csv)
  - “\t”: tab delimited (usually for .txt)
  - " “: space delimited

data2 <- read_delim("sleepstudy.csv", delim=",")

## Parsed with column specification:
## cols(
##   Reaction = col_double(),
##   Days = col_integer(),
##   Subject = col_integer()
## )

Play with data1

data1_sub <- data1[1:3,]
data1_sub$Reaction

## [1] 249.5600 258.7047 250.8006

as.matrix(data1_sub)

##      Reaction Days Subject
## [1,] 249.5600    0     308
## [2,] 258.7047    1     308
## [3,] 250.8006    2     308

as.data.frame(data1_sub)

##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308

dplyr

%>%: pipe
select(): select columns
filter(): filter rows
arrange(): re-order or arrange rows
mutate(): create new columns
summarise(): summarise values
group_by(): allows for group operations in the “split-apply-combine” concept
merge data.frame
- inner_join()
- left_join()
- right_join()
- full_join()
- anti_join()

select

select(data1, Days, Subject)

## # A tibble: 180 x 2
##     Days Subject
##    <int>   <int>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ... with 170 more rows

Use pipe

data1 %>% select(Days, Subject)

## # A tibble: 180 x 2
##     Days Subject
##    <int>   <int>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ... with 170 more rows

pipe

%>%
data1 %>% select(Days, Subject)
- the result before %>% will be piped into the first argument of the function behind %>%

Do not select

data1 %>% select(-Reaction)

## # A tibble: 180 x 2
##     Days Subject
##    <int>   <int>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ... with 170 more rows

filter

data1 %>% 
  select(Days, Subject) %>%
  filter(Subject == 308)

## # A tibble: 10 x 2
##     Days Subject
##    <int>   <int>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308

data1 %>% 
  filter(Reaction >= 300) %>%
  select(Days, Subject) %>%
  filter(Subject == 308)

## # A tibble: 6 x 2
##    Days Subject
##   <int>   <int>
## 1     3     308
## 2     4     308
## 3     5     308
## 4     6     308
## 5     8     308
## 6     9     308

multiple filtering criteria

data1 %>% 
  filter(Reaction >= 300, Subject == 308)

## # A tibble: 6 x 3
##   Reaction  Days Subject
##      <dbl> <int>   <int>
## 1     321.     3     308
## 2     357.     4     308
## 3     415.     5     308
## 4     382.     6     308
## 5     431.     8     308
## 6     466.     9     308

arrange

arrange() is similar to sort() and order()
assending order by default

data1 %>% arrange(Reaction) %>% head

## # A tibble: 6 x 3
##   Reaction  Days Subject
##      <dbl> <int>   <int>
## 1     194.     1     310
## 2     199.     0     310
## 3     203.     2     309
## 4     205.     3     309
## 5     205.     1     309
## 6     208.     4     309

data1 %>% 
  arrange(Days, Reaction) %>% 
  head

## # A tibble: 6 x 3
##   Reaction  Days Subject
##      <dbl> <int>   <int>
## 1     199.     0     310
## 2     222.     0     352
## 3     223.     0     309
## 4     225.     0     370
## 5     235.     0     332
## 6     236.     0     349

data1 %>% 
  arrange(desc(Days), Reaction) %>% 
  head

## # A tibble: 6 x 3
##   Reaction  Days Subject
##      <dbl> <int>   <int>
## 1     237.     9     335
## 2     237.     9     309
## 3     248.     9     310
## 4     254.     9     332
## 5     348.     9     351
## 6     352.     9     349

mutate

data1 %>% 
  mutate(Reaction_binary = Reaction<250) %>%
  head

## # A tibble: 6 x 4
##   Reaction  Days Subject Reaction_binary
##      <dbl> <int>   <int> <lgl>          
## 1     250.     0     308 TRUE           
## 2     259.     1     308 FALSE          
## 3     251.     2     308 FALSE          
## 4     321.     3     308 FALSE          
## 5     357.     4     308 FALSE          
## 6     415.     5     308 FALSE

data1 %>% 
  mutate(Reaction_binary = Reaction<250) %>%
  mutate(Reaction_sec = Reaction/1000) %>% 
  head

## # A tibble: 6 x 5
##   Reaction  Days Subject Reaction_binary Reaction_sec
##      <dbl> <int>   <int> <lgl>                  <dbl>
## 1     250.     0     308 TRUE                   0.250
## 2     259.     1     308 FALSE                  0.259
## 3     251.     2     308 FALSE                  0.251
## 4     321.     3     308 FALSE                  0.321
## 5     357.     4     308 FALSE                  0.357
## 6     415.     5     308 FALSE                  0.415

summarise

data1 %>% 
    summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())

## # A tibble: 1 x 4
##   avg_reaction min_reaction max_reaction total
##          <dbl>        <dbl>        <dbl> <int>
## 1         299.         194.         466.   180

group_by

seems to be more powerful than tapply

tt <- data1 %>% 
      group_by(Subject) %>%
      summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())
tt %>% head

## # A tibble: 6 x 5
##   Subject avg_reaction min_reaction max_reaction total
##     <int>        <dbl>        <dbl>        <dbl> <int>
## 1     308         342.         250.         466.    10
## 2     309         215.         203.         237.    10
## 3     310         231.         194.         261.    10
## 4     330         303.         280.         354.    10
## 5     331         309.         285          372.    10
## 6     332         307.         235.         454.    10

select (2)

Select a range

tt %>% 
  head %>% 
  select(avg_reaction:max_reaction)

## # A tibble: 6 x 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.

contains

tt %>% 
  head %>% 
  select(contains("reaction"))

## # A tibble: 6 x 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.

more options for select()

starts_with() = Select columns that start with a character string
ends_with() = Select columns that end with a character string
contains() = Select columns that contain a character string
matches() = Select columns that match a regular expression
one_of() = Select columns names that are from a group of names

merge data.frame

inner_join()
left_join()
right_join()
full_join()
anti_join()

the data for merge

superheroes <- "
    name, alignment, gender,         publisher
 Magneto,       bad,   male,            Marvel
   Storm,      good, female,            Marvel
Mystique,       bad, female,            Marvel
  Batman,      good,   male,                DC
   Joker,       bad,   male,                DC
Catwoman,       bad, female,                DC
 Hellboy,      good,   male, Dark Horse Comics
"
superheroes <- read_csv(superheroes, skip = 1)

publishers <- "
  publisher, yr_founded
         DC,       1934
     Marvel,       1939
      Image,       1992
"
publishers <- read_csv(publishers, skip = 1)

inner_join

inner_join(x, y): Return all rows from x where there are matching values in y, and all columns from x and y.
If there are multiple matches between x and y, all combination of the matches are returned.
This is a mutating join.

inner_join(superheroes, publishers)

## Joining, by = "publisher"

## # A tibble: 6 x 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <int>
## 1 Magneto  bad       male   Marvel          1939
## 2 Storm    good      female Marvel          1939
## 3 Mystique bad       female Marvel          1939
## 4 Batman   good      male   DC              1934
## 5 Joker    bad       male   DC              1934
## 6 Catwoman bad       female DC              1934

inner_join(publishers, superheroes)

## Joining, by = "publisher"

## # A tibble: 6 x 5
##   publisher yr_founded name     alignment gender
##   <chr>          <int> <chr>    <chr>     <chr> 
## 1 DC              1934 Batman   good      male  
## 2 DC              1934 Joker    bad       male  
## 3 DC              1934 Catwoman bad       female
## 4 Marvel          1939 Magneto  bad       male  
## 5 Marvel          1939 Storm    good      female
## 6 Marvel          1939 Mystique bad       female

left_join

left_join(x, y): Return all rows from x, and all columns from x and y.
If there are multiple matches between x and y, all combination of the matches are returned.
This is a mutating join

left_join(superheroes, publishers)

## Joining, by = "publisher"

## # A tibble: 7 x 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <int>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA

left_join(publishers, superheroes)

## Joining, by = "publisher"

## # A tibble: 7 x 5
##   publisher yr_founded name     alignment gender
##   <chr>          <int> <chr>    <chr>     <chr> 
## 1 DC              1934 Batman   good      male  
## 2 DC              1934 Joker    bad       male  
## 3 DC              1934 Catwoman bad       female
## 4 Marvel          1939 Magneto  bad       male  
## 5 Marvel          1939 Storm    good      female
## 6 Marvel          1939 Mystique bad       female
## 7 Image           1992 <NA>     <NA>      <NA>

right_join

right_join(x, y): Return all rows from y, and all columns from x and y.
If there are multiple matches between x and y, all combination of the matches are returned.
This is a mutating join

right_join(superheroes, publishers)

## Joining, by = "publisher"

## # A tibble: 7 x 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <int>
## 1 Batman   good      male   DC              1934
## 2 Joker    bad       male   DC              1934
## 3 Catwoman bad       female DC              1934
## 4 Magneto  bad       male   Marvel          1939
## 5 Storm    good      female Marvel          1939
## 6 Mystique bad       female Marvel          1939
## 7 <NA>     <NA>      <NA>   Image           1992

right_join(publishers, superheroes)

## Joining, by = "publisher"

## # A tibble: 7 x 5
##   publisher         yr_founded name     alignment gender
##   <chr>                  <int> <chr>    <chr>     <chr> 
## 1 Marvel                  1939 Magneto  bad       male  
## 2 Marvel                  1939 Storm    good      female
## 3 Marvel                  1939 Mystique bad       female
## 4 DC                      1934 Batman   good      male  
## 5 DC                      1934 Joker    bad       male  
## 6 DC                      1934 Catwoman bad       female
## 7 Dark Horse Comics         NA Hellboy  good      male

anti_join

anti_join(x, y): Return all rows from x where there are not matching values in y, keeping just columns from x.
This is a filtering join.

anti_join(superheroes, publishers)

## Joining, by = "publisher"

## # A tibble: 1 x 4
##   name    alignment gender publisher        
##   <chr>   <chr>     <chr>  <chr>            
## 1 Hellboy good      male   Dark Horse Comics

anti_join(publishers, superheroes)

## Joining, by = "publisher"

## # A tibble: 1 x 2
##   publisher yr_founded
##   <chr>          <int>
## 1 Image           1992

full_join

full_join(x, y): Return all rows and all columns from both x and y.
Where there are not matching values, returns NA for the one missing.
This is a mutating join.

full_join(superheroes, publishers)

## Joining, by = "publisher"

## # A tibble: 8 x 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <int>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA
## 8 <NA>     <NA>      <NA>   Image                   1992

full_join(publishers, superheroes)

## Joining, by = "publisher"

## # A tibble: 8 x 5
##   publisher         yr_founded name     alignment gender
##   <chr>                  <int> <chr>    <chr>     <chr> 
## 1 DC                      1934 Batman   good      male  
## 2 DC                      1934 Joker    bad       male  
## 3 DC                      1934 Catwoman bad       female
## 4 Marvel                  1939 Magneto  bad       male  
## 5 Marvel                  1939 Storm    good      female
## 6 Marvel                  1939 Mystique bad       female
## 7 Image                   1992 <NA>     <NA>      <NA>  
## 8 Dark Horse Comics         NA Hellboy  good      male

tidyr

gather
separate
spread
merge

spread

Function:
- spread(data, key, value, fill = NA, convert = FALSE)
Same as:
- data %>% spread(key, value, fill = NA, convert = FALSE)
Arguments:
- data: data frame
- key: column values to convert to multiple columns
- value: single column values to convert to multiple columns’ values
- fill: If there isn’t a value for every combination of the other variables and the key column, this value will be substituted
- convert: if TRUE will automatically convert values to logical, integer, numeric, complex or factor as appropriate

spread example

data1_wide <- data1 %>% spread(Days, Reaction, fill = NA, convert = FALSE)
head(data1_wide)

## # A tibble: 6 x 11
##   Subject   `0`   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`
##     <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     308  250.  259.  251.  321.  357.  415.  382.  290.  431.  466.
## 2     309  223.  205.  203.  205.  208.  216.  214.  218.  224.  237.
## 3     310  199.  194.  234.  233.  229.  220.  235.  256.  261.  248.
## 4     330  322.  300.  284.  285.  286.  298.  280.  318.  305.  354.
## 5     331  288.  285   302.  320.  316.  293.  290.  335.  294.  372.
## 6     332  235.  243.  273.  310.  317.  310.  454.  347.  330.  254.

gather

Function:
- gather(data, key, value, …, na.rm = FALSE, convert = FALSE)
Same as:
- data %>% gather(key, value, …, na.rm = FALSE, convert = FALSE)
Arguments:
- data: data frame
- key: column name representing new variable
- value: column name representing variable values
- …: names of columns to gather (or not gather)
- na.rm: option to remove observations with missing values (represented by NAs)
- convert: if TRUE will automatically convert values to logical, integer, numeric, complex or factor as appropriate

gather example

data1_long <- data1_wide %>% gather(ddays, rreaction, "0":"9")
head(data1_long)

## # A tibble: 6 x 3
##   Subject ddays rreaction
##     <int> <chr>     <dbl>
## 1     308 0          250.
## 2     309 0          223.
## 3     310 0          199.
## 4     330 0          322.
## 5     331 0          288.
## 6     332 0          235.

unite

Function:
- unite(data, col, …, sep = " “, remove = TRUE)
Same as:
- data %>% unite(col, …, sep = " “, remove = TRUE)
Arguments:
- data: data frame
- col: column name of new “merged” column
- …: names of columns to merge
- sep: separator to use between merged values
- remove: if TRUE, remove input column from output data frame

unite example

data1_unite<- data1 %>% unite(Subject_Days, Subject, Days, sep="_")
head(data1_unite)

## # A tibble: 6 x 2
##   Reaction Subject_Days
##      <dbl> <chr>       
## 1     250. 308_0       
## 2     259. 308_1       
## 3     251. 308_2       
## 4     321. 308_3       
## 5     357. 308_4       
## 6     415. 308_5

separate

Function:
- separate(data, col, into, sep = " “, remove = TRUE, convert = FALSE)
Same as:
- data %>% separate(col, into, sep = " “, remove = TRUE, convert = FALSE)
Arguments:
- data: data frame
- col: column name representing current variable
- into: names of variables representing new variables
- sep: how to separate current variable (char, num, or symbol)
- remove: if TRUE, remove input column from output data frame
- convert: if TRUE will automatically convert values to logical, integer, numeric, complex or factor as appropriate

separate example

data1_separate<- data1_unite %>% separate(Subject_Days, c("subjects", "days"), sep="_")
head(data1_separate)

## # A tibble: 6 x 3
##   Reaction subjects days 
##      <dbl> <chr>    <chr>
## 1     250. 308      0    
## 2     259. 308      1    
## 3     251. 308      2    
## 4     321. 308      3    
## 5     357. 308      4    
## 6     415. 308      5

Biostatistical Computing, PHC 6068

Data manipulation (Tidyverse)

Outline

Read in data (readr)

Inspect the data

read_delim

Play with data1

dplyr

select

pipe

filter

arrange

mutate

summarise

group_by

select (2)

more options for select()

merge data.frame

the data for merge

inner_join

left_join

right_join

anti_join

full_join

tidyr

spread

spread example

gather

gather example

unite

unite example

separate

separate example

A combined example

references