Biostatistical Computing, PHC 6068

Data manipulation (Tidyverse)

Zhiguang Huo (Caleb)

Wednesday September 19, 2018

Outline

The tidyverse is a collection of R packages designed for data science.

library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## alternatively, you can do library(tidyverse) to include all of them

Read in data (readr)

asleepfile <- "https://raw.githubusercontent.com/Caleb-Huo/Caleb-Huo.github.io/master/teaching/data/sleep/sleepstudy.csv"
data0 <- read.csv(asleepfile)
data1 <- read_csv(asleepfile)
## Parsed with column specification:
## cols(
##   Reaction = col_double(),
##   Days = col_integer(),
##   Subject = col_integer()
## )

Inspect the data

head(data0)
##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308
## 4 321.4398    3     308
## 5 356.8519    4     308
## 6 414.6901    5     308
class(data0)
## [1] "data.frame"
data1
## # A tibble: 180 x 3
##    Reaction  Days Subject
##       <dbl> <int>   <int>
##  1     250.     0     308
##  2     259.     1     308
##  3     251.     2     308
##  4     321.     3     308
##  5     357.     4     308
##  6     415.     5     308
##  7     382.     6     308
##  8     290.     7     308
##  9     431.     8     308
## 10     466.     9     308
## # ... with 170 more rows
class(data1)
## [1] "tbl_df"     "tbl"        "data.frame"

read_delim

data2 <- read_delim("sleepstudy.csv", delim=",")
## Parsed with column specification:
## cols(
##   Reaction = col_double(),
##   Days = col_integer(),
##   Subject = col_integer()
## )

Play with data1

data1_sub <- data1[1:3,]
data1_sub$Reaction
## [1] 249.5600 258.7047 250.8006
as.matrix(data1_sub)
##      Reaction Days Subject
## [1,] 249.5600    0     308
## [2,] 258.7047    1     308
## [3,] 250.8006    2     308
as.data.frame(data1_sub)
##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308

dplyr

select

select(data1, Days, Subject)
## # A tibble: 180 x 2
##     Days Subject
##    <int>   <int>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ... with 170 more rows
data1 %>% select(Days, Subject)
## # A tibble: 180 x 2
##     Days Subject
##    <int>   <int>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ... with 170 more rows

pipe

data1 %>% select(-Reaction)
## # A tibble: 180 x 2
##     Days Subject
##    <int>   <int>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ... with 170 more rows

filter

data1 %>% 
  select(Days, Subject) %>%
  filter(Subject == 308)
## # A tibble: 10 x 2
##     Days Subject
##    <int>   <int>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
data1 %>% 
  filter(Reaction >= 300) %>%
  select(Days, Subject) %>%
  filter(Subject == 308)
## # A tibble: 6 x 2
##    Days Subject
##   <int>   <int>
## 1     3     308
## 2     4     308
## 3     5     308
## 4     6     308
## 5     8     308
## 6     9     308
data1 %>% 
  filter(Reaction >= 300, Subject == 308) 
## # A tibble: 6 x 3
##   Reaction  Days Subject
##      <dbl> <int>   <int>
## 1     321.     3     308
## 2     357.     4     308
## 3     415.     5     308
## 4     382.     6     308
## 5     431.     8     308
## 6     466.     9     308

arrange

data1 %>% arrange(Reaction) %>% head
## # A tibble: 6 x 3
##   Reaction  Days Subject
##      <dbl> <int>   <int>
## 1     194.     1     310
## 2     199.     0     310
## 3     203.     2     309
## 4     205.     3     309
## 5     205.     1     309
## 6     208.     4     309
data1 %>% 
  arrange(Days, Reaction) %>% 
  head
## # A tibble: 6 x 3
##   Reaction  Days Subject
##      <dbl> <int>   <int>
## 1     199.     0     310
## 2     222.     0     352
## 3     223.     0     309
## 4     225.     0     370
## 5     235.     0     332
## 6     236.     0     349
data1 %>% 
  arrange(desc(Days), Reaction) %>% 
  head
## # A tibble: 6 x 3
##   Reaction  Days Subject
##      <dbl> <int>   <int>
## 1     237.     9     335
## 2     237.     9     309
## 3     248.     9     310
## 4     254.     9     332
## 5     348.     9     351
## 6     352.     9     349

mutate

data1 %>% 
  mutate(Reaction_binary = Reaction<250) %>%
  head
## # A tibble: 6 x 4
##   Reaction  Days Subject Reaction_binary
##      <dbl> <int>   <int> <lgl>          
## 1     250.     0     308 TRUE           
## 2     259.     1     308 FALSE          
## 3     251.     2     308 FALSE          
## 4     321.     3     308 FALSE          
## 5     357.     4     308 FALSE          
## 6     415.     5     308 FALSE
data1 %>% 
  mutate(Reaction_binary = Reaction<250) %>%
  mutate(Reaction_sec = Reaction/1000) %>% 
  head
## # A tibble: 6 x 5
##   Reaction  Days Subject Reaction_binary Reaction_sec
##      <dbl> <int>   <int> <lgl>                  <dbl>
## 1     250.     0     308 TRUE                   0.250
## 2     259.     1     308 FALSE                  0.259
## 3     251.     2     308 FALSE                  0.251
## 4     321.     3     308 FALSE                  0.321
## 5     357.     4     308 FALSE                  0.357
## 6     415.     5     308 FALSE                  0.415

summarise

data1 %>% 
    summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())
## # A tibble: 1 x 4
##   avg_reaction min_reaction max_reaction total
##          <dbl>        <dbl>        <dbl> <int>
## 1         299.         194.         466.   180

group_by

tt <- data1 %>% 
      group_by(Subject) %>%
      summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())
tt %>% head
## # A tibble: 6 x 5
##   Subject avg_reaction min_reaction max_reaction total
##     <int>        <dbl>        <dbl>        <dbl> <int>
## 1     308         342.         250.         466.    10
## 2     309         215.         203.         237.    10
## 3     310         231.         194.         261.    10
## 4     330         303.         280.         354.    10
## 5     331         309.         285          372.    10
## 6     332         307.         235.         454.    10

select (2)

tt %>% 
  head %>% 
  select(avg_reaction:max_reaction)
## # A tibble: 6 x 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.
tt %>% 
  head %>% 
  select(contains("reaction"))
## # A tibble: 6 x 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.

more options for select()

merge data.frame

the data for merge

superheroes <- "
    name, alignment, gender,         publisher
 Magneto,       bad,   male,            Marvel
   Storm,      good, female,            Marvel
Mystique,       bad, female,            Marvel
  Batman,      good,   male,                DC
   Joker,       bad,   male,                DC
Catwoman,       bad, female,                DC
 Hellboy,      good,   male, Dark Horse Comics
"
superheroes <- read_csv(superheroes, skip = 1)

publishers <- "
  publisher, yr_founded
         DC,       1934
     Marvel,       1939
      Image,       1992
"
publishers <- read_csv(publishers, skip = 1)

inner_join

inner_join(superheroes, publishers)
## Joining, by = "publisher"
## # A tibble: 6 x 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <int>
## 1 Magneto  bad       male   Marvel          1939
## 2 Storm    good      female Marvel          1939
## 3 Mystique bad       female Marvel          1939
## 4 Batman   good      male   DC              1934
## 5 Joker    bad       male   DC              1934
## 6 Catwoman bad       female DC              1934
inner_join(publishers, superheroes)
## Joining, by = "publisher"
## # A tibble: 6 x 5
##   publisher yr_founded name     alignment gender
##   <chr>          <int> <chr>    <chr>     <chr> 
## 1 DC              1934 Batman   good      male  
## 2 DC              1934 Joker    bad       male  
## 3 DC              1934 Catwoman bad       female
## 4 Marvel          1939 Magneto  bad       male  
## 5 Marvel          1939 Storm    good      female
## 6 Marvel          1939 Mystique bad       female

left_join

left_join(superheroes, publishers)
## Joining, by = "publisher"
## # A tibble: 7 x 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <int>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA
left_join(publishers, superheroes)
## Joining, by = "publisher"
## # A tibble: 7 x 5
##   publisher yr_founded name     alignment gender
##   <chr>          <int> <chr>    <chr>     <chr> 
## 1 DC              1934 Batman   good      male  
## 2 DC              1934 Joker    bad       male  
## 3 DC              1934 Catwoman bad       female
## 4 Marvel          1939 Magneto  bad       male  
## 5 Marvel          1939 Storm    good      female
## 6 Marvel          1939 Mystique bad       female
## 7 Image           1992 <NA>     <NA>      <NA>

right_join

right_join(superheroes, publishers)
## Joining, by = "publisher"
## # A tibble: 7 x 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <int>
## 1 Batman   good      male   DC              1934
## 2 Joker    bad       male   DC              1934
## 3 Catwoman bad       female DC              1934
## 4 Magneto  bad       male   Marvel          1939
## 5 Storm    good      female Marvel          1939
## 6 Mystique bad       female Marvel          1939
## 7 <NA>     <NA>      <NA>   Image           1992
right_join(publishers, superheroes)
## Joining, by = "publisher"
## # A tibble: 7 x 5
##   publisher         yr_founded name     alignment gender
##   <chr>                  <int> <chr>    <chr>     <chr> 
## 1 Marvel                  1939 Magneto  bad       male  
## 2 Marvel                  1939 Storm    good      female
## 3 Marvel                  1939 Mystique bad       female
## 4 DC                      1934 Batman   good      male  
## 5 DC                      1934 Joker    bad       male  
## 6 DC                      1934 Catwoman bad       female
## 7 Dark Horse Comics         NA Hellboy  good      male

anti_join

anti_join(superheroes, publishers)
## Joining, by = "publisher"
## # A tibble: 1 x 4
##   name    alignment gender publisher        
##   <chr>   <chr>     <chr>  <chr>            
## 1 Hellboy good      male   Dark Horse Comics
anti_join(publishers, superheroes)
## Joining, by = "publisher"
## # A tibble: 1 x 2
##   publisher yr_founded
##   <chr>          <int>
## 1 Image           1992

full_join

full_join(superheroes, publishers)
## Joining, by = "publisher"
## # A tibble: 8 x 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <int>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA
## 8 <NA>     <NA>      <NA>   Image                   1992
full_join(publishers, superheroes)
## Joining, by = "publisher"
## # A tibble: 8 x 5
##   publisher         yr_founded name     alignment gender
##   <chr>                  <int> <chr>    <chr>     <chr> 
## 1 DC                      1934 Batman   good      male  
## 2 DC                      1934 Joker    bad       male  
## 3 DC                      1934 Catwoman bad       female
## 4 Marvel                  1939 Magneto  bad       male  
## 5 Marvel                  1939 Storm    good      female
## 6 Marvel                  1939 Mystique bad       female
## 7 Image                   1992 <NA>     <NA>      <NA>  
## 8 Dark Horse Comics         NA Hellboy  good      male

tidyr

spread

spread example

data1_wide <- data1 %>% spread(Days, Reaction, fill = NA, convert = FALSE)
head(data1_wide)
## # A tibble: 6 x 11
##   Subject   `0`   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`
##     <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     308  250.  259.  251.  321.  357.  415.  382.  290.  431.  466.
## 2     309  223.  205.  203.  205.  208.  216.  214.  218.  224.  237.
## 3     310  199.  194.  234.  233.  229.  220.  235.  256.  261.  248.
## 4     330  322.  300.  284.  285.  286.  298.  280.  318.  305.  354.
## 5     331  288.  285   302.  320.  316.  293.  290.  335.  294.  372.
## 6     332  235.  243.  273.  310.  317.  310.  454.  347.  330.  254.

gather

gather example

data1_long <- data1_wide %>% gather(ddays, rreaction, "0":"9")
head(data1_long)
## # A tibble: 6 x 3
##   Subject ddays rreaction
##     <int> <chr>     <dbl>
## 1     308 0          250.
## 2     309 0          223.
## 3     310 0          199.
## 4     330 0          322.
## 5     331 0          288.
## 6     332 0          235.

unite

unite example

data1_unite<- data1 %>% unite(Subject_Days, Subject, Days, sep="_")
head(data1_unite)
## # A tibble: 6 x 2
##   Reaction Subject_Days
##      <dbl> <chr>       
## 1     250. 308_0       
## 2     259. 308_1       
## 3     251. 308_2       
## 4     321. 308_3       
## 5     357. 308_4       
## 6     415. 308_5

separate

separate example

data1_separate<- data1_unite %>% separate(Subject_Days, c("subjects", "days"), sep="_")
head(data1_separate)
## # A tibble: 6 x 3
##   Reaction subjects days 
##      <dbl> <chr>    <chr>
## 1     250. 308      0    
## 2     259. 308      1    
## 3     251. 308      2    
## 4     321. 308      3    
## 5     357. 308      4    
## 6     415. 308      5

A combined example

mpg %>% 
  filter(class != "compact") %>%
  mutate(ctyEfficient = cty>20) %>%
  select(displ, hwy, class, ctyEfficient) %>%
  ggplot() +
  aes(x=displ, y=hwy, color=class, shape = ctyEfficient) +
  geom_point()

references