Programming basics for Biostatistics 6099

Data manipulation (Tidyverse)

Zhiguang Huo (Caleb)

Thursday September 7, 2023

Outline

The tidyverse is a collection of R packages designed for data science.

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ purrr     1.0.1
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Read in data (readr)

# Location of the sleep-study CSV (columns: Reaction, Days, Subject).
asleepfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.csv"
# Base R reader: the result is a plain data.frame.
data0 <- read.csv(asleepfile)
# readr reader: the result is a tibble, and the guessed column types are
# reported when the file is read.
data1 <- read_csv(asleepfile)
## Rows: 180 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Reaction, Days, Subject
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect the data

head(data0)
##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308
## 4 321.4398    3     308
## 5 356.8519    4     308
## 6 414.6901    5     308
class(data0)
## [1] "data.frame"
data1
## # A tibble: 180 × 3
##    Reaction  Days Subject
##       <dbl> <dbl>   <dbl>
##  1     250.     0     308
##  2     259.     1     308
##  3     251.     2     308
##  4     321.     3     308
##  5     357.     4     308
##  6     415.     5     308
##  7     382.     6     308
##  8     290.     7     308
##  9     431.     8     308
## 10     466.     9     308
## # ℹ 170 more rows
class(data1)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

read_delim

# Same data read with an explicit delimiter. This call expects a copy of
# sleepstudy.csv in the current working directory.
data2 <- read_delim("sleepstudy.csv", delim=",")
## Rows: 180 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Reaction, Days, Subject
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Read excel

library(readxl)
# Remote location of the workbook, kept for reference only; the calls below
# read a local copy named sleepstudy.xlsx from the working directory.
## bsleepfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.xlsx"
# Option 1: read_excel() from the readxl package.
data0 <- read_excel("sleepstudy.xlsx")
data0
library(xlsx)
# Option 2: read.xlsx() from the xlsx package; sheetIndex = 1 picks the first
# sheet of the workbook.
data0 <- read.xlsx("sleepstudy.xlsx", sheetIndex = 1)
data0

Write excel

library(xlsx)
# Two built-in data sets to export.
data_iris <- iris
data_cars <- cars
# The first call writes mydata.xlsx with a sheet named "iris"; the second call
# passes append = TRUE to add a "cars" sheet to the same file.
write.xlsx(data_iris, file = "mydata.xlsx", sheetName="iris")
write.xlsx(data_cars, file = "mydata.xlsx", sheetName="cars", append = TRUE)

Read SAS, SPSS, and Stata files.

library(haven)
# Location of the SAS data set (.sas7bdat) hosted with the course data.
salesfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sales.sas7bdat"
# read_sas() comes from the haven package; the result prints as a tibble with
# columns YEAR, P and S.
data0 <- read_sas(salesfile)
data0
## # A tibble: 36 × 3
##     YEAR     P     S
##    <dbl> <dbl> <dbl>
##  1  1950  12.9  182.
##  2  1951  11.9  245 
##  3  1952  10.7  250.
##  4  1953  11.3  266.
##  5  1954  11.2  248.
##  6  1955  15.1  278.
##  7  1956  16.2  307.
##  8  1957  15.4  320 
##  9  1958  12.7  305.
## 10  1959  16.3  338 
## # ℹ 26 more rows

Inspecting data1

data1_sub <- data1[1:3,]
data1_sub$Reaction
## [1] 249.5600 258.7047 250.8006
as.matrix(data1_sub)
##      Reaction Days Subject
## [1,] 249.5600    0     308
## [2,] 258.7047    1     308
## [3,] 250.8006    2     308
as.data.frame(data1_sub)
##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308

dplyr

dplyr

select

select(data1, Days, Subject)
## # A tibble: 180 × 2
##     Days Subject
##    <dbl>   <dbl>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ℹ 170 more rows
data1 %>% select(Days, Subject)
## # A tibble: 180 × 2
##     Days Subject
##    <dbl>   <dbl>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ℹ 170 more rows

pipe

exp(1)
## [1] 2.718282
1 %>% exp() ## x %>% f() is evaluated as f(x)
## [1] 2.718282
1 %>% exp() %>% log ## () may be omitted when the piped value is the only argument
## [1] 1
data1 %>% select(-Reaction) ## a leading minus drops the named column
## # A tibble: 180 × 2
##     Days Subject
##    <dbl>   <dbl>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
## # ℹ 170 more rows

filter

data1 %>% 
  select(Days, Subject) %>%
  filter(Subject == 308)
## # A tibble: 10 × 2
##     Days Subject
##    <dbl>   <dbl>
##  1     0     308
##  2     1     308
##  3     2     308
##  4     3     308
##  5     4     308
##  6     5     308
##  7     6     308
##  8     7     308
##  9     8     308
## 10     9     308
data1 %>% 
  filter(Reaction >= 300) %>%
  select(Days, Subject) %>%
  filter(Subject == 308)
## # A tibble: 6 × 2
##    Days Subject
##   <dbl>   <dbl>
## 1     3     308
## 2     4     308
## 3     5     308
## 4     6     308
## 5     8     308
## 6     9     308
data1 %>% 
  filter(Reaction >= 300, Subject == 308) 
## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     321.     3     308
## 2     357.     4     308
## 3     415.     5     308
## 4     382.     6     308
## 5     431.     8     308
## 6     466.     9     308
data1 %>% 
  filter(Reaction >= 300 & Subject == 308) 
## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     321.     3     308
## 2     357.     4     308
## 3     415.     5     308
## 4     382.     6     308
## 5     431.     8     308
## 6     466.     9     308

pull

(data1 %>% 
  filter(Reaction >= 300, Subject == 308))$Reaction
## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535
data1 %>% 
  filter(Reaction >= 300, Subject == 308) %>%
  pull(Reaction)
## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535

slice

data1 %>% 
  slice(1:8) 
## # A tibble: 8 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     250.     0     308
## 2     259.     1     308
## 3     251.     2     308
## 4     321.     3     308
## 5     357.     4     308
## 6     415.     5     308
## 7     382.     6     308
## 8     290.     7     308
data1 %>% 
  head(n=8) 
## # A tibble: 8 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     250.     0     308
## 2     259.     1     308
## 3     251.     2     308
## 4     321.     3     308
## 5     357.     4     308
## 6     415.     5     308
## 7     382.     6     308
## 8     290.     7     308
# Row positions that satisfy both conditions, written with explicit data1$ prefixes.
which(data1$Reaction >= 300 & data1$Subject == 308)
## [1]  4  5  6  7  9 10
# with() evaluates the expression using the columns of data1, so the prefixes
# can be dropped; this form also works for a plain data.frame.
with(data1, which(Reaction >= 300 & Subject == 308))
## [1]  4  5  6  7  9 10
# slice() keeps the rows at the positions returned by which().
data1 %>% 
  slice(which(Reaction >= 300 & Subject == 308)) 
## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     321.     3     308
## 2     357.     4     308
## 3     415.     5     308
## 4     382.     6     308
## 5     431.     8     308
## 6     466.     9     308

arrange

data1 %>% arrange(Reaction) %>% head
## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     194.     1     310
## 2     199.     0     310
## 3     203.     2     309
## 4     205.     3     309
## 5     205.     1     309
## 6     208.     4     309
data1 %>% 
  arrange(Days, Reaction) %>% 
  head
## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     199.     0     310
## 2     222.     0     352
## 3     223.     0     309
## 4     225.     0     370
## 5     235.     0     332
## 6     236.     0     349
data1 %>% 
  arrange(Days, Reaction) %>% 
  head %>%
  colSums ## the pipe also works with functions from outside dplyr
## Reaction     Days  Subject 
## 1339.693    0.000 2022.000
data1 %>% 
  arrange(desc(Days), Reaction) %>% 
  head
## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     237.     9     335
## 2     237.     9     309
## 3     248.     9     310
## 4     254.     9     332
## 5     348.     9     351
## 6     352.     9     349

mutate

data1 %>% 
  mutate(Reaction_binary = Reaction<250) %>%
  head
## # A tibble: 6 × 4
##   Reaction  Days Subject Reaction_binary
##      <dbl> <dbl>   <dbl> <lgl>          
## 1     250.     0     308 TRUE           
## 2     259.     1     308 FALSE          
## 3     251.     2     308 FALSE          
## 4     321.     3     308 FALSE          
## 5     357.     4     308 FALSE          
## 6     415.     5     308 FALSE
data1 %>% 
  mutate(Reaction_binary = Reaction<250,
        Reaction_sec = Reaction/1000) %>% 
  head
## # A tibble: 6 × 5
##   Reaction  Days Subject Reaction_binary Reaction_sec
##      <dbl> <dbl>   <dbl> <lgl>                  <dbl>
## 1     250.     0     308 TRUE                   0.250
## 2     259.     1     308 FALSE                  0.259
## 3     251.     2     308 FALSE                  0.251
## 4     321.     3     308 FALSE                  0.321
## 5     357.     4     308 FALSE                  0.357
## 6     415.     5     308 FALSE                  0.415

mutate_at (apply a function to one or several columns)

data1 %>% 
  head
## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     250.     0     308
## 2     259.     1     308
## 3     251.     2     308
## 4     321.     3     308
## 5     357.     4     308
## 6     415.     5     308
# Apply log() to the Reaction and Subject columns, leaving Days unchanged.
# mutate_at() is superseded in current dplyr; the equivalent call is
#   mutate(across(c(Reaction, Subject), log))
data1 %>% 
  head %>%
  mutate_at(c("Reaction", "Subject"), log)
## # A tibble: 6 × 3
##   Reaction  Days Subject
##      <dbl> <dbl>   <dbl>
## 1     5.52     0    5.73
## 2     5.56     1    5.73
## 3     5.52     2    5.73
## 4     5.77     3    5.73
## 5     5.88     4    5.73
## 6     6.03     5    5.73

rename

data1 %>% 
  rename(ID = Subject)
## # A tibble: 180 × 3
##    Reaction  Days    ID
##       <dbl> <dbl> <dbl>
##  1     250.     0   308
##  2     259.     1   308
##  3     251.     2   308
##  4     321.     3   308
##  5     357.     4   308
##  6     415.     5   308
##  7     382.     6   308
##  8     290.     7   308
##  9     431.     8   308
## 10     466.     9   308
## # ℹ 170 more rows

summarise

data1 %>% 
    summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())
## # A tibble: 1 × 4
##   avg_reaction min_reaction max_reaction total
##          <dbl>        <dbl>        <dbl> <int>
## 1         299.         194.         466.   180
adata <- data1 %>% 
    summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())

group_by

tt <- data1 %>% 
      group_by(Subject) %>%
      summarise(avg_reaction = mean(Reaction), 
              min_reaction = min(Reaction),
              max_reaction = max(Reaction),
              total = n())
tt %>% head
## # A tibble: 6 × 5
##   Subject avg_reaction min_reaction max_reaction total
##     <dbl>        <dbl>        <dbl>        <dbl> <int>
## 1     308         342.         250.         466.    10
## 2     309         215.         203.         237.    10
## 3     310         231.         194.         261.    10
## 4     330         303.         280.         354.    10
## 5     331         309.         285          372.    10
## 6     332         307.         235.         454.    10

select (2)

tt %>% 
  head %>% 
  select(avg_reaction:max_reaction)
## # A tibble: 6 × 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.
tt %>% 
  head %>% 
  select(contains("reaction"))
## # A tibble: 6 × 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.

more options for select()

tt %>% 
  head %>% 
  select(ends_with("reaction"))
## # A tibble: 6 × 3
##   avg_reaction min_reaction max_reaction
##          <dbl>        <dbl>        <dbl>
## 1         342.         250.         466.
## 2         215.         203.         237.
## 3         231.         194.         261.
## 4         303.         280.         354.
## 5         309.         285          372.
## 6         307.         235.         454.
tt %>% 
  head %>% 
  select(starts_with("m"))
## # A tibble: 6 × 2
##   min_reaction max_reaction
##          <dbl>        <dbl>
## 1         250.         466.
## 2         203.         237.
## 3         194.         261.
## 4         280.         354.
## 5         285          372.
## 6         235.         454.

Select variables whose names are stored in a character vector

avar <- c("Reaction", "Days")
data1 %>% select(all_of(avar))
## # A tibble: 180 × 2
##    Reaction  Days
##       <dbl> <dbl>
##  1     250.     0
##  2     259.     1
##  3     251.     2
##  4     321.     3
##  5     357.     4
##  6     415.     5
##  7     382.     6
##  8     290.     7
##  9     431.     8
## 10     466.     9
## # ℹ 170 more rows
bvar <- c("Reaction", "Days", "Months")
data1 %>% select(any_of(bvar))
## # A tibble: 180 × 2
##    Reaction  Days
##       <dbl> <dbl>
##  1     250.     0
##  2     259.     1
##  3     251.     2
##  4     321.     3
##  5     357.     4
##  6     415.     5
##  7     382.     6
##  8     290.     7
##  9     431.     8
## 10     466.     9
## # ℹ 170 more rows

create a tibble

atibble <- tibble(A = 4:6, B = c("A", "B", "C"))
atibble
## # A tibble: 3 × 2
##       A B    
##   <int> <chr>
## 1     4 A    
## 2     5 B    
## 3     6 C
adataframe <- data.frame(A = 4:6, B = c("A", "B", "C"))
as_tibble(adataframe)
## # A tibble: 3 × 2
##       A B    
##   <int> <chr>
## 1     4 A    
## 2     5 B    
## 3     6 C
btibble <- atibble %>% rowid_to_column()
btibble
## # A tibble: 3 × 3
##   rowid     A B    
##   <int> <int> <chr>
## 1     1     4 A    
## 2     2     5 B    
## 3     3     6 C

merge data.frame

The data used in the merge examples

# Inline CSV text. The string starts with a line break, so its first line is
# empty and the header sits on the second line.
superheroes <- "
    name, alignment, gender,         publisher
 Magneto,       bad,   male,            Marvel
   Storm,      good, female,            Marvel
Mystique,       bad, female,            Marvel
  Batman,      good,   male,                DC
   Joker,       bad,   male,                DC
Catwoman,       bad, female,                DC
 Hellboy,      good,   male, Dark Horse Comics
"
# Parse the text as CSV; skip = 1 drops the empty first line. The name
# superheroes is reused, so from here on it refers to the parsed tibble.
superheroes <- read_csv(superheroes, skip = 1)
## Rows: 7 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, alignment, gender, publisher
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inline CSV text. The string starts with a line break, so its first line is
# empty and the header sits on the second line.
publishers <- "
  publisher, yr_founded
         DC,       1934
     Marvel,       1939
      Image,       1992
"
# Parse the text as CSV; skip = 1 drops the empty first line. The name
# publishers is reused, so from here on it refers to the parsed tibble.
publishers <- read_csv(publishers, skip = 1)
## Rows: 3 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): publisher
## dbl (1): yr_founded
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

inner_join

# No `by` argument is given, so the join uses the column the two tables share
# (publisher), as the message printed below reports.
inner_join(superheroes, publishers)
## Joining with `by = join_by(publisher)`
## # A tibble: 6 × 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <dbl>
## 1 Magneto  bad       male   Marvel          1939
## 2 Storm    good      female Marvel          1939
## 3 Mystique bad       female Marvel          1939
## 4 Batman   good      male   DC              1934
## 5 Joker    bad       male   DC              1934
## 6 Catwoman bad       female DC              1934
inner_join(publishers, superheroes)
## Joining with `by = join_by(publisher)`
## # A tibble: 6 × 5
##   publisher yr_founded name     alignment gender
##   <chr>          <dbl> <chr>    <chr>     <chr> 
## 1 DC              1934 Batman   good      male  
## 2 DC              1934 Joker    bad       male  
## 3 DC              1934 Catwoman bad       female
## 4 Marvel          1939 Magneto  bad       male  
## 5 Marvel          1939 Storm    good      female
## 6 Marvel          1939 Mystique bad       female

left_join

left_join(superheroes, publishers)
## Joining with `by = join_by(publisher)`
## # A tibble: 7 × 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <dbl>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA
left_join(publishers, superheroes)
## Joining with `by = join_by(publisher)`
## # A tibble: 7 × 5
##   publisher yr_founded name     alignment gender
##   <chr>          <dbl> <chr>    <chr>     <chr> 
## 1 DC              1934 Batman   good      male  
## 2 DC              1934 Joker    bad       male  
## 3 DC              1934 Catwoman bad       female
## 4 Marvel          1939 Magneto  bad       male  
## 5 Marvel          1939 Storm    good      female
## 6 Marvel          1939 Mystique bad       female
## 7 Image           1992 <NA>     <NA>      <NA>

right_join

right_join(superheroes, publishers)
## Joining with `by = join_by(publisher)`
## # A tibble: 7 × 5
##   name     alignment gender publisher yr_founded
##   <chr>    <chr>     <chr>  <chr>          <dbl>
## 1 Magneto  bad       male   Marvel          1939
## 2 Storm    good      female Marvel          1939
## 3 Mystique bad       female Marvel          1939
## 4 Batman   good      male   DC              1934
## 5 Joker    bad       male   DC              1934
## 6 Catwoman bad       female DC              1934
## 7 <NA>     <NA>      <NA>   Image           1992
right_join(publishers, superheroes)
## Joining with `by = join_by(publisher)`
## # A tibble: 7 × 5
##   publisher         yr_founded name     alignment gender
##   <chr>                  <dbl> <chr>    <chr>     <chr> 
## 1 DC                      1934 Batman   good      male  
## 2 DC                      1934 Joker    bad       male  
## 3 DC                      1934 Catwoman bad       female
## 4 Marvel                  1939 Magneto  bad       male  
## 5 Marvel                  1939 Storm    good      female
## 6 Marvel                  1939 Mystique bad       female
## 7 Dark Horse Comics         NA Hellboy  good      male

anti_join

anti_join(superheroes, publishers)
## Joining with `by = join_by(publisher)`
## # A tibble: 1 × 4
##   name    alignment gender publisher        
##   <chr>   <chr>     <chr>  <chr>            
## 1 Hellboy good      male   Dark Horse Comics
anti_join(publishers, superheroes)
## Joining with `by = join_by(publisher)`
## # A tibble: 1 × 2
##   publisher yr_founded
##   <chr>          <dbl>
## 1 Image           1992

full_join

full_join(superheroes, publishers)
## Joining with `by = join_by(publisher)`
## # A tibble: 8 × 5
##   name     alignment gender publisher         yr_founded
##   <chr>    <chr>     <chr>  <chr>                  <dbl>
## 1 Magneto  bad       male   Marvel                  1939
## 2 Storm    good      female Marvel                  1939
## 3 Mystique bad       female Marvel                  1939
## 4 Batman   good      male   DC                      1934
## 5 Joker    bad       male   DC                      1934
## 6 Catwoman bad       female DC                      1934
## 7 Hellboy  good      male   Dark Horse Comics         NA
## 8 <NA>     <NA>      <NA>   Image                   1992
full_join(publishers, superheroes)
## Joining with `by = join_by(publisher)`
## # A tibble: 8 × 5
##   publisher         yr_founded name     alignment gender
##   <chr>                  <dbl> <chr>    <chr>     <chr> 
## 1 DC                      1934 Batman   good      male  
## 2 DC                      1934 Joker    bad       male  
## 3 DC                      1934 Catwoman bad       female
## 4 Marvel                  1939 Magneto  bad       male  
## 5 Marvel                  1939 Storm    good      female
## 6 Marvel                  1939 Mystique bad       female
## 7 Image                   1992 <NA>     <NA>      <NA>  
## 8 Dark Horse Comics         NA Hellboy  good      male

tidyr

spread

spread example

# Long to wide: one row per Subject and one column per value of Days, filled
# with the Reaction values.
# spread() is superseded in current tidyr; the equivalent call is
#   pivot_wider(names_from = Days, values_from = Reaction)
data1_wide <- data1 %>% spread(Days, Reaction)
head(data1_wide)
## # A tibble: 6 × 11
##   Subject   `0`   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`
##     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     308  250.  259.  251.  321.  357.  415.  382.  290.  431.  466.
## 2     309  223.  205.  203.  205.  208.  216.  214.  218.  224.  237.
## 3     310  199.  194.  234.  233.  229.  220.  235.  256.  261.  248.
## 4     330  322.  300.  284.  285.  286.  298.  280.  318.  305.  354.
## 5     331  288.  285   302.  320.  316.  293.  290.  335.  294.  372.
## 6     332  235.  243.  273.  310.  317.  310.  454.  347.  330.  254.

gather

gather example

# Wide to long: columns "0" through "9" are stacked; the old column names go
# into ddays and the cell values into rreaction.
# gather() is superseded in current tidyr; the equivalent call is
#   pivot_longer("0":"9", names_to = "ddays", values_to = "rreaction")
# NOTE(review): the row order of the pivot_longer() result may differ from
# gather() -- confirm before swapping.
data1_long <- data1_wide %>% gather(ddays, rreaction, "0":"9")
head(data1_long)
## # A tibble: 6 × 3
##   Subject ddays rreaction
##     <dbl> <chr>     <dbl>
## 1     308 0          250.
## 2     309 0          223.
## 3     310 0          199.
## 4     330 0          322.
## 5     331 0          288.
## 6     332 0          235.

unite

unite example

data1_unite<- data1 %>% unite(Subject_Days, Subject, Days, sep="_")
head(data1_unite)
## # A tibble: 6 × 2
##   Reaction Subject_Days
##      <dbl> <chr>       
## 1     250. 308_0       
## 2     259. 308_1       
## 3     251. 308_2       
## 4     321. 308_3       
## 5     357. 308_4       
## 6     415. 308_5
# Variant that keeps the original Subject and Days columns (remove = FALSE).
# It is stored under its own name so that data1_unite, which later examples
# reuse, still holds the two-column result created above.
data1_unite_keep <- data1 %>%
  unite(Subject_Days, Subject, Days, sep = "_", remove = FALSE)
head(data1_unite_keep)

separate

separate example

data1_separate<- data1_unite %>% separate(Subject_Days, c("subjects", "days"), sep="_")
head(data1_separate)
## # A tibble: 6 × 3
##   Reaction subjects days 
##      <dbl> <chr>    <chr>
## 1     250. 308      0    
## 2     259. 308      1    
## 3     251. 308      2    
## 4     321. 308      3    
## 5     357. 308      4    
## 6     415. 308      5
data1_separate<- data1_unite %>% separate(Subject_Days, c("subjects", "days"), sep="_", remove = FALSE)
head(data1_separate)

stringr

stringr package contains a set of commonly used string manipulation functions.

stringr cheatsheet: https://github.com/rstudio/cheatsheets/blob/master/strings.pdf

Detect Matches (1)

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_detect(colorVec, "e") ## contains e
## [1]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
str_detect(colorVec, "e$") ## ends with e
## [1] FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE
str_detect(colorVec, "^e") ## starts with e
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Detect Matches (2)

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_which(colorVec, "e") ## positions of the strings that contain e
## [1] 1 2 3 4 6 7 8
str_which(colorVec, "e$") ## positions of the strings that end with e
## [1] 2 6 7 8
str_which(colorVec, "^e") ## positions of the strings that start with e (none here)
## integer(0)

Detect Matches (3)

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_count(colorVec, "e") ## number of e's in each string
## [1] 1 1 2 1 0 1 1 1
str_count(colorVec, "e$") ## 1 if the string ends with e, otherwise 0
## [1] 0 1 0 0 0 1 1 1
str_count(colorVec, "^e") ## 1 if the string starts with e, otherwise 0
## [1] 0 0 0 0 0 0 0 0

Subset Strings (1)

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_sub(colorVec,2,2)
## [1] "e" "l" "r" "e" "l" "r" "u" "h"
str_sub(colorVec,start = 1, end=-2)
## [1] "re"    "blu"   "gree"  "yello" "blac"  "orang" "purpl" "whit"
str_sub(colorVec,end=-2)
## [1] "re"    "blu"   "gree"  "yello" "blac"  "orang" "purpl" "whit"
str_sub(colorVec,start = 2, end = -1)
## [1] "ed"    "lue"   "reen"  "ellow" "lack"  "range" "urple" "hite"
str_sub(colorVec,start = 2)
## [1] "ed"    "lue"   "reen"  "ellow" "lack"  "range" "urple" "hite"

Subset Strings (2)

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_subset(colorVec,"e")
## [1] "red"    "blue"   "green"  "yellow" "orange" "purple" "white"
str_subset(colorVec,"e$")
## [1] "blue"   "orange" "purple" "white"
str_subset(colorVec,"^e")
## character(0)

Subset Strings (3)

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_extract(colorVec,"e")
## [1] "e" "e" "e" "e" NA  "e" "e" "e"
str_extract(colorVec,"[aeiou]") ## the first match
## [1] "e" "u" "e" "e" "a" "o" "u" "i"
str_extract_all(colorVec,"[aeiou]") ## all matches
## [[1]]
## [1] "e"
## 
## [[2]]
## [1] "u" "e"
## 
## [[3]]
## [1] "e" "e"
## 
## [[4]]
## [1] "e" "o"
## 
## [[5]]
## [1] "a"
## 
## [[6]]
## [1] "o" "a" "e"
## 
## [[7]]
## [1] "u" "e"
## 
## [[8]]
## [1] "i" "e"

Manage lengths

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_length(colorVec)
## [1] 3 4 5 6 5 6 6 5
matrix(str_pad(colorVec, width = 7),ncol=1)
##      [,1]     
## [1,] "    red"
## [2,] "   blue"
## [3,] "  green"
## [4,] " yellow"
## [5,] "  black"
## [6,] " orange"
## [7,] " purple"
## [8,] "  white"

Mutate Strings

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_sub(colorVec,1,1) <- "Z" ## will change the original string vector
colorVec
## [1] "Zed"    "Zlue"   "Zreen"  "Zellow" "Zlack"  "Zrange" "Zurple" "Zhite"
colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")
str_replace(colorVec, "e", "E") 
## [1] "rEd"    "bluE"   "grEen"  "yEllow" "black"  "orangE" "purplE" "whitE"
str_replace_all(colorVec, "e", "E")
## [1] "rEd"    "bluE"   "grEEn"  "yEllow" "black"  "orangE" "purplE" "whitE"
str_to_lower(colorVec)
## [1] "red"    "blue"   "green"  "yellow" "black"  "orange" "purple" "white"
str_to_upper(colorVec)
## [1] "RED"    "BLUE"   "GREEN"  "YELLOW" "BLACK"  "ORANGE" "PURPLE" "WHITE"
str_to_title(colorVec) ## title case: capitalise the first letter of each word
## [1] "Red"    "Blue"   "Green"  "Yellow" "Black"  "Orange" "Purple" "White"

Join and split

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")


str_c(colorVec, seq_along(colorVec))
## [1] "red1"    "blue2"   "green3"  "yellow4" "black5"  "orange6" "purple7"
## [8] "white8"
str_c(colorVec, collapse = "::")
## [1] "red::blue::green::yellow::black::orange::purple::white"

Order Strings

colorVec <- c("red", "blue", "green", "yellow", "black", "orange", "purple", "white")

str_order(colorVec) ## same as order(colorVec)
## [1] 5 2 3 6 7 1 8 4
str_sort(colorVec) ## same as sort(colorVec)
## [1] "black"  "blue"   "green"  "orange" "purple" "red"    "white"  "yellow"

references