Zhiguang Huo (Caleb)
Monday August 30, 2021
The tidyverse is a collection of R packages designed for data science.
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## filter, lag
## The following objects are masked from 'package:base':
## intersect, setdiff, setequal, union
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.4 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data source: sleepstudy.csv (also available in the R lme4 package)
Original way to read in data
# Location of the sleepstudy CSV on the course website.
asleepfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.csv"
# Base R reader: the result is a plain data.frame.
data0 <- read.csv(asleepfile)
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## 4 321.4398 3 308
## 5 356.8519 4 308
## 6 414.6901 5 308
## [1] "data.frame"
## # A tibble: 180 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## 9 431. 8 308
## 10 466. 9 308
## # … with 170 more rows
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
# The Excel copy of the data is hosted at the URL in the commented-out line
# below. read_excel() is called on a relative local path, so sleepstudy.xlsx
# must already be present in the working directory.
## bsleepfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.xlsx"
data0 <- read_excel("sleepstudy.xlsx")
# Location of the SAS dataset on the course website.
salesfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sales.sas7bdat"
# read_sas() is handed the URL directly.
data0 <- read_sas(salesfile)
## # A tibble: 36 x 3
## <dbl> <dbl> <dbl>
## 1 1950 12.9 182.
## 2 1951 11.9 245
## 3 1952 10.7 250.
## 4 1953 11.3 266.
## 5 1954 11.2 248.
## 6 1955 15.1 278.
## 7 1956 16.2 307.
## 8 1957 15.4 320
## 9 1958 12.7 305.
## 10 1959 16.3 338
## # … with 26 more rows
## [1] 249.5600 258.7047 250.8006
## Reaction Days Subject
## [1,] 249.5600 0 308
## [2,] 258.7047 1 308
## [3,] 250.8006 2 308
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## # A tibble: 180 x 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # … with 170 more rows
## [1] 2.718282
## [1] 2.718282
## [1] 1
## # A tibble: 10 x 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535
## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535
## # A tibble: 8 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## [1] 4 5 6 7 9 10
## [1] 4 5 6 7 9 10
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 199. 0 310
## 2 222. 0 352
## 3 223. 0 309
## 4 225. 0 370
## 5 235. 0 332
## 6 236. 0 349
## Reaction Days Subject
## 1339.693 0.000 2022.000
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 237. 9 335
## 2 237. 9 309
## 3 248. 9 310
## 4 254. 9 332
## 5 348. 9 351
## 6 352. 9 349
## # A tibble: 6 x 4
## Reaction Days Subject Reaction_binary
## <dbl> <dbl> <dbl> <lgl>
## 1 250. 0 308 TRUE
## 2 259. 1 308 FALSE
## 3 251. 2 308 FALSE
## 4 321. 3 308 FALSE
## 5 357. 4 308 FALSE
## 6 415. 5 308 FALSE
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## # A tibble: 180 x 3
## Reaction Days ID
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## 9 431. 8 308
## 10 466. 9 308
## # … with 170 more rows
# Overall summary: a single row computed over every observation in data1.
data1 %>%
  summarise(
    avg_reaction = mean(Reaction),
    min_reaction = min(Reaction),
    max_reaction = max(Reaction),
    total = n()
  )

# Per-subject summary: the same statistics, computed within each Subject.
tt <- data1 %>%
  group_by(Subject) %>%
  summarise(
    avg_reaction = mean(Reaction),
    min_reaction = min(Reaction),
    max_reaction = max(Reaction),
    total = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 5
## Subject avg_reaction min_reaction max_reaction total
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 308 342. 250. 466. 10
## 2 309 215. 203. 237. 10
## 3 310 231. 194. 261. 10
## 4 330 303. 280. 354. 10
## 5 331 309. 285 372. 10
## 6 332 307. 235. 454. 10
## # A tibble: 6 x 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
## # A tibble: 6 x 2
## min_reaction max_reaction
## <dbl> <dbl>
## 1 250. 466.
## 2 203. 237.
## 3 194. 261.
## 4 280. 354.
## 5 285 372.
## 6 235. 454.
## # A tibble: 180 x 2
## Reaction Days
## <dbl> <dbl>
## 1 250. 0
## 2 259. 1
## 3 251. 2
## 4 321. 3
## 5 357. 4
## 6 415. 5
## 7 382. 6
## 8 290. 7
## 9 431. 8
## 10 466. 9
## # … with 170 more rows
## # A tibble: 3 x 2
## A B
## <int> <chr>
## 1 4 A
## 2 5 B
## 3 6 C
# Inline CSV text. The literal starts with a newline, so its first line is
# empty; that is why read_csv() is called with skip = 1 below.
superheroes <- "
name, alignment, gender, publisher
Magneto, bad, male, Marvel
Storm, good, female, Marvel
Mystique, bad, female, Marvel
Batman, good, male, DC
Joker, bad, male, DC
Catwoman, bad, female, DC
Hellboy, good, male, Dark Horse Comics
"
# Parse the literal text into a tibble, replacing the raw string.
superheroes <- read_csv(superheroes, skip = 1)
# Inline CSV text. The literal starts with a newline, so its first line is
# empty; that is why read_csv() is called with skip = 1 below.
publishers <- "
publisher, yr_founded
DC, 1934
Marvel, 1939
Image, 1992
"
# Parse the literal text into a tibble, replacing the raw string.
publishers <- read_csv(publishers, skip = 1)
## Joining, by = "publisher"
## Joining, by = "publisher"
## # A tibble: 6 x 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## Joining, by = "publisher"
## Joining, by = "publisher"
## # A tibble: 7 x 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Image 1992 <NA> <NA> <NA>
## Joining, by = "publisher"
## Joining, by = "publisher"
## # A tibble: 7 x 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Dark Horse Comics NA Hellboy good male
## Joining, by = "publisher"
## Joining, by = "publisher"
## # A tibble: 1 x 2
## publisher yr_founded
## <chr> <dbl>
## 1 Image 1992
## Joining, by = "publisher"
## Joining, by = "publisher"
## # A tibble: 8 x 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Image 1992 <NA> <NA> <NA>
## 8 Dark Horse Comics NA Hellboy good male
## # A tibble: 6 x 3
## Subject ddays rreaction
## <dbl> <chr> <dbl>
## 1 308 0 250.
## 2 309 0 223.
## 3 310 0 199.
## 4 330 0 322.
## 5 331 0 288.
## 6 332 0 235.
# Split the combined Subject_Days column at "_" into two character columns.
data1_separate <- data1_unite %>%
  separate(Subject_Days, into = c("subjects", "days"), sep = "_")
## # A tibble: 6 x 3
## Reaction subjects days
## <dbl> <chr> <chr>
## 1 250. 308 0
## 2 259. 308 1
## 3 251. 308 2
## 4 321. 308 3
## 5 357. 308 4
## 6 415. 308 5
stringr package contains a set of commonly used string manipulation functions.
stringr cheatsheet: https://github.com/rstudio/cheatsheets/blob/master/strings.pdf
## [1] 1 2 3 4 6 7 8
## [1] 2 6 7 8
## integer(0)
## [1] 1 1 2 1 0 1 1 1
## [1] 0 1 0 0 0 1 1 1
## [1] 0 0 0 0 0 0 0 0
## [1] "e" "l" "r" "e" "l" "r" "u" "h"
## [1] "re" "blu" "gree" "yello" "blac" "orang" "purpl" "whit"
## [1] "re" "blu" "gree" "yello" "blac" "orang" "purpl" "whit"
## [1] "ed" "lue" "reen" "ellow" "lack" "range" "urple" "hite"
## [1] "ed" "lue" "reen" "ellow" "lack" "range" "urple" "hite"
## [1] "red" "blue" "green" "yellow" "orange" "purple" "white"
## [1] "blue" "orange" "purple" "white"
## character(0)
## [1] "e" "e" "e" "e" NA "e" "e" "e"
## [1] "e" "u" "e" "e" "a" "o" "u" "i"
## [[1]]
## [1] "e"
## [[2]]
## [1] "u" "e"
## [[3]]
## [1] "e" "e"
## [[4]]
## [1] "e" "o"
## [[5]]
## [1] "a"
## [[6]]
## [1] "o" "a" "e"
## [[7]]
## [1] "u" "e"
## [[8]]
## [1] "i" "e"
# Example character vector used by the string-manipulation demos.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
## [1] 3 4 5 6 5 6 6 5
## [,1]
## [1,] " red"
## [2,] " blue"
## [3,] " green"
## [4,] " yellow"
## [5,] " black"
## [6,] " orange"
## [7,] " purple"
## [8,] " white"
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Replacement form of str_sub(): overwrites the first character of every
# element, so colorVec itself is changed.
str_sub(colorVec, start = 1, end = 1) <- "Z"
## [1] "Zed" "Zlue" "Zreen" "Zellow" "Zlack" "Zrange" "Zurple" "Zhite"
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Swap the first "e" found in each element for "E"; colorVec is not modified.
str_replace(colorVec, pattern = "e", replacement = "E")
## [1] "rEd" "bluE" "grEen" "yEllow" "black" "orangE" "purplE" "whitE"
## [1] "rEd" "bluE" "grEEn" "yEllow" "black" "orangE" "purplE" "whitE"
## [1] "red" "blue" "green" "yellow" "black" "orange" "purple" "white"
## [1] "Red" "Blue" "Green" "Yellow" "Black" "Orange" "Purple" "White"
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Element-wise concatenation of each color with its position in the vector.
str_c(colorVec, seq_along(colorVec))
## [1] "red1" "blue2" "green3" "yellow4" "black5" "orange6" "purple7"
## [8] "white8"
## [1] "red::blue::green::yellow::black::orange::purple::white"
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Indices that put colorVec in sorted order; same as order(colorVec).
str_order(colorVec)
## [1] 5 2 3 6 7 1 8 4
## [1] "black" "blue" "green" "orange" "purple" "red" "white" "yellow"