Zhiguang Huo (Caleb)
Monday September 14, 2020
The tidyverse is a collection of R packages designed for data science.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ forcats 0.4.0
## ✓ purrr 0.3.3
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Data source: sleepstudy.csv (also available in the R lme4 package)
Original (base R) way to read in the data
# Location of the sleep study CSV. The scheme must be followed by exactly two
# slashes ("https://"); a third slash leaves the host part of the URL empty.
asleepfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.csv"
# Base R reader: the result is a plain data.frame.
data0 <- read.csv(asleepfile)
## Parsed with column specification:
## cols(
## Reaction = col_double(),
## Days = col_double(),
## Subject = col_double()
## )
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## 4 321.4398 3 308
## 5 356.8519 4 308
## 6 414.6901 5 308
## [1] "data.frame"
## # A tibble: 180 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## 9 431. 8 308
## 10 466. 9 308
## # … with 170 more rows
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
## Parsed with column specification:
## cols(
## Reaction = col_double(),
## Days = col_double(),
## Subject = col_double()
## )
library(haven)
# Location of the SAS data set. The scheme must be followed by exactly two
# slashes ("https://"); a third slash leaves the host part of the URL empty.
salesfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sales.sas7bdat"
# haven::read_sas() parses the .sas7bdat file and returns a tibble.
data0 <- read_sas(salesfile)
data0
## # A tibble: 36 x 3
## YEAR P S
## <dbl> <dbl> <dbl>
## 1 1950 12.9 182.
## 2 1951 11.9 245
## 3 1952 10.7 250.
## 4 1953 11.3 266.
## 5 1954 11.2 248.
## 6 1955 15.1 278.
## 7 1956 16.2 307.
## 8 1957 15.4 320
## 9 1958 12.7 305.
## 10 1959 16.3 338
## # … with 26 more rows
## [1] 249.5600 258.7047 250.8006
## Reaction Days Subject
## [1,] 249.5600 0 308
## [2,] 258.7047 1 308
## [3,] 250.8006 2 308
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## # A tibble: 180 x 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # … with 170 more rows
## # A tibble: 180 x 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # … with 170 more rows
## [1] 2.718282
## [1] 2.718282
## [1] 1
## # A tibble: 180 x 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # … with 170 more rows
## # A tibble: 10 x 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # A tibble: 6 x 2
## Days Subject
## <dbl> <dbl>
## 1 3 308
## 2 4 308
## 3 5 308
## 4 6 308
## 5 8 308
## 6 9 308
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535
## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535
## [1] 4 5 6 7 9 10
## [1] 4 5 6 7 9 10
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## # A tibble: 8 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## # A tibble: 8 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 194. 1 310
## 2 199. 0 310
## 3 203. 2 309
## 4 205. 3 309
## 5 205. 1 309
## 6 208. 4 309
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 199. 0 310
## 2 222. 0 352
## 3 223. 0 309
## 4 225. 0 370
## 5 235. 0 332
## 6 236. 0 349
## Reaction Days Subject
## 1339.693 0.000 2022.000
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 237. 9 335
## 2 237. 9 309
## 3 248. 9 310
## 4 254. 9 332
## 5 348. 9 351
## 6 352. 9 349
## # A tibble: 6 x 4
## Reaction Days Subject Reaction_binary
## <dbl> <dbl> <dbl> <lgl>
## 1 250. 0 308 TRUE
## 2 259. 1 308 FALSE
## 3 251. 2 308 FALSE
## 4 321. 3 308 FALSE
## 5 357. 4 308 FALSE
## 6 415. 5 308 FALSE
## # A tibble: 6 x 5
## Reaction Days Subject Reaction_binary Reaction_sec
## <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 250. 0 308 TRUE 0.250
## 2 259. 1 308 FALSE 0.259
## 3 251. 2 308 FALSE 0.251
## 4 321. 3 308 FALSE 0.321
## 5 357. 4 308 FALSE 0.357
## 6 415. 5 308 FALSE 0.415
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 5.52 0 5.73
## 2 5.56 1 5.73
## 3 5.52 2 5.73
## 4 5.77 3 5.73
## 5 5.88 4 5.73
## 6 6.03 5 5.73
## # A tibble: 180 x 3
## Reaction Days ID
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## 9 431. 8 308
## 10 466. 9 308
## # … with 170 more rows
# One-row summary over the whole table: mean, minimum and maximum of
# Reaction, plus the number of rows (n()).
summarise(
  data1,
  avg_reaction = mean(Reaction),
  min_reaction = min(Reaction),
  max_reaction = max(Reaction),
  total = n()
)
## # A tibble: 1 x 4
## avg_reaction min_reaction max_reaction total
## <dbl> <dbl> <dbl> <int>
## 1 299. 194. 466. 180
# Same summary statistics as above, but computed once per Subject:
# group first, then collapse each group to a single row.
tt <- summarise(
  group_by(data1, Subject),
  avg_reaction = mean(Reaction),
  min_reaction = min(Reaction),
  max_reaction = max(Reaction),
  total = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 5
## Subject avg_reaction min_reaction max_reaction total
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 308 342. 250. 466. 10
## 2 309 215. 203. 237. 10
## 3 310 231. 194. 261. 10
## 4 330 303. 280. 354. 10
## 5 331 309. 285 372. 10
## 6 332 307. 235. 454. 10
## # A tibble: 6 x 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
## # A tibble: 6 x 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
## # A tibble: 6 x 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
## # A tibble: 6 x 2
## min_reaction max_reaction
## <dbl> <dbl>
## 1 250. 466.
## 2 203. 237.
## 3 194. 261.
## 4 280. 354.
## 5 285 372.
## 6 235. 454.
## # A tibble: 3 x 2
## A B
## <int> <chr>
## 1 4 A
## 2 5 B
## 3 6 C
## # A tibble: 3 x 2
## A B
## <int> <fct>
## 1 4 A
## 2 5 B
## 3 6 C
## # A tibble: 3 x 3
## rowid A B
## <int> <int> <chr>
## 1 1 4 A
## 2 2 5 B
## 3 3 6 C
# Inline CSV text for the superheroes table, parsed straight into a tibble.
# The literal starts with a line break, so skip = 1 drops that empty first
# line and the next line is used as the header.
superheroes <- "
name, alignment, gender, publisher
Magneto, bad, male, Marvel
Storm, good, female, Marvel
Mystique, bad, female, Marvel
Batman, good, male, DC
Joker, bad, male, DC
Catwoman, bad, female, DC
Hellboy, good, male, Dark Horse Comics
" %>%
  read_csv(skip = 1)
# Inline CSV text for the publishers lookup table, parsed straight into a
# tibble. skip = 1 drops the empty first line of the literal so that the
# next line is used as the header.
publishers <- "
publisher, yr_founded
DC, 1934
Marvel, 1939
Image, 1992
" %>%
  read_csv(skip = 1)
## Joining, by = "publisher"
## # A tibble: 6 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <dbl>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## Joining, by = "publisher"
## # A tibble: 6 x 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## Joining, by = "publisher"
## # A tibble: 7 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <dbl>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## Joining, by = "publisher"
## # A tibble: 7 x 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Image 1992 <NA> <NA> <NA>
## Joining, by = "publisher"
## # A tibble: 7 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <dbl>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 <NA> <NA> <NA> Image 1992
## Joining, by = "publisher"
## # A tibble: 7 x 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Dark Horse Comics NA Hellboy good male
## Joining, by = "publisher"
## # A tibble: 1 x 4
## name alignment gender publisher
## <chr> <chr> <chr> <chr>
## 1 Hellboy good male Dark Horse Comics
## Joining, by = "publisher"
## # A tibble: 1 x 2
## publisher yr_founded
## <chr> <dbl>
## 1 Image 1992
## Joining, by = "publisher"
## # A tibble: 8 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <dbl>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## 8 <NA> <NA> <NA> Image 1992
## Joining, by = "publisher"
## # A tibble: 8 x 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Image 1992 <NA> <NA> <NA>
## 8 Dark Horse Comics NA Hellboy good male
## # A tibble: 6 x 11
## Subject `0` `1` `2` `3` `4` `5` `6` `7` `8` `9`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 308 250. 259. 251. 321. 357. 415. 382. 290. 431. 466.
## 2 309 223. 205. 203. 205. 208. 216. 214. 218. 224. 237.
## 3 310 199. 194. 234. 233. 229. 220. 235. 256. 261. 248.
## 4 330 322. 300. 284. 285. 286. 298. 280. 318. 305. 354.
## 5 331 288. 285 302. 320. 316. 293. 290. 335. 294. 372.
## 6 332 235. 243. 273. 310. 317. 310. 454. 347. 330. 254.
## # A tibble: 6 x 3
## Subject ddays rreaction
## <dbl> <chr> <dbl>
## 1 308 0 250.
## 2 309 0 223.
## 3 310 0 199.
## 4 330 0 322.
## 5 331 0 288.
## 6 332 0 235.
## # A tibble: 6 x 2
## Reaction Subject_Days
## <dbl> <chr>
## 1 250. 308_0
## 2 259. 308_1
## 3 251. 308_2
## 4 321. 308_3
## 5 357. 308_4
## 6 415. 308_5
# Split the combined Subject_Days column back into two character columns,
# cutting at the underscore.
data1_separate <- separate(
  data1_unite,
  col = Subject_Days,
  into = c("subjects", "days"),
  sep = "_"
)
head(data1_separate)
## # A tibble: 6 x 3
## Reaction subjects days
## <dbl> <chr> <chr>
## 1 250. 308 0
## 2 259. 308 1
## 3 251. 308 2
## 4 321. 308 3
## 5 357. 308 4
## 6 415. 308 5
The stringr package contains a set of commonly used string manipulation functions.
stringr cheatsheet: https://github.com/rstudio/cheatsheets/blob/master/strings.pdf
## [1] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [1] FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [1] 1 2 3 4 6 7 8
## [1] 2 6 7 8
## integer(0)
## [1] 1 1 2 1 0 1 1 1
## [1] 0 1 0 0 0 1 1 1
## [1] 0 0 0 0 0 0 0 0
## [1] "e" "l" "r" "e" "l" "r" "u" "h"
## [1] "re" "blu" "gree" "yello" "blac" "orang" "purpl" "whit"
## [1] "re" "blu" "gree" "yello" "blac" "orang" "purpl" "whit"
## [1] "ed" "lue" "reen" "ellow" "lack" "range" "urple" "hite"
## [1] "ed" "lue" "reen" "ellow" "lack" "range" "urple" "hite"
## [1] "red" "blue" "green" "yellow" "orange" "purple" "white"
## [1] 1 2 3 4 6 7 8
## [1] "blue" "orange" "purple" "white"
## [1] 2 6 7 8
## [1] "e" "e" "e" "e" NA "e" "e" "e"
## [1] "e" "u" "e" "e" "a" "o" "u" "i"
## [[1]]
## [1] "e"
##
## [[2]]
## [1] "u" "e"
##
## [[3]]
## [1] "e" "e"
##
## [[4]]
## [1] "e" "o"
##
## [[5]]
## [1] "a"
##
## [[6]]
## [1] "o" "a" "e"
##
## [[7]]
## [1] "u" "e"
##
## [[8]]
## [1] "i" "e"
# Example vector of colour names used by the stringr demonstrations.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Number of characters in each element.
str_length(string = colorVec)
## [1] 3 4 5 6 5 6 6 5
## [,1]
## [1,] " red"
## [2,] " blue"
## [3,] " green"
## [4,] " yellow"
## [5,] " black"
## [6,] " orange"
## [7,] " purple"
## [8,] " white"
# Reset the example vector.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Replacement form of str_sub(): this modifies colorVec itself, overwriting
# the first character of every element with "Z".
str_sub(colorVec, start = 1, end = 1) <- "Z"
colorVec
## [1] "Zed" "Zlue" "Zreen" "Zellow" "Zlack" "Zrange" "Zurple" "Zhite"
# Reset the example vector.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Replace only the first "e" in each element with "E"; colorVec itself is
# left unchanged because the result is not assigned back.
str_replace(colorVec, pattern = "e", replacement = "E")
## [1] "rEd" "bluE" "grEen" "yEllow" "black" "orangE" "purplE" "whitE"
## [1] "rEd" "bluE" "grEEn" "yEllow" "black" "orangE" "purplE" "whitE"
## [1] "red" "blue" "green" "yellow" "black" "orange" "purple" "white"
## [1] "RED" "BLUE" "GREEN" "YELLOW" "BLACK" "ORANGE" "PURPLE" "WHITE"
## [1] "Red" "Blue" "Green" "Yellow" "Black" "Orange" "Purple" "White"
# Reset the example vector.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Element-wise concatenation: append each element's position to its name,
# with no separator between the two pieces.
str_c(colorVec, seq_along(colorVec), sep = "")
## [1] "red1" "blue2" "green3" "yellow4" "black5" "orange6" "purple7"
## [8] "white8"
## [1] "red::blue::green::yellow::black::orange::purple::white"
# Reset the example vector.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Indices that would sort the vector in increasing order
# (same as order(colorVec)).
str_order(colorVec, decreasing = FALSE)
## [1] 5 2 3 6 7 1 8 4
## [1] "black" "blue" "green" "orange" "purple" "red" "white" "yellow"