Zhiguang Huo (Caleb)
Thursday September 7, 2023
The tidyverse is a collection of R packages designed for data science.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ purrr 1.0.1
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Data source: sleepstudy.csv (also available in the R lme4 package)
Original way to read in data
# Location of the sleepstudy CSV on the course website.
# A well-formed URL is scheme + "//" + host; a third slash would leave the
# host part empty and only works with clients that tolerate it.
asleepfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.csv"
# Base R reader (utils::read.csv); the result is a plain data.frame.
data0 <- read.csv(asleepfile)
## Rows: 180 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Reaction, Days, Subject
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## 4 321.4398 3 308
## 5 356.8519 4 308
## 6 414.6901 5 308
## [1] "data.frame"
## # A tibble: 180 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## 9 431. 8 308
## 10 466. 9 308
## # ℹ 170 more rows
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
## Rows: 180 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Reaction, Days, Subject
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(readxl)
# Reads a local copy of the workbook from the working directory.
# Hosted copy (presumably the download source -- confirm):
#   https://Caleb-Huo.github.io/teaching/data/sleep/sleepstudy.xlsx
data0 <- read_excel(path = "sleepstudy.xlsx")
data0
library(haven)
# SAS dataset (.sas7bdat) hosted on the course website.
# The URL uses the standard "https://" prefix (scheme, two slashes, host).
salesfile <- "https://Caleb-Huo.github.io/teaching/data/sleep/sales.sas7bdat"
# read_sas() returns the data as a tibble.
data0 <- read_sas(salesfile)
data0
## # A tibble: 36 × 3
## YEAR P S
## <dbl> <dbl> <dbl>
## 1 1950 12.9 182.
## 2 1951 11.9 245
## 3 1952 10.7 250.
## 4 1953 11.3 266.
## 5 1954 11.2 248.
## 6 1955 15.1 278.
## 7 1956 16.2 307.
## 8 1957 15.4 320
## 9 1958 12.7 305.
## 10 1959 16.3 338
## # ℹ 26 more rows
## [1] 249.5600 258.7047 250.8006
## Reaction Days Subject
## [1,] 249.5600 0 308
## [2,] 258.7047 1 308
## [3,] 250.8006 2 308
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## # A tibble: 180 × 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # ℹ 170 more rows
## # A tibble: 180 × 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # ℹ 170 more rows
## [1] 2.718282
## [1] 2.718282
## [1] 1
## # A tibble: 180 × 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # ℹ 170 more rows
## # A tibble: 10 × 2
## Days Subject
## <dbl> <dbl>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # A tibble: 6 × 2
## Days Subject
## <dbl> <dbl>
## 1 3 308
## 2 4 308
## 3 5 308
## 4 6 308
## 5 8 308
## 6 9 308
## # A tibble: 6 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## # A tibble: 6 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535
## [1] 321.4398 356.8519 414.6901 382.2038 430.5853 466.3535
## # A tibble: 8 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## # A tibble: 8 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## [1] 4 5 6 7 9 10
## [1] 4 5 6 7 9 10
## # A tibble: 6 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## # A tibble: 6 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 194. 1 310
## 2 199. 0 310
## 3 203. 2 309
## 4 205. 3 309
## 5 205. 1 309
## 6 208. 4 309
## # A tibble: 6 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 199. 0 310
## 2 222. 0 352
## 3 223. 0 309
## 4 225. 0 370
## 5 235. 0 332
## 6 236. 0 349
## Reaction Days Subject
## 1339.693 0.000 2022.000
## # A tibble: 6 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 237. 9 335
## 2 237. 9 309
## 3 248. 9 310
## 4 254. 9 332
## 5 348. 9 351
## 6 352. 9 349
## # A tibble: 6 × 4
## Reaction Days Subject Reaction_binary
## <dbl> <dbl> <dbl> <lgl>
## 1 250. 0 308 TRUE
## 2 259. 1 308 FALSE
## 3 251. 2 308 FALSE
## 4 321. 3 308 FALSE
## 5 357. 4 308 FALSE
## 6 415. 5 308 FALSE
## # A tibble: 6 × 5
## Reaction Days Subject Reaction_binary Reaction_sec
## <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 250. 0 308 TRUE 0.250
## 2 259. 1 308 FALSE 0.259
## 3 251. 2 308 FALSE 0.251
## 4 321. 3 308 FALSE 0.321
## 5 357. 4 308 FALSE 0.357
## 6 415. 5 308 FALSE 0.415
## # A tibble: 6 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## # A tibble: 6 × 3
## Reaction Days Subject
## <dbl> <dbl> <dbl>
## 1 5.52 0 5.73
## 2 5.56 1 5.73
## 3 5.52 2 5.73
## 4 5.77 3 5.73
## 5 5.88 4 5.73
## 6 6.03 5 5.73
## # A tibble: 180 × 3
## Reaction Days ID
## <dbl> <dbl> <dbl>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## 9 431. 8 308
## 10 466. 9 308
## # ℹ 170 more rows
# Summary of Reaction over all rows of data1: mean, minimum, maximum,
# and the number of rows (n()).
summarise(
  data1,
  avg_reaction = mean(Reaction),
  min_reaction = min(Reaction),
  max_reaction = max(Reaction),
  total = n()
)
## # A tibble: 1 × 4
## avg_reaction min_reaction max_reaction total
## <dbl> <dbl> <dbl> <int>
## 1 299. 194. 466. 180
# Same summary statistics as above, computed separately for each Subject.
tt <- summarise(
  group_by(data1, Subject),
  avg_reaction = mean(Reaction),
  min_reaction = min(Reaction),
  max_reaction = max(Reaction),
  total = n()
)
# Show the first rows of the per-subject summary.
head(tt)
## # A tibble: 6 × 5
## Subject avg_reaction min_reaction max_reaction total
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 308 342. 250. 466. 10
## 2 309 215. 203. 237. 10
## 3 310 231. 194. 261. 10
## 4 330 303. 280. 354. 10
## 5 331 309. 285 372. 10
## 6 332 307. 235. 454. 10
## # A tibble: 6 × 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
## # A tibble: 6 × 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
## # A tibble: 6 × 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
## # A tibble: 6 × 2
## min_reaction max_reaction
## <dbl> <dbl>
## 1 250. 466.
## 2 203. 237.
## 3 194. 261.
## 4 280. 354.
## 5 285 372.
## 6 235. 454.
## # A tibble: 180 × 2
## Reaction Days
## <dbl> <dbl>
## 1 250. 0
## 2 259. 1
## 3 251. 2
## 4 321. 3
## 5 357. 4
## 6 415. 5
## 7 382. 6
## 8 290. 7
## 9 431. 8
## 10 466. 9
## # ℹ 170 more rows
## # A tibble: 180 × 2
## Reaction Days
## <dbl> <dbl>
## 1 250. 0
## 2 259. 1
## 3 251. 2
## 4 321. 3
## 5 357. 4
## 6 415. 5
## 7 382. 6
## 8 290. 7
## 9 431. 8
## 10 466. 9
## # ℹ 170 more rows
## # A tibble: 3 × 2
## A B
## <int> <chr>
## 1 4 A
## 2 5 B
## 3 6 C
## # A tibble: 3 × 2
## A B
## <int> <chr>
## 1 4 A
## 2 5 B
## 3 6 C
## # A tibble: 3 × 3
## rowid A B
## <int> <int> <chr>
## 1 1 4 A
## 2 2 5 B
## 3 3 6 C
# Build the superheroes table from literal CSV text.
# The text starts with a newline, so skip = 1 drops that empty first line
# and the header row (name, alignment, gender, publisher) comes next.
superheroes <- read_csv(
  "
name, alignment, gender, publisher
Magneto, bad, male, Marvel
Storm, good, female, Marvel
Mystique, bad, female, Marvel
Batman, good, male, DC
Joker, bad, male, DC
Catwoman, bad, female, DC
Hellboy, good, male, Dark Horse Comics
",
  skip = 1
)
## Rows: 7 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, alignment, gender, publisher
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build the publishers lookup table from literal CSV text.
# The text starts with a newline, so skip = 1 drops that empty first line
# and the header row (publisher, yr_founded) comes next.
publishers <- read_csv(
  "
publisher, yr_founded
DC, 1934
Marvel, 1939
Image, 1992
",
  skip = 1
)
## Rows: 3 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): publisher
## dbl (1): yr_founded
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(publisher)`
## # A tibble: 6 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <dbl>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## Joining with `by = join_by(publisher)`
## # A tibble: 6 × 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## Joining with `by = join_by(publisher)`
## # A tibble: 7 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <dbl>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## Joining with `by = join_by(publisher)`
## # A tibble: 7 × 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Image 1992 <NA> <NA> <NA>
## Joining with `by = join_by(publisher)`
## # A tibble: 7 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <dbl>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 <NA> <NA> <NA> Image 1992
## Joining with `by = join_by(publisher)`
## # A tibble: 7 × 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Dark Horse Comics NA Hellboy good male
## Joining with `by = join_by(publisher)`
## # A tibble: 1 × 4
## name alignment gender publisher
## <chr> <chr> <chr> <chr>
## 1 Hellboy good male Dark Horse Comics
## Joining with `by = join_by(publisher)`
## # A tibble: 1 × 2
## publisher yr_founded
## <chr> <dbl>
## 1 Image 1992
## Joining with `by = join_by(publisher)`
## # A tibble: 8 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <dbl>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## 8 <NA> <NA> <NA> Image 1992
## Joining with `by = join_by(publisher)`
## # A tibble: 8 × 5
## publisher yr_founded name alignment gender
## <chr> <dbl> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Image 1992 <NA> <NA> <NA>
## 8 Dark Horse Comics NA Hellboy good male
## # A tibble: 6 × 11
## Subject `0` `1` `2` `3` `4` `5` `6` `7` `8` `9`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 308 250. 259. 251. 321. 357. 415. 382. 290. 431. 466.
## 2 309 223. 205. 203. 205. 208. 216. 214. 218. 224. 237.
## 3 310 199. 194. 234. 233. 229. 220. 235. 256. 261. 248.
## 4 330 322. 300. 284. 285. 286. 298. 280. 318. 305. 354.
## 5 331 288. 285 302. 320. 316. 293. 290. 335. 294. 372.
## 6 332 235. 243. 273. 310. 317. 310. 454. 347. 330. 254.
## # A tibble: 6 × 3
## Subject ddays rreaction
## <dbl> <chr> <dbl>
## 1 308 0 250.
## 2 309 0 223.
## 3 310 0 199.
## 4 330 0 322.
## 5 331 0 288.
## 6 332 0 235.
## # A tibble: 6 × 2
## Reaction Subject_Days
## <dbl> <chr>
## 1 250. 308_0
## 2 259. 308_1
## 3 251. 308_2
## 4 321. 308_3
## 5 357. 308_4
## 6 415. 308_5
# Split the combined "Subject_Days" column at the underscore into two
# columns, "subjects" and "days".
data1_separate <- separate(
  data1_unite,
  col = Subject_Days,
  into = c("subjects", "days"),
  sep = "_"
)
head(data1_separate)
## # A tibble: 6 × 3
## Reaction subjects days
## <dbl> <chr> <chr>
## 1 250. 308 0
## 2 259. 308 1
## 3 251. 308 2
## 4 321. 308 3
## 5 357. 308 4
## 6 415. 308 5
stringr package contains a set of commonly used string manipulation functions.
stringr cheatsheet: https://github.com/rstudio/cheatsheets/blob/master/strings.pdf
## [1] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [1] FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [1] 1 2 3 4 6 7 8
## [1] 2 6 7 8
## integer(0)
## [1] 1 1 2 1 0 1 1 1
## [1] 0 1 0 0 0 1 1 1
## [1] 0 0 0 0 0 0 0 0
## [1] "e" "l" "r" "e" "l" "r" "u" "h"
## [1] "re" "blu" "gree" "yello" "blac" "orang" "purpl" "whit"
## [1] "re" "blu" "gree" "yello" "blac" "orang" "purpl" "whit"
## [1] "ed" "lue" "reen" "ellow" "lack" "range" "urple" "hite"
## [1] "ed" "lue" "reen" "ellow" "lack" "range" "urple" "hite"
## [1] "red" "blue" "green" "yellow" "orange" "purple" "white"
## [1] "blue" "orange" "purple" "white"
## character(0)
## [1] "e" "e" "e" "e" NA "e" "e" "e"
## [1] "e" "u" "e" "e" "a" "o" "u" "i"
## [[1]]
## [1] "e"
##
## [[2]]
## [1] "u" "e"
##
## [[3]]
## [1] "e" "e"
##
## [[4]]
## [1] "e" "o"
##
## [[5]]
## [1] "a"
##
## [[6]]
## [1] "o" "a" "e"
##
## [[7]]
## [1] "u" "e"
##
## [[8]]
## [1] "i" "e"
# Eight colour names used as the example character vector.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Number of characters in each element.
str_length(string = colorVec)
## [1] 3 4 5 6 5 6 6 5
## [,1]
## [1,] " red"
## [2,] " blue"
## [3,] " green"
## [4,] " yellow"
## [5,] " black"
## [6,] " orange"
## [7,] " purple"
## [8,] " white"
# Reset the example vector.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Replacement form of str_sub(): overwrites the first character of every
# element, so colorVec itself is modified.
str_sub(colorVec, start = 1, end = 1) <- "Z"
colorVec
## [1] "Zed" "Zlue" "Zreen" "Zellow" "Zlack" "Zrange" "Zurple" "Zhite"
# Reset the example vector.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Replace the first "e" in each element with "E"; elements without an "e"
# are returned unchanged.
str_replace(colorVec, pattern = "e", replacement = "E")
## [1] "rEd" "bluE" "grEen" "yEllow" "black" "orangE" "purplE" "whitE"
## [1] "rEd" "bluE" "grEEn" "yEllow" "black" "orangE" "purplE" "whitE"
## [1] "red" "blue" "green" "yellow" "black" "orange" "purple" "white"
## [1] "RED" "BLUE" "GREEN" "YELLOW" "BLACK" "ORANGE" "PURPLE" "WHITE"
## [1] "Red" "Blue" "Green" "Yellow" "Black" "Orange" "Purple" "White"
# Reset the example vector.
colorVec <- c(
  "red", "blue", "green", "yellow",
  "black", "orange", "purple", "white"
)
# Append each element's position (1, 2, ...) to the element itself.
str_c(colorVec, seq_along(colorVec), sep = "")
## [1] "red1" "blue2" "green3" "yellow4" "black5" "orange6" "purple7"
## [8] "white8"
## [1] "red::blue::green::yellow::black::orange::purple::white"