Zhiguang Huo (Caleb)
Wednesday September 19, 2018
The tidyverse is a collection of R packages designed for data science.
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data source: sleepstudy.csv (also available in R lme4 package)
Original way to read in data
asleepfile <- "https://raw.githubusercontent.com/Caleb-Huo/Caleb-Huo.github.io/master/teaching/data/sleep/sleepstudy.csv"
data0 <- read.csv(asleepfile)
## Parsed with column specification:
## cols(
## Reaction = col_double(),
## Days = col_integer(),
## Subject = col_integer()
## )
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## 4 321.4398 3 308
## 5 356.8519 4 308
## 6 414.6901 5 308
## [1] "data.frame"
## # A tibble: 180 x 3
## Reaction Days Subject
## <dbl> <int> <int>
## 1 250. 0 308
## 2 259. 1 308
## 3 251. 2 308
## 4 321. 3 308
## 5 357. 4 308
## 6 415. 5 308
## 7 382. 6 308
## 8 290. 7 308
## 9 431. 8 308
## 10 466. 9 308
## # ... with 170 more rows
## [1] "tbl_df" "tbl" "data.frame"
## Parsed with column specification:
## cols(
## Reaction = col_double(),
## Days = col_integer(),
## Subject = col_integer()
## )
## [1] 249.5600 258.7047 250.8006
## Reaction Days Subject
## [1,] 249.5600 0 308
## [2,] 258.7047 1 308
## [3,] 250.8006 2 308
## Reaction Days Subject
## 1 249.5600 0 308
## 2 258.7047 1 308
## 3 250.8006 2 308
## # A tibble: 180 x 2
## Days Subject
## <int> <int>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # ... with 170 more rows
## # A tibble: 180 x 2
## Days Subject
## <int> <int>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # ... with 170 more rows
## # A tibble: 180 x 2
## Days Subject
## <int> <int>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # ... with 170 more rows
## # A tibble: 10 x 2
## Days Subject
## <int> <int>
## 1 0 308
## 2 1 308
## 3 2 308
## 4 3 308
## 5 4 308
## 6 5 308
## 7 6 308
## 8 7 308
## 9 8 308
## 10 9 308
## # A tibble: 6 x 2
## Days Subject
## <int> <int>
## 1 3 308
## 2 4 308
## 3 5 308
## 4 6 308
## 5 8 308
## 6 9 308
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <int> <int>
## 1 321. 3 308
## 2 357. 4 308
## 3 415. 5 308
## 4 382. 6 308
## 5 431. 8 308
## 6 466. 9 308
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <int> <int>
## 1 194. 1 310
## 2 199. 0 310
## 3 203. 2 309
## 4 205. 3 309
## 5 205. 1 309
## 6 208. 4 309
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <int> <int>
## 1 199. 0 310
## 2 222. 0 352
## 3 223. 0 309
## 4 225. 0 370
## 5 235. 0 332
## 6 236. 0 349
## # A tibble: 6 x 3
## Reaction Days Subject
## <dbl> <int> <int>
## 1 237. 9 335
## 2 237. 9 309
## 3 248. 9 310
## 4 254. 9 332
## 5 348. 9 351
## 6 352. 9 349
## # A tibble: 6 x 4
## Reaction Days Subject Reaction_binary
## <dbl> <int> <int> <lgl>
## 1 250. 0 308 TRUE
## 2 259. 1 308 FALSE
## 3 251. 2 308 FALSE
## 4 321. 3 308 FALSE
## 5 357. 4 308 FALSE
## 6 415. 5 308 FALSE
## # A tibble: 6 x 5
## Reaction Days Subject Reaction_binary Reaction_sec
## <dbl> <int> <int> <lgl> <dbl>
## 1 250. 0 308 TRUE 0.250
## 2 259. 1 308 FALSE 0.259
## 3 251. 2 308 FALSE 0.251
## 4 321. 3 308 FALSE 0.321
## 5 357. 4 308 FALSE 0.357
## 6 415. 5 308 FALSE 0.415
data1 %>%
summarise(avg_reaction = mean(Reaction),
min_reaction = min(Reaction),
max_reaction = max(Reaction),
total = n())
## # A tibble: 1 x 4
## avg_reaction min_reaction max_reaction total
## <dbl> <dbl> <dbl> <int>
## 1 299. 194. 466. 180
tt <- data1 %>%
group_by(Subject) %>%
summarise(avg_reaction = mean(Reaction),
min_reaction = min(Reaction),
max_reaction = max(Reaction),
total = n())
tt %>% head
## # A tibble: 6 x 5
## Subject avg_reaction min_reaction max_reaction total
## <int> <dbl> <dbl> <dbl> <int>
## 1 308 342. 250. 466. 10
## 2 309 215. 203. 237. 10
## 3 310 231. 194. 261. 10
## 4 330 303. 280. 354. 10
## 5 331 309. 285 372. 10
## 6 332 307. 235. 454. 10
## # A tibble: 6 x 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
## # A tibble: 6 x 3
## avg_reaction min_reaction max_reaction
## <dbl> <dbl> <dbl>
## 1 342. 250. 466.
## 2 215. 203. 237.
## 3 231. 194. 261.
## 4 303. 280. 354.
## 5 309. 285 372.
## 6 307. 235. 454.
superheroes <- "
name, alignment, gender, publisher
Magneto, bad, male, Marvel
Storm, good, female, Marvel
Mystique, bad, female, Marvel
Batman, good, male, DC
Joker, bad, male, DC
Catwoman, bad, female, DC
Hellboy, good, male, Dark Horse Comics
"
superheroes <- read_csv(superheroes, skip = 1)
publishers <- "
publisher, yr_founded
DC, 1934
Marvel, 1939
Image, 1992
"
publishers <- read_csv(publishers, skip = 1)
## Joining, by = "publisher"
## # A tibble: 6 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## Joining, by = "publisher"
## # A tibble: 6 x 5
## publisher yr_founded name alignment gender
## <chr> <int> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## Joining, by = "publisher"
## # A tibble: 7 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## Joining, by = "publisher"
## # A tibble: 7 x 5
## publisher yr_founded name alignment gender
## <chr> <int> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Image 1992 <NA> <NA> <NA>
## Joining, by = "publisher"
## # A tibble: 7 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Batman good male DC 1934
## 2 Joker bad male DC 1934
## 3 Catwoman bad female DC 1934
## 4 Magneto bad male Marvel 1939
## 5 Storm good female Marvel 1939
## 6 Mystique bad female Marvel 1939
## 7 <NA> <NA> <NA> Image 1992
## Joining, by = "publisher"
## # A tibble: 7 x 5
## publisher yr_founded name alignment gender
## <chr> <int> <chr> <chr> <chr>
## 1 Marvel 1939 Magneto bad male
## 2 Marvel 1939 Storm good female
## 3 Marvel 1939 Mystique bad female
## 4 DC 1934 Batman good male
## 5 DC 1934 Joker bad male
## 6 DC 1934 Catwoman bad female
## 7 Dark Horse Comics NA Hellboy good male
## Joining, by = "publisher"
## # A tibble: 1 x 4
## name alignment gender publisher
## <chr> <chr> <chr> <chr>
## 1 Hellboy good male Dark Horse Comics
## Joining, by = "publisher"
## # A tibble: 1 x 2
## publisher yr_founded
## <chr> <int>
## 1 Image 1992
## Joining, by = "publisher"
## # A tibble: 8 x 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## 8 <NA> <NA> <NA> Image 1992
## Joining, by = "publisher"
## # A tibble: 8 x 5
## publisher yr_founded name alignment gender
## <chr> <int> <chr> <chr> <chr>
## 1 DC 1934 Batman good male
## 2 DC 1934 Joker bad male
## 3 DC 1934 Catwoman bad female
## 4 Marvel 1939 Magneto bad male
## 5 Marvel 1939 Storm good female
## 6 Marvel 1939 Mystique bad female
## 7 Image 1992 <NA> <NA> <NA>
## 8 Dark Horse Comics NA Hellboy good male
## # A tibble: 6 x 11
## Subject `0` `1` `2` `3` `4` `5` `6` `7` `8` `9`
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 308 250. 259. 251. 321. 357. 415. 382. 290. 431. 466.
## 2 309 223. 205. 203. 205. 208. 216. 214. 218. 224. 237.
## 3 310 199. 194. 234. 233. 229. 220. 235. 256. 261. 248.
## 4 330 322. 300. 284. 285. 286. 298. 280. 318. 305. 354.
## 5 331 288. 285 302. 320. 316. 293. 290. 335. 294. 372.
## 6 332 235. 243. 273. 310. 317. 310. 454. 347. 330. 254.
## # A tibble: 6 x 3
## Subject ddays rreaction
## <int> <chr> <dbl>
## 1 308 0 250.
## 2 309 0 223.
## 3 310 0 199.
## 4 330 0 322.
## 5 331 0 288.
## 6 332 0 235.
## # A tibble: 6 x 2
## Reaction Subject_Days
## <dbl> <chr>
## 1 250. 308_0
## 2 259. 308_1
## 3 251. 308_2
## 4 321. 308_3
## 5 357. 308_4
## 6 415. 308_5
data1_separate<- data1_unite %>% separate(Subject_Days, c("subjects", "days"), sep="_")
head(data1_separate)
## # A tibble: 6 x 3
## Reaction subjects days
## <dbl> <chr> <chr>
## 1 250. 308 0
## 2 259. 308 1
## 3 251. 308 2
## 4 321. 308 3
## 5 357. 308 4
## 6 415. 308 5