Programming basics for Biostatistics 6099

R graphics ggplot2

Zhiguang Huo (Caleb)

Thursday September 14, 2023

ggplot2

ggplot2 is based on the grammer of graphics, the idea that you can build every graph from the same few components:

ggplot2 grammers

ggplot() - graphics are added up by different layers

Aesthetics — aes()

load ggplot2 package

library(ggplot2) ## part of tidyverse
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

mpg data data

str(mpg)
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...
head(mpg)
## # A tibble: 6 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…
## 6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa…

ggplot example

ggplot(data = mpg) + aes(x=displ, y=hwy) + geom_point()

ggplot: combine layers

myggplot <- ggplot(data = mpg) + aes(x=displ, y=hwy)
myggplot + geom_point()

aes – color (continuous)

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, color = cyl) +
  geom_point()

aes – color (categorical)

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, color = class) +
  geom_point()

aes – color (absolute color)

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, color = "blue") +
  geom_point() ## Doesn't work, aes only maps a variable (in the data) to a color.

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, color = I("blue")) +
  geom_point() ## use I to indicate absolute color

ggplot(data = mpg) + 
  aes(x=displ, y=hwy ) +
  geom_point(color = "blue") ## or use the color here (outside aes)

aes – color (categorical)

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, color = class) +
  geom_point()

aes – size

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, size = cyl) +
  geom_point()

aes – size (absolute size)

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, size = "3") +
  geom_point() ## Doesn't work, aes only maps a variable (in the data) to a size

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, size = I(3)) +
  geom_point() ## use I to indicate absolute size

ggplot(data = mpg) + 
  aes(x=displ, y=hwy ) +
  geom_point(size = 3) ## or use the size here

aes – alpha (transparency)

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, alpha = cyl) +
  geom_point()

aes – alpha (absolute alpha)

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, alpha = 1) +
  geom_point() ## Doesn't work, aes only maps a variable (in the data) to a alpha

ggplot(data = mpg) + 
  aes(x=displ, y=hwy, alpha = I(0.5)) +
  geom_point() ## use I to indicate absolute alpha

ggplot(data = mpg) + 
  aes(x=displ, y=hwy ) +
  geom_point(alpha = 0.5) ## or use the alpha here

aes – shape

mpg_sub <- subset(mpg, class!="suv")
ggplot(data = mpg_sub) + 
  aes(x=displ, y=hwy, shape = class) +
  geom_point()

mpg %>% 
  filter(class!="suv") %>%
  ggplot() + 
  aes(x=displ, y=hwy, shape = class) +
  geom_point()

aes by variable names

xvariable = "displ"
yvariable = "hwy"

ggplot(data = mpg) + 
  aes_string(x=xvariable, y=yvariable, color = "class") +
  geom_point()
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Geom functions

ls(pattern = '^geom_', env = as.environment('package:ggplot2'))
##  [1] "geom_abline"            "geom_area"              "geom_bar"              
##  [4] "geom_bin_2d"            "geom_bin2d"             "geom_blank"            
##  [7] "geom_boxplot"           "geom_col"               "geom_contour"          
## [10] "geom_contour_filled"    "geom_count"             "geom_crossbar"         
## [13] "geom_curve"             "geom_density"           "geom_density_2d"       
## [16] "geom_density_2d_filled" "geom_density2d"         "geom_density2d_filled" 
## [19] "geom_dotplot"           "geom_errorbar"          "geom_errorbarh"        
## [22] "geom_freqpoly"          "geom_function"          "geom_hex"              
## [25] "geom_histogram"         "geom_hline"             "geom_jitter"           
## [28] "geom_label"             "geom_line"              "geom_linerange"        
## [31] "geom_map"               "geom_path"              "geom_point"            
## [34] "geom_pointrange"        "geom_polygon"           "geom_qq"               
## [37] "geom_qq_line"           "geom_quantile"          "geom_raster"           
## [40] "geom_rect"              "geom_ribbon"            "geom_rug"              
## [43] "geom_segment"           "geom_sf"                "geom_sf_label"         
## [46] "geom_sf_text"           "geom_smooth"            "geom_spoke"            
## [49] "geom_step"              "geom_text"              "geom_tile"             
## [52] "geom_violin"            "geom_vline"

ggplot: geom_line by group

ggplot(data = mpg) + 
  aes(displ, hwy, colour=class) + 
  geom_point() + 
  geom_line()

ggplot(data = mpg) + 
  aes(displ, hwy) + 
  geom_point() + 
  geom_line(aes(colour=class))

ggplot: aes()

ggplot(data = mpg) + 
  aes(displ, hwy, colour=class) + ## this is global color
  geom_point() + 
  geom_line()
ggplot(data = mpg) + 
  aes(displ, hwy) + 
  geom_point() + 
  geom_line(aes(colour=class)) ## this is local color

Line segments

ggplot(data = mpg) + 
  aes(displ, hwy, colour = class) + 
  geom_point() + 
  geom_abline(aes(intercept = 0, slope = 5), color = "green") + 
  geom_hline(aes(yintercept = 30), color = "blue") + 
  geom_vline(aes(xintercept = 5), color = "red") 

smooth by group 1

ggplot(data = mpg) + 
  aes(displ, hwy) + 
  geom_point(aes(colour=class)) + 
  geom_smooth() 
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

smooth by group 2

ggplot(data = mpg) + 
  aes(displ, hwy) + 
  geom_point(aes(colour=class)) + 
  geom_smooth(method="lm") 
## `geom_smooth()` using formula = 'y ~ x'

smooth by group 3

ggplot(data = mpg) + 
  aes(displ, hwy) + 
  geom_point(aes(colour=class)) + 
  geom_smooth(aes(group=class), method="lm") 
## `geom_smooth()` using formula = 'y ~ x'

smooth by group 4

ggplot(data = mpg) + 
  aes(displ, hwy, colour = class) + ## global aes will be applied to all higher level aes
  geom_point() + 
  geom_smooth(method="lm") 
## `geom_smooth()` using formula = 'y ~ x'

smooth by group 5

ggplot(data = mpg) + 
  aes(displ, hwy, colour = class) + ## lower level aes will be applied to all higher level aes
  geom_point() + 
  geom_smooth(method="lm", se = F, size = 2) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

ggplot() boxplot

mpgbox <- ggplot(data = mpg) + 
  aes(class, hwy) + 
  geom_boxplot(aes(fill=class))
mpgbox

ggplot() jitter

ggplot(data = mpg) + 
  aes(class, hwy, color=class) + 
  geom_jitter()

ggplot() boxplot + jitter

ggplot(data = mpg) + 
  aes(class, hwy, color=class) + 
  geom_boxplot() + 
  geom_jitter()

ggplot(data = mpg) + 
  aes(class, hwy, color=class) + 
  geom_jitter() + 
  geom_boxplot() 

ggplot() violin plot

ggplot(data = mpg) + 
  aes(class, hwy, fill=class) + 
  geom_violin() 

ggplot() bar plot 1

ggplot(mpg) + 
  aes(class) + 
  geom_bar()

ggplot(mpg) + 
  aes(class, color = class) + 
  geom_bar()

ggplot() bar plot 2

ggplot(mpg) + 
  aes(class, fill=as.factor(cyl)) + 
  geom_bar()

ggplot() bar plot 3

ggplot(mpg) + 
  aes(class, fill=as.factor(cyl)) + 
  geom_bar(position="dodge")  #side by side

ggplot() bar plot: how to specify error bar 1

mpgSummary <- mpg %>%
  group_by(class) %>%
  summarize(meanDispl = mean(displ), sdDispl = sd(displ)) ## sd is standard deviation, standard error se = sd/sqrt(n)

ggplot(data = mpgSummary) + 
  aes(x=class, y=meanDispl, fill=class) + 
  geom_bar(stat="identity",
           colour="black", # Use black outlines,
           size=.3)       # Thinner lines

ggplot() bar plot: how to specify error bar 2

ggplot(data = mpgSummary) + 
  aes(x=class, y=meanDispl, fill=class) + 
  geom_bar(stat="identity",
           colour="black", # Use black outlines,
           size=.3) +      # Thinner lines
  geom_errorbar(aes(ymin=meanDispl-sdDispl, ymax=meanDispl+sdDispl),
                size=.3,    # Thinner lines
                width=.2
                )

ggplot() histogram simple example

ggplot(data = mpg) + 
  aes(x = hwy) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() histogram fill by color

ggplot(data = mpg) + 
  aes(x = hwy) + 
  geom_histogram(aes(fill = class))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() histogram facets by group (1)

ggplot(data = mpg) + 
  aes(x = hwy) + 
  geom_histogram(aes(fill = class)) + 
  facet_wrap(~ class)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

facet

ggplot() histogram facets by group (2)

ggplot(data = mpg) + 
  aes(x = hwy) + 
  geom_histogram(aes(fill = class)) + 
  facet_grid(. ~ class) ## or facet_grid(cols = vars(class))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() histogram facets by group (3)

ggplot(data = mpg) + 
  aes(x = hwy) + 
  geom_histogram(aes(fill = class)) + 
  facet_grid(class ~ .) ## or facet_grid(rows = vars(class))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() histogram facets by group (4)

ggplot(data = mpg) + 
  aes(x = hwy) + 
  geom_histogram(aes(fill = class)) + 
  facet_grid(drv ~ class) ## or facet_grid(rows = vars(drv), cols = vars(class))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

longitudinal data visualization

sleepstudy: Reaction times in a sleep deprivation study

sleepstudy: Reaction times in a sleep deprivation study

library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
data(sleepstudy)
head(sleepstudy, n=5)
##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308
## 4 321.4398    3     308
## 5 356.8519    4     308

spaghetti plot

ggplot(data=sleepstudy) + 
  aes(x = Days, y=Reaction, colour = Subject) +
  geom_path()

individual subject lm smooth

ggplot(data=sleepstudy) + 
  aes(x = Days, y=Reaction, colour = Subject) +
  geom_smooth(method="lm") + 
  facet_wrap(~Subject)
## `geom_smooth()` using formula = 'y ~ x'

mean trajectory (with SE bar)

sleepSummary <- sleepstudy %>% 
  group_by(Days) %>%
  summarize(Mean = mean(Reaction), SD = sd(Reaction), SE = sd(Reaction)/sqrt(n()))

ggplot(data=sleepSummary) + 
  aes(x = Days, y=Mean) +
  geom_path() + 
  geom_errorbar(aes(ymin=Mean-SE, ymax=Mean+SE),
                  size=0.5,    # Thinner lines
                  width=.2) 

Add text annotations to a graph

Text annotations using geom_text()

# Subset 10 rows
set.seed(32611)
ss <- sample(1:32, 10)
df <- mtcars[ss, ]

sp <- ggplot(data = df) +
  aes(wt, mpg, label = rownames(df)) +
  geom_point()
# Add texts
sp + geom_text() ## geom_text need the label aes

Other experiment

sp + geom_text(size=6)
sp +  geom_text(hjust=0, vjust=0)
sp + geom_text(aes(fontface=2))
sp + geom_text(family = "Times New Roman")
sp + geom_text(aes(color=factor(cyl)))
sp + geom_text(aes(size=wt))

Text annotations using geom_label()

sp <- ggplot(data = df) +
  aes(wt, mpg, label = rownames(df)) +
  geom_point()
# Add texts
sp + geom_label()

Add a text annotation at a particular coordinate

# Solution 1
sp + geom_text(x=3, y=20, label="Scatter plot")

ggrepel: Avoid overlapping of text labels

library(ggrepel)

Create a scatter plot and add labels

p <- ggplot(mtcars, aes(wt, mpg)) +
  geom_point(color = 'red') 
p + geom_text(aes(label = rownames(mtcars)),
              size = 3.5)

Use geom_text_repel

set.seed(32611)
p + geom_text_repel(aes(label = rownames(mtcars)),
                    size = 3.5) 

Use label_text_repel

set.seed(32611)
p + geom_label_repel(aes(label = rownames(mtcars)))
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## p + geom_label_repel(aes(label = rownames(mtcars), fill = factor(cyl)), 
##                         color = 'white', size = 3.5
##                    )

Labs

p <- ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy, colour=factor(cyl))) + 
  labs(title = "New plot title", x = "New x label", y = "New y label")

Theme

examples on different themes

p <- ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy, colour=factor(cyl))) + 
  facet_wrap(~class)
p ## default theme_grey

p + theme_bw()

# p + theme_linedraw()
# p + theme_light()
# p + theme_dark()
# p + theme_minimal()
# p + theme_classic()
# p + theme_void()

More about the theme

p + theme(text = element_text(size=20),
        axis.text.x = element_text(angle=90, hjust=1,colour="red")) 

Four elements to control the theme

examples

plot <- ggplot(mpg, aes(displ, hwy)) + geom_point()

plot + theme(
  panel.background = element_blank(),
  axis.text = element_blank()
)

plot + theme(
  axis.text = element_text(colour = "red", size = rel(1.5))
)

plot + theme(
  axis.line = element_line(arrow = arrow())
)

plot + theme(
  panel.background = element_rect(fill = "white"),
  plot.margin = margin(2, 2, 2, 2, "cm"),
  plot.background = element_rect(
    fill = "grey90",
    colour = "black",
    size = 1
  )
)
## all changes are relative to the default value
line
rect
text
title
aspect.ratio
axis.title
axis.title.x
axis.title.y 
axis.text
axis.text.x
axis.text.y
axis.ticks
axis.ticks.x
axis.ticks.y,
axis.ticks.length
axis.line
axis.line.x
axis.line.y
## for more options, see
?theme
theme_gray

No legend

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy, colour=factor(cyl))) + 
  theme(legend.position = "none") 

One of my favourate themes (1)

black.bold.text <- element_text(face = "bold", color = "black", size=20)
ggplot(mpg, aes(displ, hwy, colour=class)) + geom_point() + 
    labs(title="hwy vs displ") + 
    theme_bw() + 
    theme(text = black.bold.text) 

One of my favourate themes (2)

black.bold.text <- element_text(face = "bold", color = "black", size=20)
ggplot(mpg, aes(displ, hwy, colour=class)) + geom_point() + 
    labs(title="hwy vs displ") + 
    theme_bw() + 
    theme(text = black.bold.text, panel.grid =element_blank()) 

Change font

black.bold.text <- element_text(face = "bold", color = "black", size=20)
red.italic.text <- element_text(face = "italic", color = "red", size=15)

ggplot(mpg, aes(displ, hwy, colour=class)) + geom_point() + 
    labs(title="hwy vs displ") + 
    theme_bw() + 
    theme(axis.text = black.bold.text , axis.title = black.bold.text, 
          legend.title = red.italic.text, 
          legend.text = black.bold.text) 

Create your own discrete scale

p <- ggplot(mtcars, aes(mpg, wt)) +
  geom_point(aes(colour = factor(cyl)))
p 

p + scale_colour_manual(values = c("red", "blue", "green"))

cols <- c("8" = "red", "4" = "blue", "6" = "darkgreen", "10" = "orange")
p + scale_colour_manual(values = cols)

ggplot(mtcars) +
  aes(mpg, wt, colour = factor(cyl), fill = factor(cyl)) +
   geom_point() + 
  scale_colour_manual(
    values = cols,
    aesthetics = c("colour", "fill")
  )

Create your own axis ticks

p <- ggplot(mtcars, aes(mpg, wt)) +
  geom_point(aes(colour = factor(cyl))) 
p + scale_x_continuous(breaks = c(15,25),
    labels = c("A", "B"),
    name = "My MPG") 

Stat transformation

empirical CDF

df <- data.frame(x = rnorm(1000))
ggplot(df, aes(x)) + stat_ecdf()

n <- 100
df <- data.frame(x = c(rnorm(n, 0, 3), rnorm(n, 0, 10)),
                 g = gl(2, n))
ggplot(df, aes(x, colour = g)) + stat_ecdf()

stat_function

n <- 100
set.seed(32611)
df <- data.frame(
  x = rnorm(n)
)
base <- ggplot(df, aes(x)) + geom_density()
base + stat_function(fun = dnorm, colour = "red") + xlim(c(-3,3))

stat_ellipse

ggplot(mpg, aes(x = displ, y = hwy)) + geom_point() +   stat_ellipse()

stat_ellipse by group

ggplot(mpg, aes(x = displ, y = hwy, color=displ > 4)) + geom_point() +   stat_ellipse()

Coordinate

Previous example on mpg

p <- ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  geom_smooth()

p
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Setting the limits on the coordinate system performs a visual zoom.

p + coord_cartesian(xlim = c(3, 5), expand = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Setting the limits on a scale converts all values outside the range to NA.

p + xlim(3, 5)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 136 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 136 rows containing missing values (`geom_point()`).

#the same as p + scale_x_continuous(limits = c(325, 500))

resize the plot

p <- ggplot(mpg, aes(displ, hwy)) +  geom_point()
p + coord_fixed(ratio = 0.5)

flip x and y

ggplot(mpg, aes(class, hwy)) +
  geom_boxplot() +
  coord_flip()

ggplot Cheat Sheet