Biostatistical Computing, PHC 6068

R graphics ggplot2

Zhiguang Huo (Caleb)

Monday September 17, 2018

Outline

ggplot2

ggplot2 is based on the grammar of graphics, the idea that you can build every graph from the same few components: a data set, a set of geoms (visual marks that represent data points), and a coordinate system.

ggplot2 cheatsheet: https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf

ggplot2 grammar

ggplot() - graphics are built up by adding layers

Compared to qplot(), it’s easier to use multiple datasets in ggplot().

Aesthetics — aes()

ggplot example

library(ggplot2)
# Scatter plot of hwy against displ from the mpg data.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

ggplot: combine layers

# Keep the data and aesthetic mapping in one object, then add a geom to draw it.
myggplot <- ggplot(mpg, aes(x = displ, y = hwy))
myggplot + geom_point()

aes – color (continuous)

# Colour the points by cyl (the continuous case, per the slide heading).
ggplot(mpg, aes(x = displ, y = hwy, colour = cyl)) +
  geom_point()

aes – color (categorical)

# Colour the points by class (the categorical case, per the slide heading).
ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point()

aes – size

# Map cyl to point size.
ggplot(mpg, aes(x = displ, y = hwy, size = cyl)) +
  geom_point()

aes – alpha (transparency)

# Map cyl to point transparency (alpha).
ggplot(mpg, aes(x = displ, y = hwy, alpha = cyl)) +
  geom_point()

aes – shape

# ggplot2's default shape palette handles at most 6 discrete values, so the
# "suv" class is dropped before class is mapped to shape.
mpg_sub <- subset(mpg, class != "suv")
ggplot(mpg_sub, aes(x = displ, y = hwy, shape = class)) +
  geom_point()

Geom functions

# List every object exported by ggplot2 whose name starts with "geom_".
# The environment argument is spelled out in full (`envir`) instead of relying
# on partial argument matching of `env`.
ls(pattern = "^geom_", envir = as.environment("package:ggplot2"))
##  [1] "geom_abline"     "geom_area"       "geom_bar"       
##  [4] "geom_bin2d"      "geom_blank"      "geom_boxplot"   
##  [7] "geom_col"        "geom_contour"    "geom_count"     
## [10] "geom_crossbar"   "geom_curve"      "geom_density"   
## [13] "geom_density_2d" "geom_density2d"  "geom_dotplot"   
## [16] "geom_errorbar"   "geom_errorbarh"  "geom_freqpoly"  
## [19] "geom_hex"        "geom_histogram"  "geom_hline"     
## [22] "geom_jitter"     "geom_label"      "geom_line"      
## [25] "geom_linerange"  "geom_map"        "geom_path"      
## [28] "geom_point"      "geom_pointrange" "geom_polygon"   
## [31] "geom_qq"         "geom_qq_line"    "geom_quantile"  
## [34] "geom_raster"     "geom_rect"       "geom_ribbon"    
## [37] "geom_rug"        "geom_segment"    "geom_sf"        
## [40] "geom_smooth"     "geom_spoke"      "geom_step"      
## [43] "geom_text"       "geom_tile"       "geom_violin"    
## [46] "geom_vline"

ggplot: geom_line by group

# Points sized by cyl, plus one line per class; colour is set at the plot
# level, so both layers use it.
ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point(aes(size = cyl)) +
  geom_line(aes(group = class))

smooth by group 1

# Colour is mapped only in the point layer, so geom_smooth() draws a single
# curve through all the data using its default method.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

smooth by group 2

# Same layout, but the single overall smoother is a linear model fit.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth(method = "lm")

smooth by group 3

# Grouping the smoother by class gives one linear fit per class; the fit lines
# do not pick up the class colours because colour is mapped in the point layer only.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth(aes(group = class), method = "lm")

smooth by group 4

# Aesthetics declared at the plot level are inherited by every layer, so both
# the points and the linear fits are split and coloured by class.
ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point() +
  geom_smooth(method = "lm")

smooth by group 5

# Per-class linear fits with the confidence band switched off and thicker
# lines. Aesthetics declared at the plot level (here including colour) are
# inherited by every layer.
# `se = FALSE` is written out in full: `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
# NOTE(review): recent ggplot2 releases prefer `linewidth` over `size` for line
# thickness; `size` is kept to stay compatible with the version these slides target.
ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, size = 2)

ggplot() boxplot

# Box plot of hwy for each class, with boxes filled by class; the plot is
# stored first and then displayed.
mpgbox <- ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot(aes(fill = class))
mpgbox

ggplot() jitter

# Jittered points of hwy within each class, coloured by class.
ggplot(mpg, aes(x = class, y = hwy, colour = class)) +
  geom_jitter()

ggplot() boxplot + jitter

# Box plot with the jittered raw points drawn on top (layers are drawn in the
# order they are added).
ggplot(mpg, aes(x = class, y = hwy, colour = class)) +
  geom_boxplot() +
  geom_jitter()

ggplot() violin plot

# Violin plot of hwy for each class, filled by class.
ggplot(mpg, aes(x = class, y = hwy, fill = class)) +
  geom_violin()

ggplot() bar plot 1

# Bar chart of the number of rows in each class (only x is mapped).
ggplot(mpg, aes(x = class)) +
  geom_bar()

ggplot() bar plot 2

# Counts per class, with each bar split by cyl (converted to a factor) using
# the default bar position.
ggplot(mpg, aes(x = class, fill = as.factor(cyl))) +
  geom_bar()

ggplot() bar plot 3

# Same counts, but the cyl groups are placed side by side instead of stacked.
ggplot(mpg, aes(x = class, fill = as.factor(cyl))) +
  geom_bar(position = "dodge")

ggplot() bar plot: how to specify error bar

# Per-class summary of displ: one row per class holding the class label, the
# mean and the standard deviation.
mpgSummary <- with(
  mpg,
  data.frame(
    class = tapply(class, class, unique),
    meanDispl = tapply(displ, class, mean),
    sdDispl = tapply(displ, class, sd)
  )
)

# Bars show the mean (stat = "identity" plots the supplied values as they are);
# error bars span mean - sd to mean + sd.
ggplot(mpgSummary, aes(x = class, y = meanDispl, fill = class)) +
  geom_bar(
    stat = "identity",
    position = position_dodge(),
    colour = "black",  # black bar outlines
    size = 0.3         # thin outline
  ) +
  geom_errorbar(
    aes(ymin = meanDispl - sdDispl, ymax = meanDispl + sdDispl),
    position = position_dodge(0.9),
    width = 0.2,
    size = 0.3         # thin error-bar lines
  )

ggplot() histogram simple example

# Histogram of hwy with the default binning.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() histogram fill by color

# Histogram of hwy with each bar split by class through the fill colour.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(mapping = aes(fill = class))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() histogram facets by group (1)

# One histogram panel per class, wrapped into a grid of panels.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(mapping = aes(fill = class)) +
  facet_wrap(~ class)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

facet

ggplot() histogram facets by group (2)

# facet_grid(. ~ class): one column of panels per class.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(mapping = aes(fill = class)) +
  facet_grid(. ~ class)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() histogram facets by group (3)

# facet_grid(class ~ .): one row of panels per class.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(mapping = aes(fill = class)) +
  facet_grid(class ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() histogram facets by group (4)

# facet_grid(drv ~ class): rows by drv, columns by class.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(mapping = aes(fill = class)) +
  facet_grid(drv ~ class)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

longitudinal data visualization

sleepstudy: Reaction times in a sleep deprivation study

sleepstudy: Reaction times in a sleep deprivation study

library(lme4)
## Loading required package: Matrix
# Load the sleepstudy data shipped with lme4 and show its first five rows.
data("sleepstudy", package = "lme4")
head(sleepstudy, 5)
##   Reaction Days Subject
## 1 249.5600    0     308
## 2 258.7047    1     308
## 3 250.8006    2     308
## 4 321.4398    3     308
## 5 356.8519    4     308

spaghetti plot

# One coloured trajectory per Subject; geom_path() joins observations in the
# order they appear in the data.
ggplot(sleepstudy, aes(x = Days, y = Reaction, colour = Subject)) +
  geom_path()

individual subject lm smooth

# Linear fit of Reaction on Days, drawn in a separate panel for each Subject.
ggplot(sleepstudy, aes(x = Days, y = Reaction, colour = Subject)) +
  geom_smooth(method = "lm") +
  facet_wrap(~Subject)

mean trajectory (with SD bar)

# Per-day summary across subjects: the day itself, the mean Reaction and its
# standard deviation.
sleepSummary <- data.frame(
  Days = with(sleepstudy, tapply(Days, Days, unique)),
  Mean = with(sleepstudy, tapply(Reaction, Days, mean)),
  SD = with(sleepstudy, tapply(Reaction, Days, sd))
)

# Mean trajectory with error bars spanning Mean - SD to Mean + SD.
ggplot(sleepSummary, aes(x = Days, y = Mean)) +
  geom_path() +
  geom_errorbar(
    aes(ymin = Mean - SD, ymax = Mean + SD),
    width = 0.2,
    size = 0.5
  )

Add text annotations to a graph

Create some data from mtcars

# Draw a reproducible random subset of 10 rows from mtcars.
# The number of rows to sample from is taken from the data rather than
# hard-coded; for the same seed this selects the same rows as sample(1:32, 10).
set.seed(32611)
ss <- sample(nrow(mtcars), 10)
df <- mtcars[ss, ]

Text annotations using geom_text()

# Scatter plot of mpg against wt, with the row names of df available as labels.
sp <- ggplot(df, aes(x = wt, y = mpg, label = rownames(df))) +
  geom_point()
# Draw each label as plain text at its point.
sp + geom_text()

Other experiments

# Variations on the text layer; each line draws a separate plot.
# Constants (size, justification, fontface, family) are set as layer
# parameters; only data columns are mapped inside aes().
sp + geom_text(size = 6)                    # larger labels
sp + geom_text(hjust = 0, vjust = 0)        # left/bottom-justify labels at the point
sp + geom_text(fontface = 2)                # fontface 2 = bold, set as a constant
sp + geom_text(family = "Times New Roman")  # NOTE(review): font must be available on the device
sp + geom_text(aes(colour = factor(cyl)))   # label colour by cyl
sp + geom_text(aes(size = wt))              # label size by wt

Text annotations using geom_label()

# Same scatter plot of mpg against wt, labelled with the row names of df.
sp <- ggplot(df, aes(x = wt, y = mpg, label = rownames(df))) +
  geom_point()
# Draw the labels with geom_label() instead of geom_text().
sp + geom_label()

Add a text annotation at a particular coordinate

# Solution 1
# annotate() draws the label exactly once. A geom_text() layer with constant
# x/y/label inherits the plot data and redraws the same text once per data
# row, which overplots the label on itself.
sp + annotate("text", x = 3, y = 20, label = "Scatter plot")

ggrepel: Avoid overlapping of text labels

library(ggrepel)

Create a scatter plot and add labels

# Red scatter plot of mpg against wt for all of mtcars.
p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(colour = "red")
# Plain geom_text() labels: these are free to overlap.
p + geom_text(mapping = aes(label = rownames(mtcars)), size = 3.5)

Use geom_text_repel

# Fix the seed before calling geom_text_repel() so the plot is reproducible.
set.seed(32611)
p + geom_text_repel(mapping = aes(label = rownames(mtcars)), size = 3.5)

Use geom_label_repel

# Boxed labels via geom_label_repel(); the seed is fixed first for reproducibility.
set.seed(32611)
p + geom_label_repel(mapping = aes(label = rownames(mtcars)))

# Variant kept for reference (not run): labels filled by cyl with white text.
## p + geom_label_repel(aes(label = rownames(mtcars), fill = factor(cyl)),
##                      color = 'white', size = 3.5)

Labs

# labs() replaces the plot title and both axis labels. The plot is only stored
# in p here; it is not displayed by this snippet.
p <- ggplot(mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = factor(cyl))) +
  labs(
    title = "New plot title",
    x = "New x label",
    y = "New y label"
  )

Theme

examples on different themes

# Faceted scatter plot reused by the theme examples.
p <- ggplot(mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = factor(cyl))) +
  facet_wrap(~class)
# Displayed without any theme call, i.e. with the default theme_grey.
p

# Replace the default theme with theme_bw().
p + theme_bw()

# Other complete themes to try (not run):
# p + theme_linedraw()
# p + theme_light()
# p + theme_dark()
# p + theme_minimal()
# p + theme_classic()
# p + theme_void()

More about the theme

# Set all text to size 20, and make the x-axis tick labels red, rotated by 90
# degrees and right-justified.
p + theme(
  text = element_text(size = 20),
  axis.text.x = element_text(angle = 90, hjust = 1, colour = "red")
)

Four elements to control the theme

# Base scatter plot reused by the four theme() examples below.
plot <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

# element_blank(): remove the panel background and the axis text.
plot + theme(axis.text = element_blank(), panel.background = element_blank())

# element_text(): red axis text at 1.5 times the inherited size.
plot + theme(axis.text = element_text(size = rel(1.5), colour = "red"))

# element_line(): draw the axis lines with an arrow.
plot + theme(axis.line = element_line(arrow = arrow()))

# element_rect() and margin(): white panel, 2 cm plot margins, and a grey plot
# background with a black border.
plot + theme(
  plot.background = element_rect(colour = "black", fill = "grey90", size = 1),
  plot.margin = margin(2, 2, 2, 2, "cm"),
  panel.background = element_rect(fill = "white")
)
## all changes are relative to the default value
line
rect
text
title
aspect.ratio
axis.title
axis.title.x
axis.title.y 
axis.text
axis.text.x
axis.text.y
axis.ticks
axis.ticks.x
axis.ticks.y
axis.ticks.length
axis.line
axis.line.x
axis.line.y
## for more options, see
?theme
theme_gray

No legend

# legend.position = "none" removes the legend for the colour mapping.
ggplot(mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = factor(cyl))) +
  theme(legend.position = "none")

One of my favourite themes (1)

# Reusable text element: bold, black, size 20.
black.bold.text <- element_text(size = 20, colour = "black", face = "bold")
# theme_bw() first, then the text element applied to all text.
ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point() +
  labs(title = "hwy vs displ") +
  theme_bw() +
  theme(text = black.bold.text)

One of my favourite themes (2)

# Reusable text element: bold, black, size 20.
black.bold.text <- element_text(size = 20, colour = "black", face = "bold")
# theme_bw() with the text element applied to all text and the grid lines removed.
ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point() +
  labs(title = "hwy vs displ") +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    text = black.bold.text
  )

Other experiments

# Two reusable text elements with different face, colour and size.
black.bold.text <- element_text(size = 20, colour = "black", face = "bold")
red.italic.text <- element_text(size = 15, colour = "red", face = "italic")

# Apply them selectively: axis text, axis titles and legend entries use the
# bold black element; only the legend title uses the red italic one.
ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point() +
  labs(title = "hwy vs displ") +
  theme_bw() +
  theme(
    axis.text = black.bold.text,
    axis.title = black.bold.text,
    legend.text = black.bold.text,
    legend.title = red.italic.text
  )

Stat transformation

empirical CDF

# Fix the seed so the simulated data, and therefore the plots, are
# reproducible, matching the other randomised examples in this file.
set.seed(32611)

# Empirical CDF of 1000 standard normal draws, drawn as a step function.
df <- data.frame(x = rnorm(1000))
ggplot(df, aes(x)) + stat_ecdf(geom = "step")

# Two groups of n draws with mean 0 and standard deviations 3 and 10;
# one ECDF per group, coloured by group.
n <- 100
df <- data.frame(x = c(rnorm(n, 0, 3), rnorm(n, 0, 10)),
                 g = gl(2, n))
ggplot(df, aes(x, colour = g)) + stat_ecdf()

stat_function

# Simulate n standard normal draws with a fixed seed.
n <- 100
set.seed(32611)
df <- data.frame(x = rnorm(n))
# NOTE(review): this copy of the column appears unused by the plot below,
# which reads x from df; kept so the script still defines x.
x <- df$x

# Kernel density estimate of the sample, with the dnorm curve overlaid in red
# and the x axis limited to [-3, 3].
base <- ggplot(df, aes(x)) +
  geom_density()
base +
  stat_function(fun = dnorm, colour = "red") +
  xlim(c(-3,3))

stat_ellipse

# Scatter plot with a single ellipse computed from all points.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  stat_ellipse()

stat_ellipse by group

# Colouring by the logical condition displ > 4 splits the data into two
# groups, so one ellipse is drawn per group.
ggplot(mpg, aes(x = displ, y = hwy, colour = displ > 4)) +
  geom_point() +
  stat_ellipse()

Coordinate

Our old friend mpg

# Scatter plot with the default smoother, reused by the coordinate examples.
p <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()

# Display it unchanged.
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Setting the limits on the coordinate system performs a visual zoom.

# Restrict the x limits of the coordinate system to 3..5, with expand = FALSE.
p + coord_cartesian(xlim = c(3, 5), expand = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Setting the limits on a scale converts all values outside the range to NA.

# Restrict the x scale to 3..5; the warnings echoed below report the rows
# removed as a result.
p + xlim(3, 5)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 136 rows containing non-finite values (stat_smooth).
## Warning: Removed 136 rows containing missing values (geom_point).

# the same as p + scale_x_continuous(limits = c(3, 5))

resize the plot

# Plain scatter plot, then two fixed aspect ratios passed to coord_fixed().
p <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()
p + coord_fixed(ratio = 0.5)

p + coord_fixed(ratio = 0.1)

flip x and y

# Box plot of hwy by class with the x and y axes swapped.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  coord_flip()