library(tidyverse)
library(viridis)
library(patchwork)
library(gghighlight)
Data vizualisation
This page is a collection of plots describing a fake portfolio of M&A deals. I use this page as a personal reminder. First I generate a dataset with a few attributes : sectors, regions, deal size. Then I do basic exploratory data analysis. You can unwrap code chunks if needed.
Creating a dataset
Load libraries
Create objects
# Create dates
<- seq.Date(from = as.Date("2014-01-01"), to = Sys.Date(), by = 1)
datesx # Sample random dates and sort in chronologically
set.seed(123)
<- sample(datesx, size = 400, replace = TRUE)
dates <- sort(dates, decreasing = FALSE)
dates
# Create sectors and regions (categorical)
<- sample(x = c("air", "space", "land", "sea", "ground", "radio"), size = 400, replace = TRUE, prob=c(0.1, 0.2, 0.30, 0.2, 0.1, 0.2))
sectors <- sample(x = c("NAM", "LATAM", "EMEA", "APAC", "AUS"), size = 400, replace = TRUE, prob=c(0.1, 0.2, 0.40, 0.2, 0.1))
regions
# Create deal sizes and revenues (numeric)
<- sample(1:1000, size = 400, replace = TRUE)
dealsize <- round(rnorm(400, 20, 5)^2, 0) revenues
Create data frame
# create tibbles by combining vectors
<-
evolution tibble(dates, sectors, regions, dealsize, revenues)
#convert character variables to factors
<-
evolution %>%
evolution mutate_if(sapply(evolution, is.character), as.factor) %>%
print()
# A tibble: 400 × 5
dates sectors regions dealsize revenues
<date> <fct> <fct> <int> <dbl>
1 2014-01-13 sea LATAM 778 185
2 2014-02-10 land APAC 86 527
3 2014-03-17 sea APAC 42 343
4 2014-03-24 land EMEA 100 411
5 2014-04-27 land EMEA 937 676
6 2014-05-07 radio NAM 25 573
7 2014-05-17 sea APAC 606 123
8 2014-05-21 space APAC 180 476
9 2014-05-31 land EMEA 293 351
10 2014-06-02 ground LATAM 896 346
# ℹ 390 more rows
#filter year
<-
evolution %>%
evolution filter(dates > "2015-01-01",
< "2023-01-01") dates
Set theme
theme_set(theme_minimal())
Deal count
Code
<-
plot.bar.vertical.count %>%
evolution ggplot(mapping = aes(x = fct_rev(fct_infreq(sectors)))) +
geom_bar() +
geom_text(aes(label = after_stat(count)),
stat = "count",
vjust = 2.5,
colour = "white") + #add labels
labs(x = "Sectors",
title = "Number of deals by sector")
plot.bar.vertical.count
Code
<-
plot.percent.stacked %>%
evolution ggplot(mapping = aes(x=sectors, fill = regions)) +
geom_bar(position = "fill")
plot.percent.stacked
Code
<-
plot.count.dodge %>%
evolution ggplot(mapping = aes(x=fct_infreq(sectors), fill = fct_infreq(regions))) +
geom_bar(position = "dodge", color = "white")
plot.count.dodge
Highlight variable
Overlay two plots on top of each other to highlight a variable.
Code
<-
plot.bar.horizontal %>%
evolution count(sectors) %>%
mutate(prop = (n/sum(n))) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, prop), y = prop)) +
geom_col(fill = "red3") +
coord_flip() +
gghighlight(sectors == "radio") +
geom_text(aes(label = paste0(round(prop*100,2), "%")),
vjust = .5,
hjust = 1.2,
size = 4,
color = "white") + #add labels
scale_y_continuous(labels = scales::percent) + # axis label in %
labs(x = "Sectors",
title = "Proportion of deals by sector",
subtitle = "By number of deals") #relabel x axis
plot.bar.horizontal
Code
#create base data
<-
evolutioncount %>%
evolution group_by(year = year(dates)) %>%
count(sectors) %>%
mutate(year = year, n = n)
#create highlight data
<-
evolutioncount.filt %>%
evolution filter(sectors == "radio") %>% # with filter
group_by(year = year(dates)) %>%
count(sectors) %>%
mutate(year = year, n = n)
#plot base and highlight data on top of each other
<-
plot.bar.time.highlight ggplot(data = evolutioncount, aes(x = year, y = n)) +
geom_col(fill = "red3") +
geom_text(evolutioncount.filt, mapping = aes(label = n),
vjust = -0.8,
color = "red3") +
gghighlight(sectors == "radio")
plot.bar.time.highlight
Change rate
Using the lag()
function which helps calculating the YoY change.
Code
# Prepare data using lag()
<-
data.change.roll.avg %>%
evolutioncount.filt ungroup() %>% #remember to ungroup()
mutate(lag0 = n,
lag1 = lag(lag0),
yoy.change = (lag0 - lag1),
yoy.changeperc = ((lag0/lag1-1)*100)
%>%
) mutate(yoy.changeperc = round(yoy.changeperc, 1))
#Plot 1
<-
plot.change1 %>%
data.change.roll.avg select(year, yoy.change, yoy.changeperc) %>%
filter(!is.na(yoy.change)) %>%
ggplot(aes(x = year, y = yoy.change)) +
geom_col(aes(fill = yoy.change > 0)) + # conditional formatting with > 0
geom_text(aes(label = yoy.change, y = yoy.change -0.5 * sign(yoy.change)), color = "white", fontface = "bold") + # data labels, use sign() to fit diverging labels, see : https://tinyurl.com/z5n7uk5k or https://tinyurl.com/3uakpz7y
labs(x = NULL) + # remove x axis title
scale_x_continuous(labels = NULL) + # remove x axis tick labels
scale_fill_manual(values = c("#c9191e", "#27a658" )) +
theme(legend.position = "none") # remove legend of col chart
#Plot 2
<-
plot.change2 %>%
data.change.roll.avg select(year, yoy.change, yoy.changeperc) %>%
filter(!is.na(yoy.changeperc)) %>%
ggplot(aes(x = year, y = yoy.changeperc)) +
geom_hline(aes(yintercept = 0), color = "gray", linetype = "dashed") + #highlight x axis
geom_line(color = "blue4") +
labs(x = NULL)
# Combining Plot 1 & 2 using patchwork
/ plot.change2) + plot_layout(ncol = 1, heights = c(2.5, 1)) (plot.change1
Code
# Resources :
# https://tinyurl.com/23yh29hu
Line plots : cumulated sums
Counting the number of deals, and grouping line geoms by color.
Code
<-
plot.line %>%
evolution group_by(sectors) %>%
arrange(dates) %>%
mutate(count = row_number()) %>%
ggplot(mapping = aes(x = dates, y = count, color = sectors)) +
geom_line(linewidth = 0.8)
plot.line
Without gghighlight
Now doing two things on this line plot :
Highlighting one line without using the
gghighlight
package.Graphing cumulated dealsize, instead of count previously. We use the function
cumsum()
Code
#create background line geoms (will be in grey)
<-
data.evolutionback %>%
evolution group_by(sectors) %>%
arrange(dates) %>%
mutate(evol = cumsum(dealsize))
#create the highlighted line (will be in red)
<- data.evolutionback %>%
data.evolutionfilt filter(sectors == "radio")
#plot
<- ggplot(data = data.evolutionback, mapping = aes(x = dates, y = evol, group = sectors)) +
plot.line.highlight geom_line(linewidth = 0.8,
color = "gray85") + #main lines in gray
geom_line(data = data.evolutionfilt,
linewidth = 0.8,
color = "red") + #highlight line in red
#add a dot at max value of line (max value of evol)
geom_point(data = data.evolutionfilt %>% filter(data.evolutionfilt$evol == max(evol)),
color = "red",
size = 2) +
#add text label to end of line
geom_text(data = data.evolutionfilt %>% filter(dates == last(dates)),
mapping = aes(label = sectors),
color = "red",
hjust = -0.2,
vjust = 0.1) +
# edit plot axis labels
labs(title = "Cumulated value over time",
subtitle = "Radio segment",
x = "Time",
y = "Cumulated deal value")
plot.line.highlight
To allow labels to bleed past the canvas boundaries we could use coord_cartesian(clip = 'off')
Adding a trend line with geom_smooth
Code
<-
plot.line.highlight +
plot.line.highlight # geom smooth trend line
geom_smooth(group = 1, color = "black",
linetype = "dashed",
linewidth = 0.4, se = FALSE,
method = "loess",
formula = y ~ x)
plot.line.highlight
Code
# Prepare data to get proportion per sector and per year
<-
data.line.percent %>%
evolution mutate(year = year(dates)) %>% #get year name
group_by(year, sectors) %>%
summarise(n = length(dates)) %>% #get length of vectors containing dates (a.k.a "number of dates"; for each sectors in each year)
group_by(year) %>%
mutate(sumperyear = sum(n),
prop = (n/sumperyear*100)) #sum the total number of dates per year and get prop of "each sectors for each year" compared to total number of all sectors in a year
# Filter data for highlight
<- data.line.percent %>% filter(sectors == "radio")
data.line.percent.filt
<-
plot.line.percent.highlight ggplot(data = data.line.percent, mapping = aes(x = year, y = prop, group = sectors)) +
geom_line(color = "gray80") +
geom_line(data = data.line.percent.filt,
color = "red3",
linewidth = 1) +
geom_point(data = data.line.percent.filt %>% filter(year == max(data.line.percent.filt$year)),
color = "red3") +
geom_text(data = data.line.percent.filt %>% filter(year == max(data.line.percent.filt$year)),
mapping = aes(label = paste0(round(prop), "", "%"),
hjust = 0.5,
vjust = -1,
color = "red3",
fontface = "bold")) +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0,100)) +
labs(title = "Proportion per year",
subtitle = "Radio sector",
x = "Year",
y = "Share (%)")
plot.line.percent.highlight
With gghighlight
Code
+
plot.line gghighlight(sectors == "space")
With facets + gghighlight
Code
# No need for gghighlight here
+
plot.line facet_wrap(~sectors)
Code
# Highlighting segments with gghighlight
+
plot.line gghighlight(label_params = list(size = 3)) + #adjust size of labels
facet_wrap(~sectors) +
theme(
strip.text.x = element_blank() #remove titles from facets
)
Code
# With multiple gghighlights
+
plot.line gghighlight(sectors %in% c("space", "radio"),
label_params = list(nudge_x = 2))
- For fun we could try library(geomtextpath)
- For better reliability we could add labels without using additional packages:
https://stackoverflow.com/questions/29357612/plot-labels-at-ends-of-lines (answered Jul 2, 2018 at 15:57)
Describing deal value
Summary stats
Code
<-
data.evolution.summary %>%
evolution group_by(sectors) %>%
summarize(meandeal = mean(dealsize),
mediandeal = median(dealsize),
max = max(dealsize),
min = min(dealsize),
IQR = IQR(dealsize),
NBdeals = n()) %>%
arrange(mediandeal)
data.evolution.summary
# A tibble: 6 × 7
sectors meandeal mediandeal max min IQR NBdeals
<fct> <dbl> <dbl> <int> <int> <dbl> <int>
1 air 414. 319 917 51 474. 26
2 sea 473. 443 993 4 494. 58
3 space 493. 498 964 1 574 65
4 land 526. 502. 993 6 406. 84
5 radio 563 627 990 18 452. 59
6 ground 543. 642. 956 26 543 34
Code
# Note : it is also possible to use a library such as Janitor for this.
Sum
Which is the most important sector in terms of deal size ?
Code
<-
plot.bar.vertical.sum %>%
evolution group_by(sectors) %>%
summarise(sumdeal = sum(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, sumdeal), y = sumdeal)) +
geom_col() +
geom_text(aes(label = sumdeal),
vjust = 2,
size = 4,
color = "white") +
labs(x = NULL)
+
plot.bar.vertical.sum labs(title = "Sum of deals' value per sector",
subtitle = "In USD",
x = NULL)
Code
plot.bar.vertical.sum
Frequency
How are the deals distributed in terms of deal size ?
Code
## Distribution of deal size
<-
plot.hist.dealsize %>%
evolution ggplot(aes(x = dealsize)) +
geom_histogram(binwidth = 20)
plot.hist.dealsize
Density
Code
## Density of deal size per sector
<-
plot.density %>%
evolution ggplot(aes(x = dealsize, fill = sectors)) +
geom_density(alpha = 0.3) +
geom_rug(alpha = 0.6) + #lower part of graph
facet_wrap(~sectors)
plot.density
Boxplot
Code
<-
plot.boxplot %>%
evolution group_by(sectors) %>%
mutate(mediandeal = median(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, mediandeal), y = dealsize)) +
geom_boxplot() +
geom_jitter(width = 0.1, alpha = 0.4) +
labs(title = "Deal size distribution per sector",
subtitle = "In USD",
x = "Sector") +
coord_flip()
plot.boxplot
Code
<-
plot.boxplot.color %>%
evolution group_by(sectors) %>%
mutate(mediandeal = median(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, mediandeal), y = dealsize)) +
geom_boxplot(fill = "blue3", alpha = 0.1, color = "blue3") +
geom_jitter(aes(color = sectors), width = 0.1, alpha = 0.4) +
labs(title = "Deal size distribution per sector",
subtitle = "In USD",
x = "Sector") +
guides(color = guide_legend(override.aes = list(size = 5))) + #increase legend items size
coord_flip()
plot.boxplot.color
Code
ggsave("plot.boxplot.color.png")
Saving 7 x 5 in image
Binning
Let’s create 3 buckets of deal sizes and count the number of deals in each of them. The key function here is cut()
were we define breaks within a variable. The breaks are defined in a vector like breaks = c(0,100,500,999)
.
Code
# binning deal sizes
# add column named "bin" containing 3 types of deal sizes
<-
bin %>%
evolution mutate(bin = cut(evolution$dealsize,
breaks = c(0,100,500,999)))
#relabeled bin
<-
bin %>%
evolution mutate(bin = cut(evolution$dealsize,
breaks = c(0,100,500,999),
labels = c("small < 100", "100 < medium < 500 ", "big > 500")))
# plot bins
<-
plot.bar.binned %>%
bin ggplot(mapping = aes(x = bin)) +
geom_bar() +
geom_text(aes(label = after_stat(count)),
stat = "count",
vjust = 2.5,
colour = "white")
plot.bar.binned
Code
# plot proportions for each bin in %
%>%
bin group_by(bin) %>%
summarise(sumperbin = sum(dealsize)) %>%
mutate(tot = sum(sumperbin), prop = sumperbin/tot*100) %>% #proportion
ggplot(mapping = aes(x = bin, y = prop)) +
geom_col() +
geom_text(aes(label = round(prop)), vjust = 2.5, colour = "white")
Code
<-
plot.binned.stacked %>%
bin group_by(sectors, bin) %>%
summarise(sumdeal = sum(dealsize)) %>%
group_by(sectors) %>%
mutate(totsecteur = sum(sumdeal)) %>%
arrange(rev(sumdeal)) %>%
mutate(pos = cumsum(sumdeal)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, totsecteur), y = sumdeal, fill = bin)) +
geom_bar(stat="identity")
plot.binned.stacked
Code
<-
data.binsizestack %>%
bin group_by(sectors, bin) %>%
#add column summarizing total sum of deals per sectors for each bin
summarise(sumperbin = sum(dealsize)) %>%
#get proportion
mutate(tot = sum(sumperbin), prop = sumperbin/tot*100)
#plot
<-
plot.binned.stacked.dodge %>%
data.binsizestack ggplot(aes(x = fct_reorder(sectors, tot), y = prop, fill = fct_rev(bin))) + #use fct_rev on (bin) to reverse stacked bar order (optional...)
geom_col(position = "dodge") +
labs(x = "Sector", color = "Deal category") + #change legend name with color = ""
#need to use scale_fill_discrete, instead of scale_color_discrete to reorder legend cat
scale_fill_discrete(breaks=c("big > 500", "medium > 200" , "small < 200")) #change legend label order
plot.binned.stacked.dodge
Bubble charts
Code
<-
plot.bubble %>%
evolution ggplot(aes(x = dates, y = dealsize, size = (revenues/100))) +
geom_point(color= "gray", alpha = 0.5)
plot.bubble
Code
<-
plot.bubble.highlight +
plot.bubble geom_point(data = evolution %>% filter(evolution$sectors == "radio"),
alpha = 0.5,
color = "magenta")
plot.bubble.highlight
Code
#Highlight with conditionality on deal size
<-
plot.bubble.highlight2 +
plot.bubble geom_point(data = evolution %>% filter(evolution$dealsize > 900),
alpha = 0.5,
color = "red")
plot.bubble.highlight2
Heatmaps
Prepare data
Code
#get year, month, week
<-
data.heatmap.weekcount %>%
evolution mutate(year = year(dates),
month = month(dates, label = TRUE, abbr= TRUE),
wk = week(dates))
data.heatmap.weekcount
# A tibble: 326 × 8
dates sectors regions dealsize revenues year month wk
<date> <fct> <fct> <int> <dbl> <dbl> <ord> <dbl>
1 2015-01-08 space EMEA 681 661 2015 Jan 2
2 2015-01-09 radio LATAM 425 785 2015 Jan 2
3 2015-01-21 space LATAM 39 406 2015 Jan 3
4 2015-02-17 land EMEA 242 192 2015 Feb 7
5 2015-04-03 ground EMEA 876 340 2015 Apr 14
6 2015-04-18 radio APAC 504 309 2015 Apr 16
7 2015-04-22 space LATAM 215 569 2015 Apr 16
8 2015-04-25 space AUS 902 484 2015 Apr 17
9 2015-05-31 radio APAC 928 248 2015 May 22
10 2015-06-10 sea EMEA 103 374 2015 Jun 23
# ℹ 316 more rows
Week
Code
#plot week intensity for each year with heat map with geom_tile()
<-
plot.heatmap.weekcount %>%
data.heatmap.weekcount group_by(year) %>%
count(wk) %>%
ggplot(aes(x = wk, y = year, fill = n)) +
geom_tile(color="white", linewidth = 0.1) + #use color and size to draw borders
coord_equal() + #draw nice squares instead of rectangles
scale_fill_viridis(name="# Deals", option = "plasma") +
theme(
panel.background = element_rect(fill = NA, color = NA), # Background of plotting area
) plot.heatmap.weekcount
Month
Code
#plot month intensity for each year with heat map with geom_tile()
<-
plot.heatmap.monthcount %>%
data.heatmap.weekcount group_by(year) %>%
count(month) %>% #change count to month
ggplot(aes(x = month, y = year, fill = n)) + #change x to month
geom_tile(color="white", linewidth = 0.1) + #use color and size to draw borders
coord_equal() + #draw nice squares instead of rectangles
theme(
panel.background = element_rect(fill = NA, color = NA) # Background of plotting area
+
) geom_text(aes(label=n), color="grey", size=3) # adding labels
plot.heatmap.monthcount
Overview
Code
# syntax of patchwork library to combine plots
<-
patchwork + plot.line.highlight + plot.bar.time.highlight) /
((plot.bar.horizontal + plot.bar.vertical.count + (plot.change1 / plot.change2))
(plot.line.percent.highlight
) patchwork