library(tidyverse)
library(viridis)
library(patchwork)
library(gghighlight)Data vizualisation
Generating a sample dataset and performing basic data exploration centered around M&A deal portfolio. Collapsible code sections are included.
Creating a dataset
Load libraries
Create objects
# Create dates
datesx <- seq.Date(from = as.Date("2014-01-01"), to = Sys.Date(), by = 1)
# Sample random dates and sort in chronologically
set.seed(123)
dates <- sample(datesx, size = 400, replace = TRUE)
dates <- sort(dates, decreasing = FALSE)
# Create sectors and regions (categorical)
sectors <- sample(x = c("air", "space", "land", "sea", "ground", "radio"), size = 400, replace = TRUE, prob=c(0.1, 0.2, 0.30, 0.2, 0.1, 0.2))
regions <- sample(x = c("NAM", "LATAM", "EMEA", "APAC", "AUS"), size = 400, replace = TRUE, prob=c(0.1, 0.2, 0.40, 0.2, 0.1))
# Create deal sizes and revenues (numeric)
dealsize <- sample(1:1000, size = 400, replace = TRUE)
revenues <- round(rnorm(400, 20, 5)^2, 0)Create data frame
# create tibbles by combining vectors
evolution <-
tibble(dates, sectors, regions, dealsize, revenues)
#convert character variables to factors
evolution <-
evolution %>%
mutate_if(sapply(evolution, is.character), as.factor) %>%
print()# A tibble: 400 × 5
dates sectors regions dealsize revenues
<date> <fct> <fct> <int> <dbl>
1 2014-01-31 land EMEA 380 573
2 2014-02-10 sea EMEA 705 216
3 2014-02-18 land EMEA 303 800
4 2014-03-20 sea LATAM 142 547
5 2014-03-24 sea EMEA 71 214
6 2014-04-05 sea EMEA 703 496
7 2014-04-30 land EMEA 674 358
8 2014-05-14 land APAC 458 465
9 2014-05-21 radio EMEA 373 382
10 2014-05-31 space EMEA 719 643
# ℹ 390 more rows
#filter year
evolution <-
evolution %>%
filter(dates > "2015-01-01",
dates < "2023-01-01")Set theme
theme_set(theme_minimal())Deal count
Code
plot.bar.vertical.count <-
evolution %>%
ggplot(mapping = aes(x = fct_rev(fct_infreq(sectors)))) +
geom_bar() +
geom_text(aes(label = after_stat(count)),
stat = "count",
vjust = 2.5,
colour = "white") + #add labels
labs(x = "Sectors",
title = "Number of deals by sector")
plot.bar.vertical.count
Code
plot.percent.stacked <-
evolution %>%
ggplot(mapping = aes(x=sectors, fill = regions)) +
geom_bar(position = "fill")
plot.percent.stackedCode
plot.count.dodge <-
evolution %>%
ggplot(mapping = aes(x=fct_infreq(sectors), fill = fct_infreq(regions))) +
geom_bar(position = "dodge", color = "white")
plot.count.dodgeHighlight variable
Overlay two plots on top of each other to highlight a variable.
Code
plot.bar.horizontal <-
evolution %>%
count(sectors) %>%
mutate(prop = (n/sum(n))) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, prop), y = prop)) +
geom_col(fill = "red3") +
coord_flip() +
gghighlight(sectors == "radio") +
geom_text(aes(label = paste0(round(prop*100,2), "%")),
vjust = .5,
hjust = 1.2,
size = 4,
color = "white") + #add labels
scale_y_continuous(labels = scales::percent) + # axis label in %
labs(x = "Sectors",
title = "Proportion of deals by sector",
subtitle = "By number of deals") #relabel x axis
plot.bar.horizontalCode
#create base data
evolutioncount <-
evolution %>%
group_by(year = year(dates)) %>%
count(sectors) %>%
mutate(year = year, n = n)
#create highlight data
evolutioncount.filt <-
evolution %>%
filter(sectors == "radio") %>% # with filter
group_by(year = year(dates)) %>%
count(sectors) %>%
mutate(year = year, n = n)
#plot base and highlight data on top of each other
plot.bar.time.highlight <-
ggplot(data = evolutioncount, aes(x = year, y = n)) +
geom_col(fill = "red3") +
geom_text(evolutioncount.filt, mapping = aes(label = n),
vjust = -0.8,
color = "red3") +
gghighlight(sectors == "radio")
plot.bar.time.highlightChange rate
Using the lag() function which helps calculating the YoY change.
Code
# Prepare data using lag()
data.change.roll.avg <-
evolutioncount.filt %>%
ungroup() %>% #remember to ungroup()
mutate(lag0 = n,
lag1 = lag(lag0),
yoy.change = (lag0 - lag1),
yoy.changeperc = ((lag0/lag1-1)*100)
) %>%
mutate(yoy.changeperc = round(yoy.changeperc, 1))
#Plot 1
plot.change1 <-
data.change.roll.avg %>%
select(year, yoy.change, yoy.changeperc) %>%
filter(!is.na(yoy.change)) %>%
ggplot(aes(x = year, y = yoy.change)) +
geom_col(aes(fill = yoy.change > 0)) + # conditional formatting with > 0
geom_text(aes(label = yoy.change, y = yoy.change -0.5 * sign(yoy.change)), color = "white", fontface = "bold") + # data labels, use sign() to fit diverging labels, see : https://tinyurl.com/z5n7uk5k or https://tinyurl.com/3uakpz7y
labs(x = NULL) + # remove x axis title
scale_x_continuous(labels = NULL) + # remove x axis tick labels
scale_fill_manual(values = c("#c9191e", "#27a658" )) +
theme(legend.position = "none") # remove legend of col chart
#Plot 2
plot.change2 <-
data.change.roll.avg %>%
select(year, yoy.change, yoy.changeperc) %>%
filter(!is.na(yoy.changeperc)) %>%
ggplot(aes(x = year, y = yoy.changeperc)) +
geom_hline(aes(yintercept = 0), color = "gray", linetype = "dashed") + #highlight x axis
geom_line(color = "blue4") +
labs(x = NULL)
# Combining Plot 1 & 2 using patchwork
(plot.change1 / plot.change2) + plot_layout(ncol = 1, heights = c(2.5, 1))Code
# Resources :
# https://tinyurl.com/23yh29huLine plots : cumulated sums
Counting the number of deals, and grouping line geoms by color.
Code
plot.line <-
evolution %>%
group_by(sectors) %>%
arrange(dates) %>%
mutate(count = row_number()) %>%
ggplot(mapping = aes(x = dates, y = count, color = sectors)) +
geom_line(linewidth = 0.8)
plot.lineWithout gghighlight
Now doing two things on this line plot :
Highlighting one line without using the
gghighlightpackage.Graphing cumulated dealsize, instead of count previously. We use the function
cumsum()
Code
#create background line geoms (will be in grey)
data.evolutionback <-
evolution %>%
group_by(sectors) %>%
arrange(dates) %>%
mutate(evol = cumsum(dealsize))
#create the highlighted line (will be in red)
data.evolutionfilt <- data.evolutionback %>%
filter(sectors == "radio")
#plot
plot.line.highlight <- ggplot(data = data.evolutionback, mapping = aes(x = dates, y = evol, group = sectors)) +
geom_line(linewidth = 0.8,
color = "gray85") + #main lines in gray
geom_line(data = data.evolutionfilt,
linewidth = 0.8,
color = "red") + #highlight line in red
#add a dot at max value of line (max value of evol)
geom_point(data = data.evolutionfilt %>% filter(data.evolutionfilt$evol == max(evol)),
color = "red",
size = 2) +
#add text label to end of line
geom_text(data = data.evolutionfilt %>% filter(dates == last(dates)),
mapping = aes(label = sectors),
color = "red",
hjust = -0.2,
vjust = 0.1) +
# edit plot axis labels
labs(title = "Cumulated value over time",
subtitle = "Radio segment",
x = "Time",
y = "Cumulated deal value")
plot.line.highlightTo allow labels to bleed past the canvas boundaries we could use coord_cartesian(clip = 'off')
Adding a trend line with geom_smooth
Code
plot.line.highlight <-
plot.line.highlight +
# geom smooth trend line
geom_smooth(group = 1, color = "black",
linetype = "dashed",
linewidth = 0.4, se = FALSE,
method = "loess",
formula = y ~ x)
plot.line.highlightCode
# Prepare data to get proportion per sector and per year
data.line.percent <-
evolution %>%
mutate(year = year(dates)) %>% #get year name
group_by(year, sectors) %>%
summarise(n = length(dates)) %>% #get length of vectors containing dates (a.k.a "number of dates"; for each sectors in each year)
group_by(year) %>%
mutate(sumperyear = sum(n),
prop = (n/sumperyear*100)) #sum the total number of dates per year and get prop of "each sectors for each year" compared to total number of all sectors in a year
# Filter data for highlight
data.line.percent.filt <- data.line.percent %>% filter(sectors == "radio")
plot.line.percent.highlight <-
ggplot(data = data.line.percent, mapping = aes(x = year, y = prop, group = sectors)) +
geom_line(color = "gray80") +
geom_line(data = data.line.percent.filt,
color = "red3",
linewidth = 1) +
geom_point(data = data.line.percent.filt %>% filter(year == max(data.line.percent.filt$year)),
color = "red3") +
geom_text(data = data.line.percent.filt %>% filter(year == max(data.line.percent.filt$year)),
mapping = aes(label = paste0(round(prop), "", "%"),
hjust = 0.5,
vjust = -1,
color = "red3",
fontface = "bold")) +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0,100)) +
labs(title = "Proportion per year",
subtitle = "Radio sector",
x = "Year",
y = "Share (%)")
plot.line.percent.highlightWith gghighlight
Code
plot.line +
gghighlight(sectors == "space") With facets + gghighlight
Code
# No need for gghighlight here
plot.line +
facet_wrap(~sectors)Code
# Highlighting segments with gghighlight
plot.line +
gghighlight(label_params = list(size = 3)) + #adjust size of labels
facet_wrap(~sectors) +
theme(
strip.text.x = element_blank() #remove titles from facets
)Code
# With multiple gghighlights
plot.line +
gghighlight(sectors %in% c("space", "radio"),
label_params = list(nudge_x = 2))- For fun we could try library(geomtextpath)
- For better reliability we could add labels without using additional packages:
https://stackoverflow.com/questions/29357612/plot-labels-at-ends-of-lines (answered Jul 2, 2018 at 15:57)
Describing deal value
Summary stats
Code
data.evolution.summary <-
evolution %>%
group_by(sectors) %>%
summarize(meandeal = mean(dealsize),
mediandeal = median(dealsize),
max = max(dealsize),
min = min(dealsize),
IQR = IQR(dealsize),
NBdeals = n()) %>%
arrange(mediandeal)
data.evolution.summary# A tibble: 6 × 7
sectors meandeal mediandeal max min IQR NBdeals
<fct> <dbl> <dbl> <int> <int> <dbl> <int>
1 ground 416. 296 999 12 460. 23
2 sea 452. 413 984 4 445 61
3 land 459. 440. 989 28 508. 72
4 space 507. 492 999 9 462. 54
5 air 482. 506 862 98 347 17
6 radio 515. 536. 981 14 521. 52
Code
# Note : it is also possible to use a library such as Janitor for this.Sum
Which is the most important sector in terms of deal size ?
Code
plot.bar.vertical.sum <-
evolution %>%
group_by(sectors) %>%
summarise(sumdeal = sum(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, sumdeal), y = sumdeal)) +
geom_col() +
geom_text(aes(label = sumdeal),
vjust = 2,
size = 4,
color = "white") +
labs(x = NULL)
plot.bar.vertical.sum +
labs(title = "Sum of deals' value per sector",
subtitle = "In USD",
x = NULL)Code
plot.bar.vertical.sum Frequency
How are the deals distributed in terms of deal size ?
Code
## Distribution of deal size
plot.hist.dealsize <-
evolution %>%
ggplot(aes(x = dealsize)) +
geom_histogram(binwidth = 20)
plot.hist.dealsizeDensity
Code
## Density of deal size per sector
plot.density <-
evolution %>%
ggplot(aes(x = dealsize, fill = sectors)) +
geom_density(alpha = 0.3) +
geom_rug(alpha = 0.6) + #lower part of graph
facet_wrap(~sectors)
plot.densityBoxplot
Code
plot.boxplot <-
evolution %>%
group_by(sectors) %>%
mutate(mediandeal = median(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, mediandeal), y = dealsize)) +
geom_boxplot() +
geom_jitter(width = 0.1, alpha = 0.4) +
labs(title = "Deal size distribution per sector",
subtitle = "In USD",
x = "Sector") +
coord_flip()
plot.boxplotCode
plot.boxplot.color <-
evolution %>%
group_by(sectors) %>%
mutate(mediandeal = median(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, mediandeal), y = dealsize)) +
geom_boxplot(fill = "blue3", alpha = 0.1, color = "blue3") +
geom_jitter(aes(color = sectors), width = 0.1, alpha = 0.4) +
labs(title = "Deal size distribution per sector",
subtitle = "In USD",
x = "Sector") +
guides(color = guide_legend(override.aes = list(size = 5))) + #increase legend items size
coord_flip()
plot.boxplot.colorCode
ggsave("plot.boxplot.color.png")Saving 7 x 5 in image
Binning
Let’s create 3 buckets of deal sizes and count the number of deals in each of them. The key function here is cut() were we define breaks within a variable. The breaks are defined in a vector like breaks = c(0,100,500,999).
Code
# binning deal sizes
# add column named "bin" containing 3 types of deal sizes
bin <-
evolution %>%
mutate(bin = cut(evolution$dealsize,
breaks = c(0,100,500,999)))
#relabeled bin
bin <-
evolution %>%
mutate(bin = cut(evolution$dealsize,
breaks = c(0,100,500,999),
labels = c("small < 100", "100 < medium < 500 ", "big > 500")))
# plot bins
plot.bar.binned <-
bin %>%
ggplot(mapping = aes(x = bin)) +
geom_bar() +
geom_text(aes(label = after_stat(count)),
stat = "count",
vjust = 2.5,
colour = "white")
plot.bar.binnedCode
# plot proportions for each bin in %
bin %>%
group_by(bin) %>%
summarise(sumperbin = sum(dealsize)) %>%
mutate(tot = sum(sumperbin), prop = sumperbin/tot*100) %>% #proportion
ggplot(mapping = aes(x = bin, y = prop)) +
geom_col() +
geom_text(aes(label = round(prop)), vjust = 2.5, colour = "white")Code
plot.binned.stacked <-
bin %>%
group_by(sectors, bin) %>%
summarise(sumdeal = sum(dealsize)) %>%
group_by(sectors) %>%
mutate(totsecteur = sum(sumdeal)) %>%
arrange(rev(sumdeal)) %>%
mutate(pos = cumsum(sumdeal)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, totsecteur), y = sumdeal, fill = bin)) +
geom_bar(stat="identity")
plot.binned.stackedCode
data.binsizestack <-
bin %>%
group_by(sectors, bin) %>%
#add column summarizing total sum of deals per sectors for each bin
summarise(sumperbin = sum(dealsize)) %>%
#get proportion
mutate(tot = sum(sumperbin), prop = sumperbin/tot*100)
#plot
plot.binned.stacked.dodge <-
data.binsizestack %>%
ggplot(aes(x = fct_reorder(sectors, tot), y = prop, fill = fct_rev(bin))) + #use fct_rev on (bin) to reverse stacked bar order (optional...)
geom_col(position = "dodge") +
labs(x = "Sector", color = "Deal category") + #change legend name with color = ""
#need to use scale_fill_discrete, instead of scale_color_discrete to reorder legend cat
scale_fill_discrete(breaks=c("big > 500", "medium > 200" , "small < 200")) #change legend label order
plot.binned.stacked.dodgeBubble charts
Code
plot.bubble <-
evolution %>%
ggplot(aes(x = dates, y = dealsize, size = (revenues/100))) +
geom_point(color= "gray", alpha = 0.5)
plot.bubbleCode
plot.bubble.highlight <-
plot.bubble +
geom_point(data = evolution %>% filter(evolution$sectors == "radio"),
alpha = 0.5,
color = "magenta")
plot.bubble.highlightCode
#Highlight with conditionality on deal size
plot.bubble.highlight2 <-
plot.bubble +
geom_point(data = evolution %>% filter(evolution$dealsize > 900),
alpha = 0.5,
color = "red")
plot.bubble.highlight2Heatmaps
Prepare data
Code
#get year, month, week
data.heatmap.weekcount <-
evolution %>%
mutate(year = year(dates),
month = month(dates, label = TRUE, abbr= TRUE),
wk = week(dates))
data.heatmap.weekcount# A tibble: 279 × 8
dates sectors regions dealsize revenues year month wk
<date> <fct> <fct> <int> <dbl> <dbl> <ord> <dbl>
1 2015-01-09 land LATAM 870 368 2015 Jan 2
2 2015-01-13 sea APAC 424 368 2015 Jan 2
3 2015-01-14 space EMEA 458 729 2015 Jan 2
4 2015-01-21 radio EMEA 828 600 2015 Jan 3
5 2015-02-11 land AUS 930 138 2015 Feb 6
6 2015-02-17 radio EMEA 346 447 2015 Feb 7
7 2015-03-21 radio NAM 530 799 2015 Mar 12
8 2015-03-24 land EMEA 453 733 2015 Mar 12
9 2015-04-03 sea AUS 956 488 2015 Apr 14
10 2015-04-18 radio AUS 35 557 2015 Apr 16
# ℹ 269 more rows
Week
Code
#plot week intensity for each year with heat map with geom_tile()
plot.heatmap.weekcount <-
data.heatmap.weekcount %>%
group_by(year) %>%
count(wk) %>%
ggplot(aes(x = wk, y = year, fill = n)) +
geom_tile(color="white", linewidth = 0.1) + #use color and size to draw borders
coord_equal() + #draw nice squares instead of rectangles
scale_fill_viridis(name="# Deals", option = "plasma") +
theme(
panel.background = element_rect(fill = NA, color = NA), # Background of plotting area
)
plot.heatmap.weekcountMonth
Code
#plot month intensity for each year with heat map with geom_tile()
plot.heatmap.monthcount <-
data.heatmap.weekcount %>%
group_by(year) %>%
count(month) %>% #change count to month
ggplot(aes(x = month, y = year, fill = n)) + #change x to month
geom_tile(color="white", linewidth = 0.1) + #use color and size to draw borders
coord_equal() + #draw nice squares instead of rectangles
theme(
panel.background = element_rect(fill = NA, color = NA) # Background of plotting area
) +
geom_text(aes(label=n), color="grey", size=3) # adding labels
plot.heatmap.monthcountOverview
Code
# syntax of patchwork library to combine plots
patchwork <-
((plot.bar.horizontal + plot.line.highlight + plot.bar.time.highlight) /
(plot.line.percent.highlight + plot.bar.vertical.count + (plot.change1 / plot.change2))
)
patchwork