Data vizualisation

memo
R
Personal reminder for data visualization
Published

May 31, 2024

About

This page is a collection of plots describing a fake portfolio of M&A deals. I use this page as a personal reminder. First I generate a dataset with a few attributes : sectors, regions, deal size. Then I do basic exploratory data analysis. You can unwrap code chunks if needed.

Creating a dataset

Load libraries

library(tidyverse)
library(viridis)
library(patchwork)
library(gghighlight)

Create objects

# Create dates
datesx <- seq.Date(from = as.Date("2014-01-01"), to = Sys.Date(), by = 1)
# Sample random dates and sort in chronologically
set.seed(123)
dates <- sample(datesx, size = 400, replace = TRUE)
dates <- sort(dates, decreasing = FALSE)

# Create sectors and regions (categorical)
sectors <- sample(x = c("air", "space", "land", "sea", "ground", "radio"), size = 400, replace = TRUE, prob=c(0.1, 0.2, 0.30, 0.2, 0.1, 0.2))
regions <- sample(x = c("NAM", "LATAM", "EMEA", "APAC", "AUS"), size = 400, replace = TRUE, prob=c(0.1, 0.2, 0.40, 0.2, 0.1))

# Create deal sizes and revenues (numeric)
dealsize <- sample(1:1000, size = 400, replace = TRUE)
revenues <- round(rnorm(400, 20, 5)^2, 0)

Create data frame

# create tibbles by combining vectors
evolution <-
  tibble(dates, sectors, regions, dealsize, revenues)
  
#convert character variables to factors
evolution <-
  evolution %>%
  mutate_if(sapply(evolution, is.character), as.factor) %>% 
  print()
# A tibble: 400 × 5
   dates      sectors regions dealsize revenues
   <date>     <fct>   <fct>      <int>    <dbl>
 1 2014-01-13 sea     LATAM        778      185
 2 2014-02-10 land    APAC          86      527
 3 2014-03-17 sea     APAC          42      343
 4 2014-03-24 land    EMEA         100      411
 5 2014-04-27 land    EMEA         937      676
 6 2014-05-07 radio   NAM           25      573
 7 2014-05-17 sea     APAC         606      123
 8 2014-05-21 space   APAC         180      476
 9 2014-05-31 land    EMEA         293      351
10 2014-06-02 ground  LATAM        896      346
# ℹ 390 more rows
#filter year
evolution <-
  evolution %>%
  filter(dates > "2015-01-01", 
         dates < "2023-01-01")

Set theme

theme_set(theme_minimal())

Deal count

Code
plot.bar.vertical.count <-
evolution %>%
  ggplot(mapping = aes(x = fct_rev(fct_infreq(sectors)))) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), 
            stat = "count", 
            vjust = 2.5, 
            colour = "white") + #add labels
  labs(x = "Sectors", 
       title = "Number of deals by sector")
plot.bar.vertical.count

Code
plot.percent.stacked <-
evolution %>%
ggplot(mapping = aes(x=sectors, fill = regions)) +
  geom_bar(position = "fill") 
plot.percent.stacked

Code
plot.count.dodge <-
evolution %>%
ggplot(mapping = aes(x=fct_infreq(sectors), fill = fct_infreq(regions))) +
  geom_bar(position = "dodge", color = "white") 
plot.count.dodge

Highlight variable

Overlay two plots on top of each other to highlight a variable.

Code
plot.bar.horizontal <-
evolution %>%
  count(sectors) %>%
  mutate(prop = (n/sum(n))) %>% 
  ggplot(mapping = aes(x = fct_reorder(sectors, prop), y = prop)) +
  geom_col(fill = "red3") +
  coord_flip() +
  gghighlight(sectors == "radio") +
  geom_text(aes(label = paste0(round(prop*100,2), "%")), 
            vjust = .5,
            hjust = 1.2,
            size = 4,
            color = "white") + #add labels
  scale_y_continuous(labels = scales::percent) + # axis label in %
  labs(x = "Sectors",
       title = "Proportion of deals by sector", 
       subtitle = "By number of deals") #relabel x axis
plot.bar.horizontal

Code
#create base data
evolutioncount <- 
evolution %>%
  group_by(year = year(dates)) %>%
  count(sectors) %>%
  mutate(year = year, n = n) 

#create highlight data 
evolutioncount.filt <-
  evolution %>%
  filter(sectors == "radio") %>% # with filter
  group_by(year = year(dates)) %>% 
  count(sectors) %>%
  mutate(year = year, n = n)

#plot base and highlight data on top of each other
plot.bar.time.highlight <-
  ggplot(data = evolutioncount, aes(x = year, y = n)) +
  geom_col(fill = "red3") + 
  geom_text(evolutioncount.filt, mapping = aes(label = n), 
            vjust = -0.8, 
            color = "red3") + 
  gghighlight(sectors == "radio")
plot.bar.time.highlight

Change rate

Using the lag() function which helps calculating the YoY change.

Code
# Prepare data using lag()
data.change.roll.avg <-
evolutioncount.filt %>% 
  ungroup() %>% #remember to ungroup() 
  mutate(lag0 = n,
         lag1 = lag(lag0),
         yoy.change = (lag0 - lag1),
         yoy.changeperc = ((lag0/lag1-1)*100)
                       ) %>%
  mutate(yoy.changeperc = round(yoy.changeperc, 1))

#Plot 1
plot.change1 <-
data.change.roll.avg %>%
  select(year, yoy.change, yoy.changeperc) %>%
  filter(!is.na(yoy.change)) %>%
  ggplot(aes(x = year, y = yoy.change)) +
  geom_col(aes(fill = yoy.change > 0)) + # conditional formatting with > 0
  geom_text(aes(label = yoy.change, y = yoy.change -0.5 * sign(yoy.change)), color = "white", fontface = "bold") + # data labels, use sign() to fit diverging labels, see : https://tinyurl.com/z5n7uk5k or https://tinyurl.com/3uakpz7y
  labs(x = NULL) + # remove x axis title
  scale_x_continuous(labels = NULL) + # remove x axis tick labels
  scale_fill_manual(values = c("#c9191e", "#27a658" )) +
  theme(legend.position = "none")  #  remove legend of col chart

#Plot 2
plot.change2 <-
data.change.roll.avg %>%
  select(year, yoy.change, yoy.changeperc) %>%
  filter(!is.na(yoy.changeperc)) %>%
  ggplot(aes(x = year, y = yoy.changeperc)) +
  geom_hline(aes(yintercept = 0), color = "gray", linetype = "dashed") + #highlight x axis
  geom_line(color = "blue4") +
  labs(x = NULL)

# Combining Plot 1 & 2 using patchwork
(plot.change1 / plot.change2) + plot_layout(ncol = 1, heights = c(2.5, 1))

Code
# Resources :
# https://tinyurl.com/23yh29hu

Line plots : cumulated sums

Counting the number of deals, and grouping line geoms by color.

Code
plot.line <- 
evolution %>%
    group_by(sectors) %>%
    arrange(dates) %>%
    mutate(count = row_number()) %>%
    ggplot(mapping = aes(x = dates, y = count, color = sectors)) + 
    geom_line(linewidth = 0.8)
plot.line

Without gghighlight

Now doing two things on this line plot :

  • Highlighting one line without using the gghighlight package.

  • Graphing cumulated dealsize, instead of count previously. We use the function cumsum()

Code
#create background line geoms (will be in grey)
data.evolutionback <- 
  evolution %>%
    group_by(sectors) %>%
    arrange(dates) %>%
    mutate(evol = cumsum(dealsize))

#create the highlighted line (will be in red)
data.evolutionfilt <- data.evolutionback %>% 
  filter(sectors == "radio")

#plot
plot.line.highlight <- ggplot(data = data.evolutionback, mapping = aes(x = dates, y = evol, group = sectors)) + 
    geom_line(linewidth = 0.8, 
              color = "gray85") + #main lines in gray
    geom_line(data = data.evolutionfilt, 
              linewidth = 0.8, 
              color = "red") + #highlight line in red
  #add a dot at max value of line (max value of evol)
    geom_point(data = data.evolutionfilt %>% filter(data.evolutionfilt$evol == max(evol)), 
               color = "red", 
               size = 2) +
  #add text label to end of line
    geom_text(data = data.evolutionfilt %>% filter(dates == last(dates)), 
              mapping = aes(label = sectors), 
              color = "red", 
              hjust = -0.2, 
              vjust = 0.1) + 
  # edit plot axis labels 
    labs(title = "Cumulated value over time",
        subtitle = "Radio segment",
         x = "Time", 
         y = "Cumulated deal value")
plot.line.highlight

To allow labels to bleed past the canvas boundaries we could use coord_cartesian(clip = 'off')

Adding a trend line with geom_smooth

Code
plot.line.highlight <-
  plot.line.highlight +
# geom smooth trend line
geom_smooth(group = 1, color = "black", 
            linetype = "dashed", 
            linewidth = 0.4, se = FALSE, 
            method = "loess", 
            formula = y ~ x)
plot.line.highlight

Code
# Prepare data to get proportion per sector and per year
data.line.percent <-
evolution %>%
     mutate(year = year(dates)) %>% #get year name
     group_by(year, sectors) %>% 
     summarise(n = length(dates)) %>% #get length of vectors containing dates (a.k.a "number of dates"; for each sectors in each year)
     group_by(year) %>%
     mutate(sumperyear = sum(n), 
            prop = (n/sumperyear*100)) #sum the total number of dates per year and get prop of "each sectors for each year" compared to total number of all sectors in a year
    
# Filter data for highlight
data.line.percent.filt <- data.line.percent %>% filter(sectors == "radio")

plot.line.percent.highlight <- 
  ggplot(data = data.line.percent, mapping = aes(x = year, y = prop, group = sectors)) +
  geom_line(color = "gray80") + 
  geom_line(data = data.line.percent.filt, 
            color = "red3", 
            linewidth = 1) +
  geom_point(data = data.line.percent.filt %>% filter(year == max(data.line.percent.filt$year)), 
             color = "red3") +
  geom_text(data = data.line.percent.filt %>% filter(year == max(data.line.percent.filt$year)), 
            mapping = aes(label = paste0(round(prop), "", "%"), 
                          hjust = 0.5,
                          vjust = -1,
                          color = "red3", 
                          fontface = "bold")) + 
  theme(legend.position = "none") + 
  scale_y_continuous(limits = c(0,100)) +
  labs(title = "Proportion per year",
      subtitle = "Radio sector",
      x = "Year",
      y = "Share (%)")
plot.line.percent.highlight

With gghighlight

Code
plot.line +
gghighlight(sectors == "space") 

With facets + gghighlight

Code
# No need for gghighlight here
plot.line + 
  facet_wrap(~sectors)

Code
# Highlighting segments with gghighlight
plot.line +
  gghighlight(label_params = list(size = 3)) + #adjust size of labels
  facet_wrap(~sectors) + 
theme(
  strip.text.x = element_blank() #remove titles from facets
)

Code
# With multiple gghighlights
plot.line +
gghighlight(sectors %in% c("space", "radio"), 
            label_params = list(nudge_x = 2))

Some thoughts

Describing deal value

Summary stats

Code
data.evolution.summary <-
  evolution %>%
  group_by(sectors) %>%
  summarize(meandeal = mean(dealsize), 
            mediandeal = median(dealsize), 
            max = max(dealsize), 
            min = min(dealsize), 
            IQR = IQR(dealsize), 
            NBdeals = n()) %>%
  arrange(mediandeal)
data.evolution.summary
# A tibble: 6 × 7
  sectors meandeal mediandeal   max   min   IQR NBdeals
  <fct>      <dbl>      <dbl> <int> <int> <dbl>   <int>
1 air         414.       319    917    51  474.      26
2 sea         473.       443    993     4  494.      58
3 space       493.       498    964     1  574       65
4 land        526.       502.   993     6  406.      84
5 radio       563        627    990    18  452.      59
6 ground      543.       642.   956    26  543       34
Code
# Note : it is also possible to use a library such as Janitor for this.

Sum

Which is the most important sector in terms of deal size ?

Code
plot.bar.vertical.sum <- 
evolution %>%
  group_by(sectors) %>%
  summarise(sumdeal = sum(dealsize)) %>%
  ggplot(mapping = aes(x = fct_reorder(sectors, sumdeal), y = sumdeal)) +
  geom_col() +
  geom_text(aes(label = sumdeal), 
                vjust = 2, 
                size = 4,
                color = "white") + 
  labs(x = NULL)
  
plot.bar.vertical.sum +
  labs(title = "Sum of deals' value per sector",
       subtitle = "In USD",
       x = NULL)

Code
plot.bar.vertical.sum 

Frequency

How are the deals distributed in terms of deal size ?

Code
## Distribution of deal size
plot.hist.dealsize <-
evolution %>%
  ggplot(aes(x = dealsize)) +
  geom_histogram(binwidth = 20) 
plot.hist.dealsize

Density

Code
## Density of deal size per sector
plot.density <-
evolution %>%
  ggplot(aes(x = dealsize, fill = sectors)) +
  geom_density(alpha = 0.3) +
  geom_rug(alpha = 0.6) + #lower part of graph
  facet_wrap(~sectors)
plot.density

Boxplot

Code
plot.boxplot <-
evolution %>%
  group_by(sectors) %>%
  mutate(mediandeal = median(dealsize)) %>%
  ggplot(mapping = aes(x = fct_reorder(sectors, mediandeal), y = dealsize)) +
  geom_boxplot() +
  geom_jitter(width = 0.1, alpha = 0.4) +
  labs(title = "Deal size distribution per sector",
       subtitle = "In USD",
       x = "Sector") +
  coord_flip()
plot.boxplot

Code
plot.boxplot.color <-
evolution %>%
  group_by(sectors) %>%
  mutate(mediandeal = median(dealsize)) %>%
  ggplot(mapping = aes(x = fct_reorder(sectors, mediandeal), y = dealsize)) +
  geom_boxplot(fill = "blue3", alpha = 0.1, color = "blue3") +
  geom_jitter(aes(color = sectors), width = 0.1, alpha = 0.4) +
  labs(title = "Deal size distribution per sector",
       subtitle = "In USD",
       x = "Sector") +
  guides(color = guide_legend(override.aes = list(size = 5))) + #increase legend items size
  coord_flip()
plot.boxplot.color

Code
ggsave("plot.boxplot.color.png")
Saving 7 x 5 in image

Binning

Let’s create 3 buckets of deal sizes and count the number of deals in each of them. The key function here is cut() were we define breaks within a variable. The breaks are defined in a vector like breaks = c(0,100,500,999).

Code
# binning deal sizes
# add column named "bin" containing 3 types of deal sizes
bin <- 
evolution %>% 
    mutate(bin = cut(evolution$dealsize, 
                     breaks = c(0,100,500,999)))

#relabeled bin
bin <-
evolution %>% 
  mutate(bin = cut(evolution$dealsize, 
                   breaks = c(0,100,500,999), 
                   labels = c("small < 100", "100 < medium < 500  ", "big > 500"))) 

# plot bins
  
plot.bar.binned <-
bin %>%
  ggplot(mapping = aes(x = bin)) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), 
            stat = "count", 
            vjust = 2.5, 
            colour = "white") 
plot.bar.binned

Code
# plot proportions for each bin in %
bin %>%
  group_by(bin) %>%
  summarise(sumperbin  = sum(dealsize)) %>%
  mutate(tot = sum(sumperbin), prop = sumperbin/tot*100) %>% #proportion
  ggplot(mapping = aes(x = bin, y = prop)) +
  geom_col() +
  geom_text(aes(label = round(prop)), vjust = 2.5, colour = "white")

Code
plot.binned.stacked <-
bin %>% 
  group_by(sectors, bin) %>%
  summarise(sumdeal = sum(dealsize)) %>%
  group_by(sectors) %>%
  mutate(totsecteur = sum(sumdeal)) %>%
  arrange(rev(sumdeal)) %>%
  mutate(pos = cumsum(sumdeal)) %>%
  ggplot(mapping = aes(x = fct_reorder(sectors, totsecteur), y = sumdeal, fill = bin)) + 
  geom_bar(stat="identity")
plot.binned.stacked

Code
data.binsizestack <-
bin %>%
  group_by(sectors, bin) %>%
  #add column summarizing total sum of deals per sectors for each bin
  summarise(sumperbin = sum(dealsize)) %>%
  #get proportion
  mutate(tot = sum(sumperbin), prop = sumperbin/tot*100)
 
#plot
plot.binned.stacked.dodge <-  
  data.binsizestack %>% 
  ggplot(aes(x = fct_reorder(sectors, tot), y = prop, fill = fct_rev(bin))) + #use fct_rev on (bin) to reverse stacked bar order (optional...)
  geom_col(position = "dodge") +
  labs(x = "Sector", color = "Deal category") + #change legend name with color = ""
  #need to use scale_fill_discrete, instead of scale_color_discrete to reorder legend cat
  scale_fill_discrete(breaks=c("big > 500", "medium > 200" , "small < 200")) #change legend label order 
plot.binned.stacked.dodge

Bubble charts

Code
plot.bubble <-
evolution %>%
  ggplot(aes(x = dates, y = dealsize, size = (revenues/100))) +
  geom_point(color= "gray", alpha = 0.5)
plot.bubble

Code
plot.bubble.highlight <-
plot.bubble +
  geom_point(data = evolution %>% filter(evolution$sectors == "radio"), 
             alpha = 0.5, 
             color = "magenta")
plot.bubble.highlight

Code
#Highlight with conditionality on deal size
plot.bubble.highlight2 <-
plot.bubble +
  geom_point(data = evolution %>% filter(evolution$dealsize > 900), 
             alpha = 0.5, 
             color = "red")
plot.bubble.highlight2

Heatmaps

Prepare data

Code
#get year, month, week
data.heatmap.weekcount <-
  evolution %>%
    mutate(year = year(dates), 
           month = month(dates, label = TRUE, abbr= TRUE), 
           wk = week(dates))
data.heatmap.weekcount
# A tibble: 326 × 8
   dates      sectors regions dealsize revenues  year month    wk
   <date>     <fct>   <fct>      <int>    <dbl> <dbl> <ord> <dbl>
 1 2015-01-08 space   EMEA         681      661  2015 Jan       2
 2 2015-01-09 radio   LATAM        425      785  2015 Jan       2
 3 2015-01-21 space   LATAM         39      406  2015 Jan       3
 4 2015-02-17 land    EMEA         242      192  2015 Feb       7
 5 2015-04-03 ground  EMEA         876      340  2015 Apr      14
 6 2015-04-18 radio   APAC         504      309  2015 Apr      16
 7 2015-04-22 space   LATAM        215      569  2015 Apr      16
 8 2015-04-25 space   AUS          902      484  2015 Apr      17
 9 2015-05-31 radio   APAC         928      248  2015 May      22
10 2015-06-10 sea     EMEA         103      374  2015 Jun      23
# ℹ 316 more rows

Week

Code
  #plot week intensity for each year with heat map with geom_tile()
plot.heatmap.weekcount <-
  data.heatmap.weekcount %>%
  group_by(year) %>%
  count(wk) %>%
  ggplot(aes(x = wk, y = year, fill = n)) +
  geom_tile(color="white", linewidth = 0.1) + #use color and size to draw borders
  coord_equal() + #draw nice squares instead of rectangles 
  scale_fill_viridis(name="# Deals", option = "plasma") +
  theme(
  panel.background = element_rect(fill = NA, color = NA), # Background of plotting area 
  )
plot.heatmap.weekcount

Month

Code
  #plot month intensity for each year with heat map with geom_tile()
plot.heatmap.monthcount <-  
data.heatmap.weekcount %>%
  group_by(year) %>%
  count(month) %>% #change count to month
  ggplot(aes(x = month, y = year, fill = n)) + #change x to month
  geom_tile(color="white", linewidth = 0.1) + #use color and size to draw borders
  coord_equal() + #draw nice squares instead of rectangles
  theme(
  panel.background = element_rect(fill = NA, color = NA)   # Background of plotting area 
  ) + 
  geom_text(aes(label=n), color="grey", size=3) # adding labels
plot.heatmap.monthcount

Overview

Code
# syntax of patchwork library to combine plots
patchwork <-
((plot.bar.horizontal + plot.line.highlight + plot.bar.time.highlight) /
  (plot.line.percent.highlight + plot.bar.vertical.count + (plot.change1 / plot.change2))
)
patchwork