Published: May 22, 2020 by Petra

To create the figures in this post, we use the data from the BC CDC’s summary reports and daily updates, both of which can be found here.

library(tidyverse)
library(lubridate)
library(zoo)
library(egg)

daily_reports <- read_csv("BC_reports.csv", trim_ws = TRUE, na=c("","NA"))

head(daily_reports)
# A tibble: 6 x 12
  date       new_cases recov  died   VCH    FH   VIH   INH    NH  hosp    IC
  <date>         <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-01-28         1     0     0     1     0     0     0     0     0     0
2 2020-02-04         1     0     0     2     0     0     0     0     0     0
3 2020-02-06         2     1     0     4     0     0     0     0     0     0
4 2020-02-14         1     0     0     4     0     0     1     0     0     0
5 2020-02-20         1     0     0     4     1     0     1     0     0     0
6 2020-02-24         1     0     0     4     2     0     1     0     0     0
# … with 1 more variable: self_isolation <dbl>

The data don’t include the number of active cases, so we need to calculate that as a the cumulative number of cases ever minus cumulative recoveries minus cumulative deaths. We start with daily reports, which includes provincial totals.

cum_drs <- 
daily_reports %>%
  select(date, new_cases, recov, died) %>%
  mutate(recov = ifelse(is.na(recov), 0, recov),
         died = ifelse(is.na(died),0,died),
         cume_new_cases = cumsum(new_cases),
         cume_recov = cumsum(recov),
         cume_died = cumsum(died),
         actives = cume_new_cases - cume_recov - cume_died) # calculate active cases

cum_drs_long <- 
cum_drs %>%
  select(contains("cume"), date, actives) %>%
  gather(key = cat, value = cases, -date) %>%
  mutate(cat = factor(cat, levels = c("cume_new_cases", "cume_recov", "cume_died","actives"), ordered = TRUE)) # convert to long form for plotting
 
# I want to include the number of new cases reported daily, but not as a line graph. To plot as a bar graph, I can isolate that data as a separate tibble.

new_cases <- 
cum_drs %>%
  select(date, new_cases) %>%
  add_column(cat = "new_cases") %>%
  rename(cases = new_cases) 

# Manually assign colours and create pretty labels for the different categories.

cat_colours <- rev(RColorBrewer::brewer.pal(5,"Set1"))
names(cat_colours) <- c("new_cases","cume_new_cases", "cume_recov", "cume_died","actives")
cat_names <- c("Cumulative # Cases","Cumulative Recovered","Cumulative Deaths", "Active Cases", "New Cases Today")


ggplot(data = cum_drs_long, aes(x = date, y = cases)) +
  geom_bar(data = new_cases, aes(x = date, y = cases, fill = cat), stat = "identity")+ # plot new cases as a bar graph
  geom_line(aes(colour = cat), size = 1) + # plot everything else as a line graph
  scale_colour_manual(values = cat_colours, labels = cat_names, name = NULL)+ # assign colours
  scale_fill_manual(values = cat_colours, labels = "New Cases Today", name = NULL ) + # assign fill for bar graph
  theme(legend.position = c(0.2, 0.7))

To look at how people are receiving care (in hospital vs. extra-hospital, and critical care vs. non-critical care in hospitals), we need to subtract hospitalized numbers from active numbers:

# I'm also going to use nicer names for the plot legend

treats_drs_long <-
  daily_reports %>%
  select(hosp, IC, self_isolation, date) %>% # select treatment categories
  left_join(cum_drs %>%
              select(date, actives)) %>% # join with current active data 
  filter(!is.na(IC)) %>%
  mutate(NC = hosp-IC,
         self_isolation = actives-hosp) %>% # calculate extra-hospitals by subtractive hospitalized patients from active cases
  select(-actives, -hosp) %>% # remove actives because we don't need that data anymore
  gather(key = state, value = cases, -date) %>% # convert to long form
  mutate(state = factor(state, levels = c("self_isolation", "NC", "IC"), ordered = TRUE))

# assign colours manually

care_colours <- RColorBrewer::brewer.pal(3,"Dark2")
names(care_colours) <- c("self_isolation", "IC", "NC")
care_names <- c("Extra-Hospital Care", "Non-critical Care (In Hospital)","Critical Care (In Hospital)")

# plot

ggplot(data = treats_drs_long, aes(x = date, y = cases))+
  geom_line(aes(colour = state), size = 1)+
  labs(colour = "Type of Care")+
  scale_colour_manual(values = care_colours, labels = care_names)+ # assign colours
  theme(legend.position = c(0.1,0.9),
        legend.justification = c(0,1)) +
  xlab("Date")+
  ylab("# of Cases")

To look at the individual health regions, we need the summary data; the daily reports do provide us the cumulative number of cases at the regional level but they don’t tell us about recoveries or hospitalizations. Unfortunately the summary reports only go back to March 23, and are released less frequently so there are gaps, which for clarity I prefer to indicate on the graphs.

The row labels in the summary report tables are long strings, which can be painful and error-prone to type out when manipulating the data. It’s easier to use abbreviated labels for data catgories, factor levels, treatments, etc., and then have the nice labels standing by for figures. I do that here, by making a csv table with my abbreviated labels and nice labels that I then read in.

# regional data from the summary reports
reg_details <- read_csv("reg_details.csv", trim_ws = TRUE, na = c("","NA"))

# table of nice and abbreviated labels
cat_labs <- read_csv("cat_labs.csv", trim_ws = TRUE, na = c("","NA")) 

# join labels to data
reg_details <- 
reg_details %>%
  left_join(cat_labs) 

# create abbreviations for health regions
map_regions <- 
  tibble(reg2 = c("FH", "INH", "NH", "VCH", "VIH"),
         HA_Name = c("Fraser", "Interior", "Northern", "Vancouver Coastal", 
                     "Vancouver Island"))

# assign colours manually
reg_colours <- rev(RColorBrewer::brewer.pal(5,"Dark2"))
names(reg_colours) <- c("FH","INH","NH","VCH","VIH")
reg_labs <- c("Fraser", "Interior", "Northern", "Vancouver Coastal", "Vancouver Island")

The data from the summary reports is organized with regions as columns and various case information as rows. We want to calculate the number of active cases in each region for each day that we have data, then use that number to calculate the number of extra-hospital cases. To get to that, we need to filter the data, and then rearrange the table to a format that facilitates our calculations. Once we have the numbers we need, we convert the data back to long form, which is best for plotting.

care <- 
reg_details %>%
  filter(cat == "cume_cases" | # filter just the data we want
           cat == "cume_died" |
           cat == "cume_recov" |
           cat == "hosp" |
           cat == "crit_care") %>%
  select(-total, -n, -category, -range, -comments) %>% # select just the regions and date
  gather(key = region, value = n, -date, -cat) %>% # convert to long form
  left_join(map_regions, 
            by = c("region" = "HA_Name")) %>% # add region abbreviations
  spread(key = cat, value = n) %>% # convert to wide form with case categories as columns
  mutate(actives = cume_cases - cume_died - cume_recov, # calculate actives
         noncrit_care = hosp - crit_care, # calculate non critical care hospitalized cases
         home_care = actives - hosp) %>% # calculate extra_hospital cases (abbreviated to home_care)
  select(date, region, reg2, home_care, noncrit_care, crit_care) %>% # select just the data we want
  gather(key = care_cat, value = n, -date, -region, -reg2) # convert to long form

We can now go about filling in the gaps in our data.

# make some nice labels for care categories
care_names_list <- c("crit_care" = "Critical Care",
       "home_care" = "Extra-hospital Care",
       "noncrit_care" = "Non-critical Care")

# convert care category to an ordered factor so that when we plot the data as a set of panels, extra-hospital care appears on top.
care <-
  care %>%
  mutate(care_cat = factor(care_cat, levels = c("home_care", "noncrit_care", "crit_care"), ordered = TRUE))

# to fill in the gaps in our data, we need the most recent and earliest dates of the data

most_recent <- 
  care %>%
  select(date) %>%
  pull() %>%
  max(.) %>%
  as_date()

earliest <- 
  care %>%
  filter(!is.na(n)) %>%
  select(date) %>%
  pull() %>%
  min(.) %>%
  as_date()

# store the difference between the earliest and latest dates

fill_dates_length2 <- 1 + interval(earliest,most_recent) %/% ddays(1)

# use the stored difference in dates to create the "full" combination of dates, regions, and care categories
care_full <- 
  tibble(date = rep(seq(earliest, most_recent, by = "day"), each = 15),
         reg2 = rep(c("FH","INH","NH","VCH","VIH"), times = fill_dates_length2, each = 3),
         care_cat = rep(c("home_care", "noncrit_care", "crit_care"), times = 5*fill_dates_length2, each = 1)) %>%
  left_join(care %>% # then join the real data
              select(-region),
            by = c("date" = "date",
                   "reg2" = "reg2",
                   "care_cat" = "care_cat")) 

# now to interpolate the missing data
care_gaps <- tibble() # create an empty tibble to store the data

for(i in 1:NROW(map_regions)){ # for each region
  for(k in 1:length(care_names_list)){ # for each care category

    # extract the earliest date for the filtered data
    min_date_gap <- 
    care_full %>%
      filter(reg2 == map_regions$reg2[i] & care_cat == names(care_names_list)[k] & !is.na(n)) %>%
      select(date) %>%
      pull(.) %>%
      min(.) %>%
      as_date()

    # identify the dates that are missing data    
gaps2 <- 
  care_full %>%
  filter(reg2 == map_regions$reg2[i] & care_cat == names(care_names_list)[k] & date >= min_date_gap) %>%
  mutate(rownum = 1:NROW(.)) %>%
  filter(is.na(n)) %>%
  pull(rownum)

gap_brack2 <- vector() # create an empty vector to store the gap positions

# for each identified row that is missing data, store the row number before, row number of missing data, and row number after
for(j in 1:length(gaps2)){
  gap_brack2 <- c(gap_brack2, gaps2[j]-1, gaps2[j], gaps2[j]+1) 
}

gap_brack2 <- unique(gap_brack2) # get rid of duplicates (instances where gaps are longer than one day)

# filter the full data set for just the rows missing data and the rows that bracket them, then interpolate
care_gapstmp <- 
  care_full %>%
  filter(reg2 == map_regions$reg2[i] & care_cat == names(care_names_list)[k] & date >= min_date_gap) %>%
  mutate(rownum = 1:NROW(.)) %>%
  filter(rownum %in% gap_brack2) %>% # filter for gaps
  mutate(n = na.approx(n)) %>% # interpolate
  select(-rownum) %>%
  left_join(map_regions, by = c("reg2" = "reg2")) %>% # add region names
  rename(region = HA_Name)

# store interpolated data
care_gaps <- 
  bind_rows(care_gaps, care_gapstmp)

  }
}

# add region names to full data set
care_full <- 
care_full %>%
  left_join(map_regions) %>%
  rename(region = HA_Name) %>%
  mutate(care_cat = factor(care_cat, levels = c("home_care", "noncrit_care", "crit_care"), ordered = TRUE))

# in order for our gap data to plot correctly, we need the dates for which we do have data set to NA and included in the interpolated data tibble
# we do this by taking our full data set, removing the data, and left joining the interpolated data to it
care_gaps <- 
care_full %>%
  select(-n) %>%
  left_join(care_gaps) %>%
  mutate(care_cat = factor(care_cat, levels = c("home_care", "noncrit_care", "crit_care"), ordered = TRUE))

# bind interpolated and observed data into one long tibble
care_all <- 
bind_rows(care_gaps %>%
            add_column(type = "inter"),
          care_full %>%
            add_column(type = "repo"))

# lastly, there are a couple observed data points that are surrounded by gaps 
# these data wont appear on the plot because they are isolated points, there are no lines to draw
# we can plot them as points

singletons <- tibble()
for(j in 1:NROW(reg_labs)){
  for(k in 1:length(care_names)){
    
    sing_tmp <- 
    care_all %>%
              filter(reg2 == map_regions$reg2[j] & care_cat == names(care_names_list)[k] & type == "repo" & is.na(n)) %>%
              mutate(date2 = c(NA,diff(date) == 2)) %>%
              filter(date2 == TRUE) %>%
              select(date, reg2, care_cat, type) %>%
              mutate(date = date - 1) %>%
              left_join(care_all)

    singletons <- bind_rows(singletons, sing_tmp)
        
  }
}

# plot
ggplot(data = care_all, aes(x = date, y = n))+
  geom_line(aes(colour = reg2, linetype = type, size = type))+
  geom_point(data = singletons, aes(x = date, y = n, colour = reg2), shape = 15, size = 1)+
  facet_wrap(.~care_cat, ncol = 1, scales = "free", strip.position = "right", labeller = labeller(care_cat = care_names_list))+
  scale_linetype_manual(name = "Data type", values = c("dotted", "solid"), labels = c("Interpolated", "Reported"))+
  scale_size_manual(name = "Data type", values = c(0.5, 1), labels = c("Interpolated", "Reported"))+
  scale_colour_manual(values = reg_colours, labels = reg_labs)+
  labs(colour = "Health Region")+
  xlab("Date")+
  ylab("# of Cases") 

We can use the same data to plot case categories by region:

# the summary reports-based tibbles use slightly different category names
# need to make a new colour list

care_colours2 <- care_colours

names(care_colours2) <- c("home_care", "crit_care", "noncrit_care")
reg_labs2 <- c("FH", "INH", "NH", "VCH", "VIH")
names(reg_labs) <- reg_labs2

ggplot(data = care_all, aes(x = date, y = n))+
  geom_line(aes(colour = care_cat, linetype = type, size = type))+
  geom_point(data = singletons, aes(x = date, y = n, colour = care_cat), shape = 15, size = 1 )+
  facet_wrap(.~reg2, ncol = 1, scales = "free", strip.position = "right", 
             labeller = labeller(reg2 = reg_labs)) +
  scale_linetype_manual(name = "Data type", values = c("dotted", "solid"), labels = c("Interpolated", "Reported"))+
  scale_size_manual(name = "Data type", values = c(0.5, 1), labels = c("Interpolated", "Reported"))+
  scale_colour_manual(values = care_colours2, labels = care_names_list)+
  labs(colour = "Type of Care")+
  xlab("Date") + 
  ylab("# of Cases") 

To standardize data from each region by population, we can use the population data from BCStats (estimates are from 2019, data licensed under the Open Government Licence - British Columbia).

reg_pops <- read_csv("reg_pops.csv", trim_ws = TRUE, na = c("","NA"))

# keep just totals

reg_pops <- 
reg_pops %>%
  filter(cat == "total") %>%
  gather(key = reg2, value = tot, -cat) %>%
  select(-cat)

# left join populations onto care data, calculate cases per 100,000

care_all2 <- 
care_all %>%
  left_join(reg_pops) %>%
  mutate(nprop = 100000*n/tot)

singletons2 <- 
singletons %>%
  left_join(reg_pops) %>%
  mutate(nprop = 100000*n/tot)

# plot

ggplot(data = care_all2, aes(x = date, y = nprop))+
  geom_line(aes(colour = reg2, linetype = type, size = type))+
  geom_point(data = singletons2, aes(x = date, y = nprop, colour = reg2), shape = 15, size = 1)+
  facet_wrap(.~care_cat, ncol = 1, scale = "free", strip.position = "right", labeller = labeller(care_cat = care_names_list))+
  scale_linetype_manual(name = "Data type", values = c("dotted", "solid"), labels = c("Interpolated", "Reported"))+
  scale_size_manual(name = "Data type", values = c(0.5, 1), labels = c("Interpolated", "Reported"))+
  scale_colour_manual(values = reg_colours, labels = reg_labs)+
  labs(colour = "Health Region")+
  xlab("Date")+
  ylab("# of Cases per 100,000") #+

Et voila!

All analyses performed in R V3.6.3

ggplot2 covid-19 colourbrewer

Share