County analysis

First, we’ll look at counties.

Getting data

We fetch various signals from our API, from April 15 through to the current day.

library(covidcast)
library(dplyr)
library(ggplot2)

# Fetch the following sources and signals from the API.
# sources[i]/signals[i]/names[i] are parallel vectors: the API source, the
# signal name within that source, and a human-readable label for plotting.
sources <- c("doctor-visits", "fb-survey", "fb-survey", "hospital-admissions", 
             "indicator-combination")
signals <- c("smoothed_adj_cli", "smoothed_cli", "smoothed_hh_cmnty_cli", 
             "smoothed_adj_covid19", "nmf_day_doc_fbc_fbs_ght")
names <- c("Doctor visits", "Facebook CLI", "Facebook CLI-in-community", 
           "Hospitalizations", "Combo indicator")

start_day <- "2020-04-15"
end_day <- NULL  # NULL means "through the most recent day available"

# Preallocate the list, then fetch one county-level data frame per signal.
# seq_along() (rather than 1:length()) is safe even if signals is empty.
df_signals <- vector("list", length(signals))
for (i in seq_along(signals)) {
  df_signals[[i]] <- covidcast_signal(sources[i], signals[i], start_day,
                                      end_day)
}

# Fetch USAFacts confirmed case incidence proportion (smoothed with 7-day 
# trailing average)
df_cases <- covidcast_signal("usa-facts", "confirmed_7dav_incidence_prop",
                             start_day, end_day)

Correlations sliced by time

Here we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by time. That is, for each day, we compute the correlation between each signal and COVID-19 case incidence rates, over all
counties (with at least 500 cumulative cases).

# Consider only counties with at least 500 cumulative cases. We query the
# cumulative count on the latest day present in df_cases, keep counties at
# or above the threshold, and extract their FIPS codes.
case_num <- 500
latest_day <- max(df_cases$time_value)
geo_values <- covidcast_signal("usa-facts", "confirmed_cumulative_num",
                               latest_day, latest_day) %>%
  filter(value >= case_num) %>%
  pull(geo_value)

# For each day, compute the Spearman correlation between each signal and
# case rates across the qualifying counties, then stack the results into
# one data frame with a `signal` label column for plotting.
# seq_along() replaces the unsafe 1:length() pattern.
df_cor <- vector("list", length(signals))
for (i in seq_along(signals)) {
  df_cor[[i]] <- covidcast_cor(df_signals[[i]] %>% 
                                 filter(geo_value %in% geo_values), 
                               df_cases %>% 
                                 filter(geo_value %in% geo_values), 
                               by = "time_value", method = "spearman")
  df_cor[[i]]$signal <- names[i]
}
df <- do.call(rbind, df_cor)

# One correlation-over-time curve per signal, legend along the bottom.
ggplot(df, aes(x = time_value, y = value)) +
  geom_line(aes(color = signal)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over all counties with at least %i cumulative cases",
                          case_num),
       x = "Date", y = "Correlation") +
  guides(color = guide_legend(nrow = 2)) +
  theme(legend.position = "bottom", legend.title = element_blank())

Correlations sliced by county

Now we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by county. That is, for each county (with at least 500 cumulative cases), we compute the correlation between each signal and COVID-19 case incidence rates, over all time.

# For each qualifying county, compute the Spearman correlation between each
# signal and case rates over all time, then stack into one labeled data
# frame. seq_along() replaces the unsafe 1:length() pattern.
df_cor <- vector("list", length(signals))
for (i in seq_along(signals)) {
  df_cor[[i]] <- covidcast_cor(df_signals[[i]] %>% 
                                 filter(geo_value %in% geo_values), 
                               df_cases %>% 
                                 filter(geo_value %in% geo_values), 
                               by = "geo_value", method = "spearman")
  df_cor[[i]]$signal <- names[i]
}
df <- do.call(rbind, df_cor)

# Density of per-county correlations, one curve per signal. The x axis is
# the correlation value itself (the previous labels "Date"/"Correlation"
# were copied from the time-series plot and were wrong for a density).
ggplot(df, aes(value)) +
  geom_density(aes(color = signal, fill = signal), alpha = 0.4) +
  guides(color = guide_legend(nrow = 2)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over all counties with at least %i cumulative cases",
                          case_num), x = "Correlation", y = "Density") +
  theme(legend.position = "bottom", legend.title = element_blank())

We can also look at choropleth maps to get a geographic sense of the correlation distribution for each signal.

# Coerce each correlation data frame into a "covidcast_signal" object so we
# can reuse the package's choropleth plot method. The time_value and issue
# fields are required by that class but their values are arbitrary here.
for (i in seq_along(signals)) {
  df_cor[[i]]$time_value <- start_day
  df_cor[[i]]$issue <- start_day
  attributes(df_cor[[i]])$geo_type <- "county"
  class(df_cor[[i]]) <- c("covidcast_signal", "data.frame")
  
  # Diverging palette over the full [-1, 1] correlation range; print() is
  # needed so the plot renders from inside the loop.
  print(plot(df_cor[[i]], range = c(-1, 1), choro_col = cm.colors(10),
             title = sprintf("Correlations for %s", names[i])))
}

Metro area analysis

Next, we’ll look at metro areas.

Getting data

We fetch various signals from our API, from April 15 through to the current day.

# Fetch the following sources and signals from the API.
# Same parallel source/signal/label vectors as the county analysis, with
# Google Trends ("ght") added; all queries are at metro-area resolution.
sources <- c("doctor-visits", "fb-survey", "fb-survey", "ght",
             "hospital-admissions", "indicator-combination")
signals <- c("smoothed_adj_cli", "smoothed_cli", "smoothed_hh_cmnty_cli", 
             "smoothed_search", "smoothed_adj_covid19", 
             "nmf_day_doc_fbc_fbs_ght")
names <- c("Doctor visits", "Facebook CLI", "Facebook CLI-in-community", 
           "Google trends", "Hospitalizations", "Combo indicator")

start_day <- "2020-04-15"
end_day <- NULL  # NULL means "through the most recent day available"

# Preallocate the list, then fetch one metro-level data frame per signal.
# seq_along() (rather than 1:length()) is safe even if signals is empty.
df_signals <- vector("list", length(signals))
for (i in seq_along(signals)) {
  df_signals[[i]] <- covidcast_signal(sources[i], signals[i], start_day,
                                      end_day, geo_type = "msa")
}

# Fetch USAFacts confirmed case incidence proportion (smoothed with 7-day
# trailing average)
df_cases <- covidcast_signal("usa-facts", "confirmed_7dav_incidence_prop",
                             start_day, end_day, geo_type = "msa")

Correlations sliced by time

Here we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by time. That is, for each day, we compute the correlation between each signal and COVID-19 case incidence rates, over all metro areas (with at least 500 cumulative cases).

# Consider only metro areas with at least 500 cumulative cases. We query
# the cumulative count on the latest day present in df_cases, keep metro
# areas at or above the threshold, and extract their codes.
case_num <- 500
latest_day <- max(df_cases$time_value)
geo_values <- covidcast_signal("usa-facts", "confirmed_cumulative_num",
                               latest_day, latest_day,
                               geo_type = "msa") %>%
  filter(value >= case_num) %>%
  pull(geo_value)

# For each day, compute the Spearman correlation between each signal and
# case rates across the qualifying metro areas, then stack the results
# into one labeled data frame. seq_along() replaces the unsafe 1:length().
df_cor <- vector("list", length(signals))
for (i in seq_along(signals)) {
  df_cor[[i]] <- covidcast_cor(df_signals[[i]] %>% 
                                 filter(geo_value %in% geo_values), 
                               df_cases %>% 
                                 filter(geo_value %in% geo_values), 
                               by = "time_value", method = "spearman")
  df_cor[[i]]$signal <- names[i]
}
df <- do.call(rbind, df_cor)

# One correlation-over-time curve per signal, legend along the bottom.
ggplot(df, aes(x = time_value, y = value)) +
  geom_line(aes(color = signal)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over metro areas with at least %i cumulative cases",
                          case_num),
       x = "Date", y = "Correlation") +
  guides(color = guide_legend(nrow = 2)) +
  theme(legend.position = "bottom", legend.title = element_blank())

Correlations sliced by metro area

Now we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by metro area. That is, for each metro area (with at least 500 cumulative cases), we compute the correlation between each signal and COVID-19 case incidence rates, over all time.

# For each qualifying metro area, compute the Spearman correlation between
# each signal and case rates over all time, then stack into one labeled
# data frame. seq_along() replaces the unsafe 1:length() pattern.
df_cor <- vector("list", length(signals))
for (i in seq_along(signals)) {
  df_cor[[i]] <- covidcast_cor(df_signals[[i]] %>% 
                                 filter(geo_value %in% geo_values), 
                               df_cases %>% 
                                 filter(geo_value %in% geo_values), 
                               by = "geo_value", method = "spearman")
  df_cor[[i]]$signal <- names[i]
}
df <- do.call(rbind, df_cor)

# Density of per-metro-area correlations, one curve per signal. The x axis
# is the correlation value itself (the previous labels "Date"/"Correlation"
# were copied from the time-series plot and were wrong for a density).
ggplot(df, aes(value)) +
  geom_density(aes(color = signal, fill = signal), alpha = 0.4) +
  guides(color = guide_legend(nrow = 2)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over metro areas with at least %i cumulative cases",
                          case_num), x = "Correlation", y = "Density") +
  theme(legend.position = "bottom", legend.title = element_blank())