First, we’ll look at counties.
We fetch various signals from our API, from April 15 through to the current day.
library(covidcast)
library(dplyr)
library(ggplot2)
# Sources, signal names, and plot labels for the indicators fetched from the
# API. The three vectors are parallel: position k in each describes one signal.
sources <- c(
  "doctor-visits", "fb-survey", "fb-survey", "hospital-admissions",
  "indicator-combination"
)
signals <- c(
  "smoothed_adj_cli", "smoothed_cli", "smoothed_hh_cmnty_cli",
  "smoothed_adj_covid19", "nmf_day_doc_fbc_fbs_ght"
)
names <- c(
  "Doctor visits", "Facebook CLI", "Facebook CLI-in-community",
  "Hospitalizations", "Combo indicator"
)

# Date window shared by every request. end_day stays NULL so that no explicit
# end date is passed to the API.
start_day <- "2020-04-15"
end_day <- NULL

# One request per (source, signal) pair; results are kept in the same order as
# `signals`, so df_signals[[k]] holds the data for signals[k].
df_signals <- lapply(seq_along(signals), function(k) {
  covidcast_signal(sources[k], signals[k], start_day, end_day)
})

# USAFacts confirmed case incidence proportion (smoothed with 7-day trailing
# average), over the same date window
df_cases <- covidcast_signal(
  "usa-facts", "confirmed_7dav_incidence_prop", start_day, end_day
)
Here we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by time. That is, for each day, we compute the correlation between each signal and COVID-19 case incidence rates, over all
counties (with at least 500 cumulative cases).
# Minimum cumulative case count a county needs to be included
case_num <- 500

# Look up cumulative case counts on the latest day present in df_cases and
# keep the counties that meet the threshold
latest_day <- max(df_cases$time_value)
geo_values <- covidcast_signal(
  "usa-facts", "confirmed_cumulative_num", latest_day, latest_day
) %>%
  filter(value >= case_num) %>%
  pull(geo_value)

# The case data restricted to those counties is the same for every signal, so
# filter it once up front
cases_kept <- df_cases %>%
  filter(geo_value %in% geo_values)

# For each signal: Spearman correlation with case rates, sliced by
# time_value, tagged with the signal's display name
df_cor <- lapply(seq_along(signals), function(k) {
  signal_kept <- df_signals[[k]] %>%
    filter(geo_value %in% geo_values)
  cor_k <- covidcast_cor(signal_kept, cases_kept,
                         by = "time_value", method = "spearman")
  cor_k$signal <- names[k]
  cor_k
})

# Stack the per-signal results and draw one correlation-over-time line each
df <- do.call(rbind, df_cor)
ggplot(df, aes(x = time_value, y = value)) +
  geom_line(aes(color = signal)) +
  guides(color = guide_legend(nrow = 2)) +
  labs(
    title = "Correlation between signals and case rates",
    subtitle = sprintf(
      "Over all counties with at least %i cumulative cases", case_num
    ),
    x = "Date",
    y = "Correlation"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())
Now we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by county. That is, for each county (with at least 500 cumulative cases), we compute the correlation between each signal and COVID-19 case incidence rates, over all time.
# For each signal: Spearman correlation with case rates, sliced by geo_value
# (one correlation per county), restricted to the counties in geo_values and
# tagged with the signal's display name
df_cor <- vector("list", length(signals))
for (i in seq_along(signals)) {
  df_cor[[i]] <- covidcast_cor(
    df_signals[[i]] %>% filter(geo_value %in% geo_values),
    df_cases %>% filter(geo_value %in% geo_values),
    by = "geo_value", method = "spearman"
  )
  df_cor[[i]]$signal <- names[i]
}

# Stack the per-signal results and plot the distribution of the per-county
# correlations. The x axis is the correlation value itself and the y axis is
# the estimated density, so the axes are labelled accordingly (not "Date" /
# "Correlation" as on the time-series plot).
df <- do.call(rbind, df_cor)
ggplot(df, aes(value)) +
  geom_density(aes(color = signal, fill = signal), alpha = 0.4) +
  guides(color = guide_legend(nrow = 2)) +
  labs(
    title = "Correlation between signals and case rates",
    subtitle = sprintf(
      "Over all counties with at least %i cumulative cases", case_num
    ),
    x = "Correlation",
    y = "Density"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())
We can also look at choropleth maps to get a geographic sense of the correlation distribution for each signal.
# Dress each per-county correlation table up with the extra fields set below
# (time_value, issue, geo_type attribute, covidcast_signal class), then plot
# it as a choropleth map using covidcast functionality
for (k in seq_along(signals)) {
  cor_map <- df_cor[[k]]

  # Placeholder dates: every row gets the same time_value and issue
  cor_map$time_value <- start_day
  cor_map$issue <- start_day

  # Mark the table as county-level covidcast signal data
  attributes(cor_map)$geo_type <- "county"
  class(cor_map) <- c("covidcast_signal", "data.frame")

  # Store the modified table back so df_cor reflects the added fields
  df_cor[[k]] <- cor_map

  # Color scale spans the full correlation range [-1, 1]
  print(plot(cor_map, range = c(-1, 1), choro_col = cm.colors(10),
             title = sprintf("Correlations for %s", names[k])))
}
Next, we’ll look at metro areas.
We fetch various signals from our API, from April 15 through to the current day.
# Sources, signal names, and plot labels for the indicators fetched from the
# API. The three vectors are parallel: position k in each describes one signal.
sources <- c(
  "doctor-visits", "fb-survey", "fb-survey", "ght",
  "hospital-admissions", "indicator-combination"
)
signals <- c(
  "smoothed_adj_cli", "smoothed_cli", "smoothed_hh_cmnty_cli",
  "smoothed_search", "smoothed_adj_covid19",
  "nmf_day_doc_fbc_fbs_ght"
)
names <- c(
  "Doctor visits", "Facebook CLI", "Facebook CLI-in-community",
  "Google trends", "Hospitalizations", "Combo indicator"
)

# Date window shared by every request. end_day stays NULL so that no explicit
# end date is passed to the API.
start_day <- "2020-04-15"
end_day <- NULL

# One request per (source, signal) pair at the "msa" geo type; results are
# kept in the same order as `signals`
df_signals <- lapply(seq_along(signals), function(k) {
  covidcast_signal(sources[k], signals[k], start_day, end_day,
                   geo_type = "msa")
})

# USAFacts confirmed case incidence proportion (smoothed with 7-day trailing
# average), over the same date window and geo type
df_cases <- covidcast_signal(
  "usa-facts", "confirmed_7dav_incidence_prop",
  start_day, end_day,
  geo_type = "msa"
)
Here we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by time. That is, for each day, we compute the correlation between each signal and COVID-19 case incidence rates, over all metro areas (with at least 500 cumulative cases).
# Minimum cumulative case count a metro area needs to be included
case_num <- 500

# Look up cumulative case counts on the latest day present in df_cases and
# keep the metro areas that meet the threshold
latest_day <- max(df_cases$time_value)
geo_values <- covidcast_signal(
  "usa-facts", "confirmed_cumulative_num", latest_day, latest_day,
  geo_type = "msa"
) %>%
  filter(value >= case_num) %>%
  pull(geo_value)

# The case data restricted to those metro areas is the same for every signal,
# so filter it once up front
cases_kept <- df_cases %>%
  filter(geo_value %in% geo_values)

# For each signal: Spearman correlation with case rates, sliced by
# time_value, tagged with the signal's display name
df_cor <- lapply(seq_along(signals), function(k) {
  signal_kept <- df_signals[[k]] %>%
    filter(geo_value %in% geo_values)
  cor_k <- covidcast_cor(signal_kept, cases_kept,
                         by = "time_value", method = "spearman")
  cor_k$signal <- names[k]
  cor_k
})

# Stack the per-signal results and draw one correlation-over-time line each
df <- do.call(rbind, df_cor)
ggplot(df, aes(x = time_value, y = value)) +
  geom_line(aes(color = signal)) +
  guides(color = guide_legend(nrow = 2)) +
  labs(
    title = "Correlation between signals and case rates",
    subtitle = sprintf(
      "Over metro areas with at least %i cumulative cases", case_num
    ),
    x = "Date",
    y = "Correlation"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())
Now we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by metro area. That is, for each metro area (with at least 500 cumulative cases), we compute the correlation between each signal and COVID-19 case incidence rates, over all time.
# For each signal: Spearman correlation with case rates, sliced by geo_value
# (one correlation per metro area), restricted to the metro areas in
# geo_values and tagged with the signal's display name
df_cor <- vector("list", length(signals))
for (i in seq_along(signals)) {
  df_cor[[i]] <- covidcast_cor(
    df_signals[[i]] %>% filter(geo_value %in% geo_values),
    df_cases %>% filter(geo_value %in% geo_values),
    by = "geo_value", method = "spearman"
  )
  df_cor[[i]]$signal <- names[i]
}

# Stack the per-signal results and plot the distribution of the per-metro-area
# correlations. The x axis is the correlation value itself and the y axis is
# the estimated density, so the axes are labelled accordingly (not "Date" /
# "Correlation" as on the time-series plot).
df <- do.call(rbind, df_cor)
ggplot(df, aes(value)) +
  geom_density(aes(color = signal, fill = signal), alpha = 0.4) +
  guides(color = guide_legend(nrow = 2)) +
  labs(
    title = "Correlation between signals and case rates",
    subtitle = sprintf(
      "Over metro areas with at least %i cumulative cases", case_num
    ),
    x = "Correlation",
    y = "Density"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())