Candidate Eval: Doctor Visits: Smoothed Adj CLI via COVID Hospital Admissions (HHS)
This notebook evaluates a candidate indicator against a guiding indicator (presumed ground truth) to determine whether the candidate has value for nowcasting or forecasting.
1 Data Loading
Code
# Load the guiding (presumed ground-truth) indicator: from a local CSV when
# one is configured, otherwise from the COVIDcast API. Either way the result
# is restricted to the evaluation window and converted to an epi_df.
df_guiding <- if (!is.null(params$guiding_csv)) {
  read.csv(params$guiding_csv) %>%
    mutate(time_value = as.Date(time_value)) %>%
    filter(
      time_value >= as.Date(params$start_day),
      time_value <= as.Date(params$end_day)
    ) %>%
    as_epi_df()
} else {
  pub_covidcast(
    params$guiding_source, params$guiding_indicator,
    params$geo_type, params$time_type,
    time_values = time_range(params$start_day, params$end_day),
    as_of = Sys.Date()
  ) %>% as_epi_df()
}
cat(sprintf(
"- **Summary**: %s rows across %s locations\n",
label_comma()(nrow(df_guiding)), label_comma()(n_distinct(df_guiding$geo_value))
))- Summary: 48,892 rows across 54 locations
Code
# Load the candidate indicator. A CSV source may carry a revision history
# (a "version" or "issue" column); when it does, we also build an
# epi_archive and use its latest snapshot as the working data frame.
candidate_archive <- NULL
if (!is.null(params$candidate_csv)) {
  df_candidate_raw <- read.csv(params$candidate_csv) %>%
    mutate(time_value = as.Date(time_value)) %>%
    filter(
      time_value >= as.Date(params$start_day),
      time_value <= as.Date(params$end_day)
    )
  # CSV may include version/issue columns for revision analysis
  has_versions <- any(c("version", "issue") %in% colnames(df_candidate_raw))
  if (has_versions) {
    # Normalize the column name to "version", then parse it as a date.
    if ("issue" %in% names(df_candidate_raw)) {
      df_candidate_raw <- rename(df_candidate_raw, version = issue)
    }
    df_candidate_raw <- mutate(df_candidate_raw, version = as.Date(version))
    candidate_archive <- df_candidate_raw %>%
      select(geo_value, time_value, version, value) %>%
      as_epi_archive()
    # Working data = the latest-available snapshot across all versions.
    df_candidate <- epix_as_of(candidate_archive, candidate_archive$versions_end)
    cat(sprintf(
      "- CSV contains version history: %s versions\n",
      label_comma()(n_distinct(df_candidate_raw$version))
    ))
  } else {
    df_candidate <- df_candidate_raw %>% as_epi_df()
  }
} else {
  df_candidate <- pub_covidcast(
    params$candidate_source, params$candidate_indicator,
    params$geo_type, params$time_type,
    time_values = time_range(params$start_day, params$end_day),
    as_of = Sys.Date()
  ) %>% as_epi_df()
}
cat(sprintf(
"- **Summary**: %s rows across %s locations\n",
label_comma()(nrow(df_candidate)), label_comma()(n_distinct(df_candidate$geo_value))
))- Summary: 47,668 rows across 53 locations
Code
# Fetch revision history from API if not already built from CSV
# Fetch revision history from API if not already built from CSV
if (is.null(candidate_archive) && is.null(params$candidate_csv)) {
  all_geos <- unique(df_candidate$geo_value)
  # Sample locations to keep the `issues = "*"` request tractable.
  archive_geos <- random_sample(all_geos, params$max_archive_locs)
  # Adjust lookback window to the signal's cadence; default mirrors daily.
  lookback <- switch(params$time_type,
    "day" = lubridate::weeks(31),
    "week" = lubridate::weeks(150),
    "month" = lubridate::months(36),
    lubridate::weeks(31)
  )
  archive_start <- as.character(max(
    as.Date(params$start_day),
    as.Date(params$end_day) - lookback
  ))
  # Metadata about the sampling, reported in the narrative text.
  archive_info <- list(
    n_sampled = length(archive_geos),
    n_total = length(all_geos),
    time_value_start = archive_start,
    time_value_end = params$end_day
  )
  # Request only for the sampled locations and the relevant time range
  # to avoid server-side timeouts/errors on large indicators.
  # FIX: removed the trailing comma after `issues = "*"` — a trailing comma
  # in an R function call raises an "argument is empty" error.
  df_archive_raw <- pub_covidcast(
    source = params$candidate_source,
    signal = params$candidate_indicator,
    geo_type = params$geo_type,
    time_type = params$time_type,
    geo_values = archive_geos,
    time_values = time_range(archive_start, params$end_day),
    issues = "*"
  )
  if (nrow(df_archive_raw) > 0 && "issue" %in% colnames(df_archive_raw)) {
    archive_info$version_start <- min(df_archive_raw$issue)
    archive_info$version_end <- max(df_archive_raw$issue)
    candidate_archive <- df_archive_raw %>%
      rename(version = issue) %>%
      select(geo_value, time_value, version, value) %>%
      as_epi_archive()
  }
}Code
# Infer the working time type from the guiding epi_df's metadata, falling
# back to the configured parameter when the metadata is absent or malformed.
env_time_type <- attributes(df_guiding)$metadata$time_type
bad_meta <- is.null(env_time_type) ||
  length(env_time_type) != 1 ||
  is.na(env_time_type) ||
  !is.character(env_time_type)
if (bad_meta) {
  env_time_type <- params$time_type
}
# When time_type is weekly, snap every date to its Sunday and average any
# duplicate (geo, week) rows so both datasets share one weekly grid.
if (env_time_type == "week") {
  weekly_mean <- function(d) {
    d %>%
      mutate(time_value = round_to_sunday(time_value)) %>%
      group_by(geo_value, time_value) %>%
      summarize(value = mean(value, na.rm = TRUE), .groups = "drop") %>%
      as_epi_df()
  }
  df_guiding <- weekly_mean(df_guiding)
  df_candidate <- weekly_mean(df_candidate)
  cat("- Aligned both datasets to Sunday-start weeks \n")
}
# Combine into a single epi_df
# Build the full time grid over the evaluation window; for weekly data the
# endpoints are snapped to Sundays so they land on the aligned grid.
grid_start <- as.Date(params$start_day)
grid_end <- as.Date(params$end_day)
if (env_time_type == "week") {
grid_start <- round_to_sunday(grid_start)
grid_end <- round_to_sunday(grid_end)
}
# Full (outer) join keeps dates where only one indicator reported, then
# complete() inserts explicit NA rows for every missing grid date per geo.
df <- df_guiding %>%
rename(guiding = value) %>%
full_join(
df_candidate %>% select(geo_value, time_value, value) %>%
rename(candidate = value),
by = c("geo_value", "time_value")
) %>%
group_by(geo_value) %>%
complete(
time_value = seq.Date(
from = grid_start,
to = grid_end,
# seq.Date accepts "day"/"week"/"month"/"year" steps; env_time_type is
# assumed to be one of these — TODO confirm for exotic time types.
by = env_time_type
)
) %>%
ungroup() %>%
as_epi_df()
n_locations <- n_distinct(df$geo_value)
cat(sprintf(
"- **Combined (full join)**: %s rows, %s locations (%s to %s) \n",
label_comma()(nrow(df)), label_comma()(n_locations),
min(df$time_value), max(df$time_value)
))- Combined (full join): 49,248 rows, 54 locations (2020-09-01 to 2023-03-01)
Code
# Keep only rows where both indicators are observed (the intersection).
df_inter <- df %>%
  filter(!is.na(candidate) & !is.na(guiding))
cat(sprintf(
"- **Intersection (both present)**: %s rows (%.1f%% of full grid)\n\n",
label_comma()(nrow(df_inter)), 100 * nrow(df_inter) / nrow(df)
))- Intersection (both present): 47,668 rows (96.8% of full grid)
Code
# Actual span of the combined grid; note these shadow the params-derived
# window used above.
start_day <- min(df$time_value)
end_day <- max(df$time_value)
n_inter_locations <- n_distinct(df_inter$geo_value)
# Define adaptive units and lag ranges for downstream analysis
# Human-readable unit name for axis labels; daily is the fallback.
time_unit <- switch(env_time_type %||% "unknown",
week = "weeks",
month = "months",
year = "years",
"days" # default
)
lag_unit_label <- sprintf("Lag (%s)", time_unit)
# Standard discrete lags to check
# Lags are in native time-type steps (weeks/months/years); the daily
# default is in days (multiples of a week to dodge day-of-week effects).
lag_values <- switch(env_time_type %||% "unknown",
week = -2:2,
month = -2:2,
year = -1:1,
c(-14, -7, 0, 7, 14) # default
)
# Systematic sweep range (includes negative lags/leads)
lags_sweep <- switch(env_time_type %||% "unknown",
week = -5:5,
month = -5:5,
year = -2:2,
-21:21 # default
)2 Exploratory Data Analysis
The data is aggregated at the state level. The intersection of candidate and guiding datasets contains 53 valid locations measured at a day-level frequency.
2.1 Time Series Overview
This section provides a direct visual comparison of the guiding and candidate indicators over time for a subset of observation locations. We start by looking at the raw values for both indicators across a sample of locations. Then, a third plot overlays both indicators after normalizing them.
Code
# Pick a diverse sample of locations for visualization
all_geos <- sort(unique(df_inter$geo_value))
n_plot <- min(length(all_geos), params$max_locations_plot)
sample_geos <- random_sample(all_geos, n_plot)
# Long format with one row per (geo, date, indicator); value_norm is a
# z-score computed within each (geo, indicator) series so the two
# indicators can be overlaid despite different scales.
df_long <- df_inter %>%
filter(geo_value %in% sample_geos) %>%
pivot_longer(
cols = c(candidate, guiding),
names_to = "indicator", values_to = "value"
) %>%
group_by(geo_value, indicator) %>%
mutate(value_norm = (value - mean(value, na.rm = TRUE)) / sd(value, na.rm = TRUE)) %>%
ungroup() %>%
# Replace internal keys with the display names configured in params.
mutate(indicator = recode(indicator,
candidate = params$candidate_name,
guiding = params$guiding_name
))
# Determine point size based on timeline duration for visibility
# Longer evaluation windows get smaller points so dense series stay legible.
n_days_eval <- as.numeric(difftime(as.Date(params$end_day), as.Date(params$start_day), units = "days"))
eda_point_size <- if (n_days_eval > 730) 0.3 else if (n_days_eval > 365) 0.5 else 0.8
ggplot(df_long %>% filter(indicator == params$guiding_name), aes(x = time_value, y = value, color = indicator)) +
geom_line(linewidth = 0.5, alpha = 0.3) +
geom_point(size = eda_point_size, alpha = 0.5) +
facet_wrap(~geo_value, ncol = 3, scales = "free_y") +
scale_x_date(breaks = breaks_pretty(), label = label_date_short()) +
scale_color_manual(values = dis_pal_list[1]) +
labs(
title = "Guiding Indicator",
subtitle = params$guiding_name,
x = "Date", y = "Value", color = "Indicator"
) +
theme(legend.position = "none")Code
ggplot(df_long %>% filter(indicator == params$candidate_name), aes(x = time_value, y = value, color = indicator)) +
geom_line(linewidth = 0.5, alpha = 0.3) +
geom_point(size = eda_point_size, alpha = 0.5) +
facet_wrap(~geo_value, ncol = 3, scales = "free_y") +
scale_x_date(breaks = breaks_pretty(), label = label_date_short()) +
scale_color_manual(values = dis_pal_list[2]) +
labs(
title = "Candidate Indicator",
subtitle = params$candidate_name,
x = "Date", y = "Value", color = "Indicator"
) +
theme(legend.position = "none")In this normalized view, overlapping patterns, lead-lag relationships, or divergent behavior become easier to spot. For instance, if the candidate curve tends to rise before the guiding curve, we might have a useful leading relationship.
Code
ggplot(df_long, aes(x = time_value, y = value_norm, color = indicator)) +
geom_line(linewidth = 0.7, alpha = 0.3) +
geom_point(size = eda_point_size, alpha = 0.4) +
facet_wrap(~geo_value, ncol = 3) +
scale_x_date(breaks = breaks_pretty(), label = label_date_short()) +
labs(
title = "Trend Comparison",
subtitle = "Normalized indicators per location",
x = "Date", y = "StandardizedValue", color = "Indicator"
)2.2 Quantile Trends
This plot displays the distribution of both indicators over time across all included locations. The solid line represents the median, the inner ribbon shows the 25th to 75th percentile range, and the outer ribbon covers the 5th to 95th percentile range. We smooth the data with a moving average to handle day-of-week effects, and we use different scales for the top and bottom panels so each indicator’s distribution is clearly visible.
By observing the width of these ribbons, you can get a sense of geographic variability. Narrow ribbons mean the values are tightly clustered across locations, while wide ribbons show more spread. This helps us assess the overall stability and systemic trends of the indicators.
Code
cat("\n")Code
# Collect every location's value for each (date, indicator) into a
# list-column, so per-date cross-sectional quantiles can be computed later.
# The indicator name is copied into geo_value because epi_slide below
# groups by geo_value; here each "geo" is really one indicator's series.
df_quantiles_wide <- df_inter %>%
pivot_longer(
cols = c(candidate, guiding),
names_to = "indicator", values_to = "value"
) %>%
group_by(time_value, indicator) %>%
summarize(vals = list(value), .groups = "drop") %>%
mutate(geo_value = indicator) %>%
as_epi_df()
# Handle time frequency differences for .window_size.
# FIX: base R's difftime supports only secs/mins/hours/days/weeks units, so
# as.difftime(3, units = "months"/"years") errors with "invalid units
# specified". For month/year cadences we instead pass a plain integer,
# which epi_slide interprets in the epi_df's native time-type steps.
window_size <- switch(env_time_type %||% "unknown",
  day = 7L, # 7-day smoothing
  week = as.difftime(3, units = "weeks"), # 3-week smoothing
  month = 3L, # 3-month smoothing
  year = 3L, # 3-year smoothing
  Inf # Default to cumulative for unknown types
  # (too sparse for sliding averages)
)
# Slide over time within each pseudo-geo (one per indicator), pooling all
# locations' values in the window (unlist of the list-column) and taking
# cross-sectional quantiles. This smooths out day-of-week effects.
df_quantiles_wide <- df_quantiles_wide %>%
group_by(geo_value) %>%
epi_slide(
~ tibble(
`0.05` = quantile(unlist(.x$vals), 0.05, na.rm = TRUE),
`0.25` = quantile(unlist(.x$vals), 0.25, na.rm = TRUE),
`0.5` = quantile(unlist(.x$vals), 0.5, na.rm = TRUE),
`0.75` = quantile(unlist(.x$vals), 0.75, na.rm = TRUE),
`0.95` = quantile(unlist(.x$vals), 0.95, na.rm = TRUE)
),
.window_size = window_size
) %>%
ungroup() %>%
# Map the internal indicator keys back to display names for faceting.
mutate(
indicator = recode(geo_value,
candidate = params$candidate_name,
guiding = params$guiding_name
)
)
ggplot(df_quantiles_wide, aes(x = time_value)) +
geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = "5%-95%"), alpha = 0.2) +
geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`, fill = "25%-75%"), alpha = 0.4) +
geom_line(aes(y = `0.5`, color = "Median"), linewidth = 1) +
facet_wrap(~indicator, ncol = 1, scales = "free_y") +
scale_x_date(breaks = breaks_pretty(), label = label_date_short()) +
scale_y_continuous(labels = scales::label_comma()) +
scale_fill_manual(values = c("5%-95%" = dis_pal_list[2], "25%-75%" = dis_pal_list[2])) +
scale_color_manual(values = c("Median" = dis_pal_list[2])) +
labs(
title = "Smoothed Quantile Trends",
subtitle = sprintf("Across all %d locations, smoothed window: %s", n_inter_locations, format(window_size)),
x = "Date", y = "Value (smoothed)",
fill = "Range", color = "Trend"
) +
theme(legend.position = "bottom")2.2.1 Cross-Sectional Geographic Outliers
While the quantile plots summarize the main body of the distribution, they may hide the behavior of individual highly erratic locations. Here, we track how often individual locations jump into the extreme upper tail (above the cross-sectional 95th percentile recorded on that specific date) for each indicator.
Note: This approach compares locations against one another on each date. If the indicator relies on raw counts rather than population-standardized rates or percentages, this table will frequently just identify locations with the largest populations. In those cases, this metric reflects size rather than epidemiological aberration.
2.3 Coverage and Missingness
This section explores the availability of data across time and space. The step chart below tracks the number of geographic locations with valid data on any given date for the candidate indicator, the guiding indicator, and instances where both are available together. The dashed line indicates when both indicators are available simultaneously.
On average, both indicators are simultaneously available in 52.3 locations per day.
Code
ggplot(coverage, aes(x = time_value)) +
geom_area(
aes(
y = candidate_available, color = params$candidate_name,
fill = params$candidate_name
),
linewidth = 0.7, alpha = 0.05
) +
geom_area(
aes(
y = guiding_available, color = params$guiding_name,
fill = params$guiding_name
),
linewidth = 0.7, alpha = 0.05
) +
geom_area(
aes(y = both_available, color = "Both", fill = "Both"),
linewidth = 0.7, alpha = 0.05, linetype = "dashed"
) +
scale_x_date(breaks = breaks_pretty(), label = label_date_short()) +
scale_y_continuous(labels = label_comma(), limits = c(0, NA)) +
scale_colour_manual(
name = "Indicator",
values = dis_pal_list,
breaks = c(
params$guiding_name,
params$candidate_name, "Both"
)
) +
scale_fill_manual(
name = "Indicator",
values = dis_pal_list,
breaks = c(
params$guiding_name,
params$candidate_name, "Both"
)
) +
labs(
title = "Data coverage over time",
subtitle = sprintf("Number of locations with non-missing values per %s", env_time_type),
x = "Date", y = "Number of locations"
) +
guides(
color = guide_legend(override.aes = list(alpha = 1, fill = NA)),
fill = guide_legend(override.aes = list(alpha = 0.2))
)2.3.1 Summary Statistics by Location
This table summarizes the available data across all included locations. Locations with no candidate or guiding information are not considered.
Excluded locations (no candidate data): as (American Samoa)
Missingness Overview (All Locations):
- <5% missing: 52 locations (Candidate) vs 53 locations (Guiding)
- 5-25% missing: 0 locations (Candidate) vs 0 locations (Guiding)
- 25-50% missing: 0 locations (Candidate) vs 1 locations (Guiding)
- 50-99.9% missing: 1 locations (Candidate) vs 0 locations (Guiding)
- 100% missing: 1 locations (Candidate) vs 0 locations (Guiding)
2.4 Noise Assessment
We calculate the Coefficient of Variation (CV) for each location to compare indicator stability across geographic areas.
\[\text{CV}_i = \frac{\text{SD}(X_i)}{|\bar{X}_i|}\]
Code
# Per-location coefficient of variation (SD / |mean|) for both indicators,
# computed on the intersected data.
cv_of <- function(x) sd(x, na.rm = TRUE) / abs(mean(x, na.rm = TRUE))
noise_stats <- df_inter %>%
  arrange(geo_value, time_value) %>%
  group_by(geo_value) %>%
  summarize(
    cand_cv = cv_of(candidate),
    guid_cv = cv_of(guiding),
    .groups = "drop"
  ) %>%
  # Drop locations with undefined CVs (e.g. zero mean or constant series).
  filter(is.finite(cand_cv), is.finite(guid_cv))
# Clip the top 1% of CVs so a few extreme locations don't flatten the plots.
cv_upper <- quantile(c(noise_stats$cand_cv, noise_stats$guid_cv), 0.99, na.rm = TRUE)
noise_stats_plot <- filter(noise_stats, cand_cv <= cv_upper & guid_cv <= cv_upper)
n_excluded <- nrow(noise_stats) - nrow(noise_stats_plot)
# One-row comparison table consumed by the DT summary below.
noise_summary <- tibble(
  Metric = "Coefficient of Variation (CV)",
  `Median (candidate)` = median(noise_stats$cand_cv),
  `Median (guiding)` = median(noise_stats$guid_cv),
  `Median Difference` = median(noise_stats$cand_cv - noise_stats$guid_cv),
  `% Smoother (candidate < guiding)` = mean(noise_stats$cand_cv < noise_stats$guid_cv) * 100
)
DT::datatable(noise_summary,
caption = sprintf(
"Comparative CV summary across locations.%s\nA negative Median Difference and high %% Smoother indicate the candidate is typically less volatile.",
if (n_excluded > 0) sprintf(" (%d extreme outlier(s) excluded from plots.)", n_excluded) else ""
),
rownames = FALSE, options = list(scrollX = TRUE, paging = FALSE, dom = "t")
) %>% DT::formatRound(columns = 2:5, digits = 3)Code
# Reshape for plotting
noise_long <- noise_stats_plot %>%
pivot_longer(-geo_value, names_to = "indicator", values_to = "value") %>%
mutate(
indicator = recode(indicator,
cand_cv = params$candidate_name,
guid_cv = params$guiding_name
)
)
# Histogram comparison
p_hist <- ggplot(noise_long, aes(x = value, fill = indicator)) +
geom_histogram(bins = 30, color = "white", alpha = 0.7, position = "identity") +
facet_wrap(~indicator, ncol = 1) +
scale_x_continuous(labels = label_percent()) +
labs(
title = "CV Distribution by Location",
subtitle = "Lower values indicate a more stable indicator",
x = "Coefficient of Variation (CV)", y = "Count", fill = "Indicator"
) +
theme(legend.position = "top")
# Paired Scatter Plot
p_scatter <- ggplot(noise_stats_plot, aes(x = guid_cv, y = cand_cv)) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey50") +
geom_point(aes(color = cand_cv < guid_cv), alpha = 0.7, size = 2) +
scale_x_continuous(labels = label_percent()) +
scale_y_continuous(labels = label_percent()) +
scale_color_manual(
name = "Result",
values = c("TRUE" = dis_pal_list[2], "FALSE" = dis_pal_list[4]),
labels = c("TRUE" = "Candidate More Stable", "FALSE" = "Guiding More Stable")
) +
labs(
title = "Paired CV Comparison",
subtitle = "Points below the dashed line represent locations where the candidate is more stable.",
x = sprintf("CV: %s (Guiding)", params$guiding_name),
y = sprintf("CV: %s (Candidate)", params$candidate_name)
)
p_hist / p_scatter + plot_layout(heights = c(1.2, 1))2.5 Candidate Versioning
This section examines how published values change over time as new data arrives. Each observation may be revised multiple times after its initial release, and the statistics below summarize both how often and how much those revisions occur. Relative revisions measure the magnitude of a revision relative to the signal’s own scale. A relative revision of 0.1 means the observation shifted by 10% of its own magnitude across revisions, while values near 1 indicate revisions as large as the signal itself.
The `revision_summary` function is under revision and may produce misleading results in certain edge cases. Use the summary statistics below with caution while development is ongoing.
Revision analysis is performed on a sample of 53 out of 53 locations, with versions from 2022-07-31 to 2023-05-13, and time from 2022-07-27 to 2023-03-01.
- Time range: 2022-07-27 to 2023-03-01
- Min lag (days to first version): Min.: 4, 1st Qu.: 4, Median: 4, Mean: 4.3, 3rd Qu.: 4, Max.: 39
- Total number of revisions: 260
- No revisions: 0%
- Quick revisions (<= 3 days): 0%
- Few revisions (<= 3): 0%
- Minor relative revisions (< 0.1): 4%
- Major absolute revisions (> 0.470): 34%
- Days until within 20% of latest: Min.: 4, 1st Qu.: 4, Median: 8, Mean: 16.3, 3rd Qu.: 21, Max.: 73
The time series below visualize the candidate indicator (Doctor Visits: Smoothed Adj CLI) with line colors corresponding to the issue date, meaning the date the data was initially published or subsequently revised. Darker colors denote older versions, while lighter colors represent more recent updates.
If the lines perfectly overlap, it means the data for that location is not subject to revisions. Where you see a vertical spread along a single date on the x-axis, the data has been revised over time.
Code
# Summarize revision behavior per location, restricted to observations that
# were revised at least once. NOTE(review): rev_details is presumably built
# by revision_summary() in a hidden chunk — confirm its column semantics.
rev_by_loc <- rev_details$revision_behavior %>%
filter(n_revisions != 0) %>%
group_by(geo_value) %>%
summarize(
n_versions = n(),
n_rev = mean(n_revisions, na.rm = TRUE),
min_lag = min(min_lag, na.rm = TRUE),
max_lag = max(max_lag, na.rm = TRUE),
spread = mean(spread, na.rm = TRUE),
rel_spread = mean(rel_spread, na.rm = TRUE),
lag_near_latest = mean(lag_near_latest, na.rm = TRUE),
.groups = "drop"
) %>%
# Most heavily revised (relative to scale) locations first.
arrange(desc(rel_spread))
if (nrow(rev_by_loc) > 0) {
cat("The table below summarizes revision behavior per location. Column definitions:\n\n")
cat("- Min Lag. Minimum days between the event date and its first publication.\n")
cat("- Max Lag. Maximum days between the event date and its final revision.\n")
cat("- Avg. Spread. Average absolute difference between the minimum and maximum published values.\n")
cat("- Avg. Rel. Spread. Average relative difference between the minimum and maximum published values (spread divided by the maximum value).\n")
cat(sprintf("- Lag Near Latest. Average days until the value stays within %d%% of the latest published value.\n\n", rev_details$within_latest * 100))
DT::datatable(rev_by_loc,
colnames = c(
"Location", "Versions", "Avg. Revisions", "Min Lag", "Max Lag",
"Avg. Spread", "Avg. Rel. Spread", "Lag Near Latest"
),
caption = "Candidate revision behavior by location",
rownames = FALSE, options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = TRUE, dom = "tp")
) %>% DT::formatRound(columns = 2:8, digits = 2)
}The table below summarizes revision behavior per location. Column definitions:
- Min Lag. Minimum days between the event date and its first publication.
- Max Lag. Maximum days between the event date and its final revision.
- Avg. Spread. Average absolute difference between the minimum and maximum published values.
- Avg. Rel. Spread. Average relative difference between the minimum and maximum published values (spread divided by the maximum value).
- Lag Near Latest. Average days until the value stays within 20% of the latest published value.
Code
if (nrow(rev_by_loc) > 0 && n_inter_locations > 1) {
# Pick a few locations with the most revisions
rev_geos <- head(rev_by_loc$geo_value, params$max_locations_plot)
# Limit number of locations for autoplot to avoid overcrowding
# Limit to the most recent year for clarity (matches years(1) below and
# the plot subtitle; a previous comment incorrectly said 2 years).
# NOTE(review): `archive` is assumed to be defined in an earlier (hidden)
# chunk — confirm it is the same archive as candidate_archive.
archive_sample <- archive %>%
filter(geo_value %in% rev_geos) %>%
filter(time_value >= max(time_value) - years(1))
autoplot(archive_sample, .versions = "month") +
scale_colour_viridis_c(
option = "plasma",
begin = 0, end = 0.8,
labels = scales::label_date(),
trans = "date"
) +
facet_wrap(~geo_value, ncol = 2, scales = "free_y") +
guides(colour = guide_colorbar(title = "Issue Date", barwidth = 25, barheight = 0.5)) +
labs(
title = "Candidate Signal Revisions",
subtitle = "Monthly version snapshots for the most recent year of data",
x = "Observation Date",
y = "Value"
) +
theme(legend.position = "bottom", strip.text = element_text(face = "bold"))
}2.6 Signal sanity checks
This section is by its nature more difficult than others to do systematically. Things to look for:
- If it’s a percentile, is it between 0 and 100?
- If it’s a rate, is it roughly proportional to population? (requires external population data — inspect manually)
- Are the actual values reasonable? E.g. if it’s a percentage, does it ever approach 100 outside of literal pandemics?
- If it’s for a seasonal disease, are rates highest in the winter and lowest in the summer?
- Are geo-aggregated coarser levels plausible aggregations? (checked below for state-level indicators vs HHS/nation)
The histogram and table below show the distribution of all observed values and the value range per location for the candidate indicator, to assess whether the range is plausible given the indicator’s expected scale.
Code
ggplot(df_inter, aes(x = candidate)) +
geom_histogram(bins = 40, alpha = 0.8, color = "white") +
scale_x_continuous(labels = label_comma()) +
labs(
title = sprintf("Value distribution: %s", params$candidate_name),
subtitle = sprintf(
"%s observations across %d locations",
label_comma()(nrow(df_inter)), n_inter_locations
),
x = "Value", y = "Count"
)The plot below shows the seasonal pattern of the indicator, with each panel representing one epidemiological year (June–May). The line traces the monthly median across all locations, the inner band covers the interquartile range, and the outer band spans the full observed range.
Code
# Epidemiological-year month labels: season month 1 = June ... 12 = May.
season_month_labels <- c("Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr", "May")
# Index each observation by epi season (June-May) and position within it,
# then summarize the candidate's distribution per (season, month).
df_seasonal <- df_inter %>%
mutate(
yr = as.integer(format(time_value, "%Y")),
month = as.integer(format(time_value, "%m")),
# Months June onward belong to the season starting that year.
season_start = if_else(month >= 6L, yr, yr - 1L),
season = paste0(season_start, "-", substr(as.character(season_start + 1L), 3L, 4L)),
# Re-index months so June = 1 ... May = 12.
season_month = if_else(month >= 6L, month - 5L, month + 7L)
) %>%
group_by(season, season_month) %>%
summarize(
median_val = median(candidate, na.rm = TRUE),
min_val = min(candidate, na.rm = TRUE),
max_val = max(candidate, na.rm = TRUE),
p25 = quantile(candidate, 0.25, na.rm = TRUE),
p75 = quantile(candidate, 0.75, na.rm = TRUE),
.groups = "drop"
) %>%
# Keep only seasons with at least 3 observed months; the magrittr `.` in
# count(., season) refers to the summarized data before this filter.
filter(season %in% (count(., season) %>% filter(n >= 3) %>% pull(season)))
ggplot(df_seasonal, aes(x = season_month, color = season, fill = season, group = season)) +
geom_ribbon(aes(ymin = min_val, ymax = max_val), alpha = 0.10, linewidth = 0.2) +
geom_ribbon(aes(ymin = p25, ymax = p75), alpha = 0.30, linewidth = 0.3) +
geom_line(aes(y = median_val), linewidth = 0.8) +
geom_point(aes(y = median_val), size = 1.5) +
facet_wrap(~season, ncol = 2, scales = "free_y") +
scale_x_continuous(breaks = 1:12, labels = season_month_labels) +
scale_y_continuous(labels = label_comma()) +
labs(
title = sprintf("Seasonal pattern: %s", params$candidate_name),
subtitle = "Median by month across all locations — inner band: IQR, outer band: full range",
x = "Month", y = "Median value"
) +
theme(legend.position = "none")No geo-aggregation violations found: all HHS/nation values are within the [min, max] of constituent state values for Doctor Visits: Smoothed Adj CLI.
The table below flags locations whose seasonal peak is at least twice the typical (median) peak across all locations in the same season.
3 Correlation Analysis
All correlation metrics are computed on the intersection of the candidate and guiding indicators (rows where both values are non-missing) using the latest available revisions unless otherwise indicated.
3.1 Correlation over Time
Here we calculate the Spearman rank correlation between the candidate and guiding indicators across all available locations for each individual date. We also overlay a LOESS smoothed trend line to help capture broader patterns over time.
Values near one tell us there is a strong positive correlation across areas on that specific day. When you see dips or sustained drops in the line, it suggests that the candidate decoupled from the guiding indicator during that period.
Two additional panels are included below the correlation plot to support interpretation. The middle panel shows smoothed quantile trends for both indicators. The bottom panel shows the number of locations where both indicators were available on each date.
Code
if (n_inter_locations > 1) {
cor_by_time <- epi_cor(df_inter, candidate, guiding,
cor_by = time_value, method = "spearman"
)
mean_rho <- mean(cor_by_time$cor, na.rm = TRUE)
p_cor <- ggplot(cor_by_time, aes(x = time_value)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
geom_hline(
yintercept = c(0.3, 0.4, 0.6, 0.7, 0.8, 0.9),
linetype = "dotted", color = "gray85", linewidth = 0.3
) +
geom_line(aes(y = cor, color = "Correlation"), linewidth = 0.8) +
geom_smooth(aes(y = cor, color = "LOESS smoothed trend"),
method = "loess", span = 0.2, se = TRUE,
fill = dis_pal_list[2], alpha = 0.15
) +
scale_x_date(
limits = range(cor_by_time$time_value, na.rm = TRUE),
breaks = breaks_pretty(n = 6), label = label_date_short()
) +
scale_y_continuous(
name = expression(paste("Spearman ", rho)),
limits = c(floor(min(cor_by_time$cor, na.rm = TRUE) * 10) / 10, 1)
) +
scale_color_manual(
name = "Metric",
values = c(
"Correlation" = dis_pal_list[1],
"LOESS smoothed trend" = dis_pal_list[2]
)
) +
labs(
title = "Correlation over time",
subtitle = bquote(Spearman ~ rho ~ "across" ~ .(n_inter_locations) ~ "locations | Mean =" ~ .(sprintf("%.3f", mean_rho))),
x = "Date", y = expression(paste("Spearman ", rho, " across locations"))
) +
theme(
legend.position = "none",
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
}Code
p_cor <- ggplot(cor_by_time, aes(x = time_value)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
geom_hline(
yintercept = c(0.3, 0.4, 0.6, 0.7, 0.8, 0.9),
linetype = "dotted", color = "gray85", linewidth = 0.3
) +
geom_line(aes(y = cor, color = "Correlation"), linewidth = 0.8) +
geom_smooth(aes(y = cor, color = "LOESS smoothed trend"),
method = "loess", span = 0.2, se = TRUE,
fill = dis_pal_list[2], alpha = 0.15
) +
scale_x_date(
limits = range(cor_by_time$time_value, na.rm = TRUE),
breaks = breaks_pretty(n = 6), label = label_date_short()
) +
scale_y_continuous(
name = expression(paste("Spearman ", rho)),
limits = c(floor(min(cor_by_time$cor, na.rm = TRUE) * 10) / 10, 1),
breaks = c(0, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0)
) +
scale_color_manual(
name = "Metric",
values = c(
"Correlation" = dis_pal_list[1],
"LOESS smoothed trend" = dis_pal_list[2]
)
) +
labs(
title = "Diagnostic Comparison: Correlation, Quantiles, and Coverage",
subtitle = bquote("Correlation over time | Mean" ~ rho == .(sprintf("%.3f", mean_rho)) ~ "(latest revisions)"),
x = NULL
) +
theme(
legend.position = "none",
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)Code
p_cor / p_quant / p_cov + plot_layout(heights = c(1.4, 0.8, 0.8))3.2 Distribution of Temporal Correlations
This histogram shows the distribution of the Spearman correlation computed within each geographic location over the full time period. The dashed line marks the median.
Code
# Spearman correlation between the indicators within each location over the
# full period, plus the median across locations (dashed line in the plot).
cor_by_geo <- epi_cor(df_inter, candidate, guiding,
cor_by = geo_value, method = "spearman"
)
med_rho <- median(cor_by_geo$cor, na.rm = TRUE)
if (n_inter_locations > 1) {
p_rho <- ggplot(cor_by_geo, aes(x = cor)) +
geom_histogram(bins = 20, alpha = 0.6, color = "white") +
geom_vline(
xintercept = med_rho, linetype = "dashed",
color = dis_pal_list[2], linewidth = 1
) +
scale_x_continuous(breaks = breaks_pretty()) +
labs(
subtitle = bquote(Median ~ Spearman ~ (rho) == .(sprintf("%.3f", med_rho)) ~ "across" ~ .(nrow(cor_by_geo)) ~ "locations"),
x = expression(Spearman ~ rho), y = "Count"
)
p_rho + labs(title = "Distribution of intra-location temporal correlations")
} else {
cat(sprintf("For the single location %s, the temporal Spearman correlation is %.3f\n", unique(df_inter$geo_value)[1], med_rho))
}3.3 Lagged Correlations
To explore lead and lag relationships, we shift the candidate indicator by various intervals (-14, -7, 0, 7, 14 days). Specifically, a positive lag (e.g., +7 days) compares the candidate indicator from 7 days ago against the guiding indicator today. This allows us to see if the candidate acts as a leading indicator.
3.3.1 Correlation over time, at different days
This plot expands on the basic correlation by showing multiple smoothed trend lines, each representing a different lag applied to the candidate. If the curve for a positive lag is consistently higher than the zero-lag curve, it is a strong indication that the candidate indicator provides a lead over the guiding indicator.
Code
# For each lag, shift the candidate back by `lg` steps (dt1 = -lg) and
# compute the cross-sectional Spearman correlation on every date, so a
# positive lag compares past candidate values with today's guiding values.
cor_lagged_time <- map(lag_values, function(lg) {
  epi_cor(df_inter, candidate, guiding,
    cor_by = time_value, dt1 = -lg, method = "spearman"
  ) %>%
    mutate(lag = lg)
}) %>%
  list_rbind() %>%
  mutate(lag = as.factor(lag))
if (n_inter_locations > 1) {
ggplot(cor_lagged_time, aes(x = time_value, y = cor, color = lag)) +
geom_line(alpha = 0.4) +
geom_smooth(method = "loess", span = 0.2, se = FALSE, linewidth = 1.2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
scale_x_date(breaks = breaks_pretty(), label = label_date_short()) +
scale_y_continuous(limits = c(-1, 1), breaks = breaks_pretty()) +
labs(
title = sprintf("Correlation over time at different %s", time_unit),
subtitle = "Transparent line: Raw Correlation, Solid line: LOESS smoothed trend",
x = "Date", y = expression(paste("Spearman ", rho)),
color = lag_unit_label
) +
guides(color = guide_legend(nrow = 1, override.aes = list(linewidth = 1.5, alpha = 1)))
}3.3.2 Real-time lagged correlations
Same as above, but the candidate (Doctor Visits: Smoothed Adj CLI) values are drawn from epix_as_of(candidate_archive, v) at each version date v, so the correlation reflects what was observable in real time rather than using the final revised values.
Code
# All publication dates present in the candidate archive, in time order.
versions <- sort(unique(candidate_archive$DT$version))
# For each version, take the as-of snapshot and compute cross-sectional
# (across-location) Spearman correlations at each lag, using only data
# that was observable at that version date.
cor_lagged_realtime <- map_dfr(versions, function(vv) {
snap <- epix_as_of(candidate_archive, vv)
if (env_time_type == "week") {
# Weekly guiding data: align snapshot days to the guiding cadence by
# rounding to Sunday and averaging within each (geo, week) cell.
# NOTE(review): round_to_sunday is defined elsewhere in the notebook —
# presumably rounds to the enclosing epiweek's Sunday; confirm direction.
snap <- snap %>%
mutate(time_value = round_to_sunday(time_value)) %>%
group_by(geo_value, time_value) %>%
summarize(value = mean(value, na.rm = TRUE), .groups = "drop")
}
# Restrict to snapshot dates that also exist in the guiding series.
valid_tvs <- snap$time_value[snap$time_value %in% df_guiding$time_value]
if (length(valid_tvs) == 0) {
# No overlap at this version: emit NA rows so every (version, lag) pair
# still appears in the result.
return(tibble(version = vv, lag = lag_values, cor = NA_real_))
}
# Most recent date observable at this version — the "today" of the comparison.
latest_tv <- max(valid_tvs, na.rm = TRUE)
map_dfr(lag_values, function(lg) {
# Candidate reference date: `lg` time units before latest_tv, using the
# calendar arithmetic appropriate to the configured time type
# (default branch: plain day subtraction).
target_tv <- switch(env_time_type,
week = latest_tv - lubridate::weeks(lg),
month = latest_tv - lubridate::months(lg),
year = latest_tv - lubridate::years(lg),
latest_tv - lg
)
df_cand <- snap %>%
filter(time_value == target_tv) %>%
rename(candidate = value)
df_guid <- df_guiding %>%
filter(time_value == latest_tv) %>%
select(geo_value, guiding = value)
# One row per location: candidate at target_tv vs. guiding at latest_tv.
df_joined <- inner_join(df_cand, df_guid, by = "geo_value")
# Fewer than 3 locations makes a rank correlation meaningless; record NA.
if (n_distinct(df_joined$geo_value) < 3) {
return(tibble(version = vv, lag = lg, cor = NA_real_))
}
tibble(version = vv, lag = lg, cor = cor(df_joined$candidate, df_joined$guiding,
method = "spearman", use = "complete.obs"
))
})
}) %>%
mutate(lag = as.factor(lag))
# Raw and LOESS-smoothed real-time correlation per lag, indexed by version date.
ggplot(cor_lagged_realtime, aes(x = version, y = cor, color = lag)) +
geom_line(alpha = 0.4) +
geom_smooth(method = "loess", span = 0.2, se = FALSE, linewidth = 1.2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
scale_x_date(breaks = breaks_pretty(), label = label_date_short()) +
scale_y_continuous(limits = c(-1, 1), breaks = breaks_pretty()) +
labs(
title = sprintf("Real-time correlation over time at different %s", time_unit),
subtitle = "Candidate values as-published at each version date",
x = "Version date", y = expression(paste("Spearman ", rho)),
color = lag_unit_label
) +
guides(color = guide_legend(nrow = 1, override.aes = list(linewidth = 1.5, alpha = 1)))3.3.3 Per-location temporal correlation distributions, at different lags
Here we show density distributions of the temporal Spearman correlations for each location, calculated across several different lead and lag shifts.
The goal is to see which density curve has the highest correlation values. The lag associated with that curve is the one that best aligns the temporal dynamics of the two indicators for the majority of areas. This complements the previous plot by confirming whether the optimal lag remains consistent over time within individual locations.
Code
# Same lag sweep as above, but correlations are computed within each location
# over time (cor_by = geo_value), giving one rho per (location, lag).
cor_lagged_geo <- map(lag_values, function(lg) {
  epi_cor(df_inter, candidate, guiding,
    cor_by = geo_value, dt1 = -lg, method = "spearman"
  ) %>%
    mutate(lag = lg)
}) %>%
  list_rbind() %>%
  mutate(lag = as.factor(lag)) # one density curve per lag
if (n_inter_locations > 1) {
  # Overlaid density curves: the lag whose curve sits furthest right aligns
  # the two indicators best for the majority of locations.
  ggplot(cor_lagged_geo, aes(x = cor)) +
    geom_density(aes(fill = lag, color = lag), alpha = 0.15) +
    xlim(-1, 1) +
    labs(
      title = sprintf("Temporal correlations at different %s", time_unit),
      subtitle = "Positive lag = candidate leads the guiding indicator",
      x = expression(paste("Spearman ", rho)), y = "Density",
      fill = lag_unit_label, color = lag_unit_label
    ) +
    guides(fill = guide_legend(override.aes = list(alpha = 0.35)))
}3.4 Systematic Lag Analysis
In this section, we sweep through a wider range of lags (from -21 to 21 days) to pinpoint the optimal lead time.
The plot displays summary statistics like the mean, median, and interquartile range of the temporal correlations across all locations, calculated for every possible lag in the window. A vertical line marks the optimal lag, which has the highest mean correlation.
If the optimal lag is positive, the candidate’s past values are most predictive of the guiding indicator’s current values, which makes the candidate valuable for forecasting.
Code
# Sweep a wider, symmetric window of lags (lags_sweep is configured upstream)
# and collect per-location temporal correlations at each lag.
lags <- lags_sweep
cor_sweep <- map(lags, function(lg) {
  epi_cor(df_inter, candidate, guiding,
    cor_by = geo_value, dt1 = -lg, method = "spearman"
  ) %>%
    mutate(lag = lg)
}) %>%
  list_rbind()
# Summarize the across-location distribution of rho at each lag:
# mean, median, and interquartile range.
lag_summary <- cor_sweep %>%
  group_by(lag) %>%
  summarize(
    mean_cor = mean(cor, na.rm = TRUE),
    median_cor = median(cor, na.rm = TRUE),
    q25 = quantile(cor, 0.25, na.rm = TRUE),
    q75 = quantile(cor, 0.75, na.rm = TRUE),
    .groups = "drop"
  )
# Lag maximizing the mean correlation (first one wins on ties).
optimal_lag <- with(lag_summary, lag[which.max(mean_cor)])
if (n_inter_locations > 1) {
  # IQR ribbon plus mean/median curves, with the optimal lag annotated.
  ggplot(lag_summary, aes(x = lag)) +
    geom_ribbon(aes(ymin = q25, ymax = q75, fill = "IQR"), alpha = 0.2) +
    geom_line(aes(y = mean_cor, color = "Mean"), linewidth = 1.2) +
    geom_point(aes(y = mean_cor, color = "Mean"), size = 2, fill = "transparent") +
    geom_line(aes(y = median_cor, color = "Median"),
      linetype = "dashed", linewidth = 0.8
    ) +
    geom_vline(xintercept = optimal_lag, linetype = "dotted", color = "black") +
    annotate("text",
      x = optimal_lag, y = -1, label = sprintf("Optimal: %d", optimal_lag),
      hjust = -0.1, vjust = -0.5, size = 4
    ) +
    scale_x_continuous(breaks = breaks_pretty()) +
    scale_y_continuous(limits = c(-1, 1), breaks = breaks_pretty()) +
    labs(
      title = sprintf("Summary metrics vs. %s", lag_unit_label),
      x = lag_unit_label, y = expression(paste("Spearman ", rho)),
      color = "", fill = ""
    )
}3.5 Choropleth Map at Optimal Lag
This geographic map colors each state according to its temporal Spearman correlation with the guiding indicator, specifically evaluated at the optimal lag we found in the previous section.
Code
# Per-location temporal correlation, re-computed at the optimal lag found in
# the systematic sweep above.
cor_optimal <- epi_cor(df_inter, candidate, guiding,
cor_by = geo_value, dt1 = -optimal_lag,
method = "spearman"
)
if (n_inter_locations > 1 && params$geo_type %in% c("state", "county")) {
# Basic check to see if we have valid-looking state/county codes before trying geographic match
# (2-char abbreviations for states, 5-digit FIPS for counties).
looks_like_state <- params$geo_type == "state" && all(nchar(cor_optimal$geo_value) == 2)
looks_like_county <- params$geo_type == "county" && all(nchar(cor_optimal$geo_value) == 5)
if (looks_like_state || looks_like_county) {
# Dress the correlation table up as a covidcast_signal object so the
# covidcast choropleth plot method can render it. The time_value/issue/
# data_source/signal columns are placeholders required by that class.
cor_choro <- cor_optimal %>%
rename(value = cor) %>%
mutate(
time_value = as.Date(params$start_day),
issue = as.Date(params$start_day),
data_source = "evaluation",
signal = "correlation"
)
# NOTE(review): manually setting metadata and class bypasses the
# covidcast constructors — fragile if the package's plot method changes.
attributes(cor_choro)$metadata$geo_type <- params$geo_type
class(cor_choro) <- c("covidcast_signal", "data.frame")
# Plot object evaluating as the last statement in the conditional
# Diverging palette centered at white over the full [-1, 1] range.
print(plot(cor_choro,
range = c(-1, 1), choro_col = c(dis_pal_list[3], "white", dis_pal_list[2]),
title = sprintf("Per-location correlation at optimal lag (%d %s)", optimal_lag, time_unit)
)) # For some reason this plot is only showing up when using print()
# Summary table of correlations at optimal lag
cor_top <- cor_optimal %>%
mutate(geo_name = geo_to_name(geo_value)) %>%
arrange(desc(cor)) %>%
select(geo_value, geo_name, cor)
DT::datatable(cor_top,
colnames = c("Geo-level", "Name", "Spearman Correlation"),
caption = sprintf("Strongest local correlations at %d %s lag", optimal_lag, time_unit),
rownames = FALSE, options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = TRUE, dom = "tp")
) %>% DT::formatRound(columns = 3, digits = 3)
}
}4 Summary
Code
# Collect the headline metrics computed in earlier sections into one table.
metric_names <- c(
  "Mean geographic rank correlation (ρ)",
  sprintf("Optimal lead time (%s)", time_unit),
  "Mean ρ at optimal lag",
  "Median per-location ρ (lag 0)",
  "Median per-location ρ (optimal lag)",
  "Locations with |ρ| > 0.5 (optimal lag)"
)
# Count of locations whose correlation magnitude at the optimal lag
# clears the 0.5 threshold.
n_strong <- sum(abs(cor_optimal$cor) > 0.5, na.rm = TRUE)
metric_values <- c(
  sprintf("%.3f", mean_rho),
  as.character(optimal_lag),
  sprintf("%.3f", max(lag_summary$mean_cor)),
  sprintf("%.3f", med_rho),
  sprintf("%.3f", median(cor_optimal$cor, na.rm = TRUE)),
  sprintf(
    "%d / %d (%.0f%%)",
    n_strong,
    nrow(cor_optimal),
    100 * mean(abs(cor_optimal$cor) > 0.5, na.rm = TRUE)
  )
)
findings <- tibble(Metric = metric_names, Value = metric_values)
DT::datatable(findings, caption = "Key evaluation metrics", rownames = FALSE, options = list(scrollX = TRUE, paging = FALSE, dom = "t"))