Skip to content

Commit f9f8269

Browse files
committed
wip: dashboard
1 parent 206dfa8 commit f9f8269

File tree

8 files changed

+460
-26
lines changed

8 files changed

+460
-26
lines changed

.gitignore

+3-1
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,6 @@ aux_data
1515
.netlify
1616
reports/*.html
1717
reports/report.md
18-
cache
18+
cache
19+
data
20+
.vscode

R/forecasters/formatters.R

+3-16
Original file line numberDiff line numberDiff line change
@@ -59,28 +59,15 @@ format_covidhub <- function(pred, true_forecast_date, target_end_date, quantile_
5959
format_flusight <- function(pred, disease = c("flu", "covid")) {
6060
disease <- arg_match(disease)
6161
pred %>%
62+
add_state_info(geo_value_col = "geo_value", old_geo_code = "state_id", new_geo_code = "state_code") %>%
6263
mutate(
6364
reference_date = get_forecast_reference_date(forecast_date),
6465
target = glue::glue("wk inc {disease} hosp"),
6566
horizon = as.integer(floor((target_end_date - reference_date) / 7)),
6667
output_type = "quantile",
6768
output_type_id = quantile,
68-
value = value
69+
value = value,
70+
location = state_code
6971
) %>%
70-
left_join(get_population_data() %>% select(state_id, state_code), by = c("geo_value" = "state_id")) %>%
71-
mutate(location = state_code) %>%
7272
select(reference_date, target, horizon, target_end_date, location, output_type, output_type_id, value)
7373
}
74-
75-
#' The quantile levels used by the covidhub repository
76-
#'
77-
#' @param type either standard or inc_case, with inc_case being a small subset of the standard
78-
#'
79-
#' @export
80-
covidhub_probs <- function(type = c("standard", "inc_case")) {
81-
type <- match.arg(type)
82-
switch(type,
83-
standard = c(0.01, 0.025, seq(0.05, 0.95, by = 0.05), 0.975, 0.99),
84-
inc_case = c(0.025, 0.100, 0.250, 0.500, 0.750, 0.900, 0.975)
85-
) |> round(digits = 3)
86-
}

R/utils.R

+54-9
Original file line numberDiff line numberDiff line change
@@ -212,15 +212,47 @@ exclude_geos <- function(geo_forecasters_weights) {
212212

213213
`%nin%` <- function(x, y) !(x %in% y)
214214

215-
get_population_data <- function() {
216-
readr::read_csv("https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_pop.csv", show_col_types = FALSE) %>%
217-
rename(population = pop) %>%
218-
# Add a row for the United States
219-
bind_rows(
220-
(.) %>% summarize(state_id = "us", population = sum(population), state_name = "United States", state_code = "US")
221-
) %>%
222-
# Duplicate the last row, but with state_id = "usa".
223-
bind_rows((.) %>% filter(state_id == "us") %>% mutate(state_id = "usa"))
215+
get_population_data <- function(national_code = c("both", "us", "usa")) {
216+
national_code <- rlang::arg_match(national_code)
217+
218+
state_crosswalk <- readr::read_csv("https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_pop.csv", show_col_types = FALSE) %>%
219+
rename(population = pop)
220+
us_crosswalk <- state_crosswalk %>%
221+
summarize(state_id = "us", population = sum(population), state_name = "United States", state_code = "US")
222+
usa_crosswalk <- state_crosswalk %>%
223+
filter(state_id == "us") %>%
224+
mutate(state_id = "usa")
225+
226+
if (national_code == "us") {
227+
return(bind_rows(state_crosswalk, us_crosswalk))
228+
}
229+
if (national_code == "usa") {
230+
return(bind_rows(state_crosswalk, usa_crosswalk))
231+
}
232+
if (national_code == "both") {
233+
return(bind_rows(state_crosswalk, us_crosswalk, usa_crosswalk))
234+
}
235+
}
236+
237+
#' Add a state info column to a dataframe
238+
#'
239+
#' @param df the dataframe to add the state info column to
240+
#' @param geo_value_col the name of the column in df that contains the geo_value
241+
#' @param info_col the name of the column in get_population_data() that contains the state info
242+
#' the options are: state_id, state_code, state_name, population.
243+
#'
244+
#' @examples
245+
#' # To add state_code to a dataframe with a geo_value column of state_ids (e.g. "ca")
246+
#' df %>% add_state_info(geo_value_col = "geo_value", old_geo_code = "state_id", new_geo_code = "state_code")
247+
#'
248+
#' # To add state_id to a dataframe with a geo_value column of state_codes (e.g. "01")
249+
#' df %>% add_state_info(geo_value_col = "location", old_geo_code = "state_code", new_geo_code = "state_id")
250+
add_state_info <- function(df, geo_value_col, old_geo_code, new_geo_code) {
251+
original_colnames <- colnames(df)
252+
253+
df %>%
254+
left_join(get_population_data(), by = join_by(!!geo_value_col == !!old_geo_code)) %>%
255+
select(original_colnames, !!new_geo_code)
224256
}
225257

226258
filter_forecast_geos <- function(forecasts, truth_data) {
@@ -278,6 +310,19 @@ write_submission_file <- function(pred, forecast_reference_date, submission_dire
278310
readr::write_csv(pred, file_path)
279311
}
280312

313+
#' The quantile levels used by the covidhub repository
314+
#'
315+
#' @param type either standard or inc_case, with inc_case being a small subset of the standard
316+
#'
317+
#' @export
318+
covidhub_probs <- function(type = c("standard", "inc_case")) {
319+
type <- match.arg(type)
320+
switch(type,
321+
standard = c(0.01, 0.025, seq(0.05, 0.95, by = 0.05), 0.975, 0.99),
322+
inc_case = c(0.025, 0.100, 0.250, 0.500, 0.750, 0.900, 0.975)
323+
) |> round(digits = 3)
324+
}
325+
281326
#' Utility to get the reference date for a given date. This is the last day of
282327
#' the epiweek that the date falls in.
283328
get_forecast_reference_date <- function(date) {

_targets.yaml

+4
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ covid_hosp_prod:
1919
script: scripts/covid_hosp_prod.R
2020
store: covid_hosp_prod
2121
use_crew: no
22+
dashboard-proj:
23+
script: scripts/dashboard-proj.R
24+
store: dashboard-proj
25+
use_crew: no

dashboard-proj/.gitignore

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# CAUTION: do not edit this file by hand!
2+
# _targets/objects/ may have large data files,
3+
# and _targets/meta/process may have sensitive information.
4+
# It is good pratice to either commit nothing from _targets/,
5+
# or if your data is not too sensitive,
6+
# commit only _targets/meta/meta.
7+
*
8+
!.gitignore
9+
!meta
10+
meta/*

scripts/dashboard-proj.R

+219
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
library(tidyverse)
2+
library(httr)
3+
library(lubridate)
4+
library(progress)
5+
library(targets)
6+
source(here::here("R", "load_all.R"))
7+
8+
options(readr.show_progress = FALSE)
9+
options(readr.show_col_types = FALSE)
10+
11+
insufficient_data_geos <- c("as", "mp", "vi", "gu")
12+
13+
# Configuration
14+
config <- list(
15+
base_url = "https://raw.githubusercontent.com/cdcepi/FluSight-forecast-hub/main/model-output",
16+
forecasters = c("CMU-TimeSeries", "FluSight-baseline", "FluSight-ensemble", "FluSight-base_seasonal", "UMass-flusion"),
17+
local_storage = "data/forecasts",
18+
tracking_file = "data/download_tracking.csv"
19+
)
20+
21+
# Function to ensure directory structure exists
22+
setup_directories <- function(base_dir) {
23+
dir.create(file.path(base_dir), recursive = TRUE, showWarnings = FALSE)
24+
for (forecaster in config$forecasters) {
25+
dir.create(file.path(base_dir, forecaster), recursive = TRUE, showWarnings = FALSE)
26+
}
27+
}
28+
29+
# Function to load tracking data
30+
load_tracking_data <- function() {
31+
if (file.exists(config$tracking_file)) {
32+
read_csv(config$tracking_file)
33+
} else {
34+
tibble(
35+
forecaster = character(),
36+
filename = character(),
37+
download_date = character(),
38+
status = character()
39+
)
40+
}
41+
}
42+
43+
# Function to generate possible filenames for a date range
44+
generate_filenames <- function(start_date, end_date, forecaster) {
45+
dates <- seq(as_date(start_date), as_date(end_date), by = "week")
46+
filenames <- paste0(
47+
format(dates, "%Y-%m-%d"),
48+
"-",
49+
forecaster,
50+
".csv"
51+
)
52+
return(filenames)
53+
}
54+
55+
# Function to check if file exists on GitHub
56+
check_github_file <- function(forecaster, filename) {
57+
url <- paste0(config$base_url, "/", forecaster, "/", filename)
58+
response <- GET(url)
59+
return(status_code(response) == 200)
60+
}
61+
62+
# Function to download a single file
63+
download_forecast_file <- function(forecaster, filename) {
64+
url <- paste0(config$base_url, "/", forecaster, "/", filename)
65+
local_path <- file.path(config$local_storage, forecaster, filename)
66+
67+
tryCatch(
68+
{
69+
download.file(url, local_path, mode = "wb", quiet = TRUE)
70+
return("success")
71+
},
72+
error = function(e) {
73+
return("failed")
74+
}
75+
)
76+
}
77+
78+
# Main function to update forecast files
79+
update_forecast_files <- function(days_back = 30) {
80+
# Setup
81+
setup_directories(config$local_storage)
82+
tracking_data <- load_tracking_data()
83+
84+
# Generate date range
85+
end_date <- Sys.Date()
86+
start_date <- get_forecast_reference_date(end_date - days_back)
87+
88+
# Process each forecaster
89+
new_tracking_records <- list()
90+
91+
pb_forecasters <- progress_bar$new(
92+
format = "Downloading forecasts from :forecaster [:bar] :percent :eta",
93+
total = length(config$forecasters),
94+
clear = FALSE,
95+
width = 60
96+
)
97+
98+
for (forecaster in config$forecasters) {
99+
pb_forecasters$tick(tokens = list(forecaster = forecaster))
100+
101+
# Get potential filenames
102+
filenames <- generate_filenames(start_date, end_date, forecaster)
103+
104+
# Filter out already downloaded files
105+
existing_files <- tracking_data %>%
106+
filter(forecaster == !!forecaster, status == "success") %>%
107+
pull(filename)
108+
109+
new_files <- setdiff(filenames, existing_files)
110+
111+
if (length(new_files) > 0) {
112+
# Create nested progress bar for files
113+
pb_files <- progress_bar$new(
114+
format = " Downloading files [:bar] :current/:total :filename",
115+
total = length(new_files)
116+
)
117+
118+
for (filename in new_files) {
119+
pb_files$tick(tokens = list(filename = filename))
120+
121+
if (check_github_file(forecaster, filename)) {
122+
status <- download_forecast_file(forecaster, filename)
123+
124+
new_tracking_records[[length(new_tracking_records) + 1]] <- tibble(
125+
forecaster = forecaster,
126+
filename = filename,
127+
download_date = as.character(Sys.time()),
128+
status = status
129+
)
130+
}
131+
}
132+
}
133+
}
134+
135+
# Update tracking data
136+
if (length(new_tracking_records) > 0) {
137+
new_tracking_data <- bind_rows(new_tracking_records)
138+
tracking_data <- bind_rows(tracking_data, new_tracking_data)
139+
write_csv(tracking_data, config$tracking_file)
140+
}
141+
142+
return(tracking_data)
143+
}
144+
145+
# Function to read all forecast data
146+
read_all_forecasts <- function() {
147+
tracking_data <- read_csv(config$tracking_file)
148+
149+
successful_downloads <- tracking_data %>%
150+
filter(status == "success")
151+
152+
forecast_data <- map(1:nrow(successful_downloads), function(i) {
153+
row <- successful_downloads[i, ]
154+
path <- file.path(config$local_storage, row$forecaster, row$filename)
155+
if (file.exists(path)) {
156+
read_csv(path, col_types = list(
157+
reference_date = col_date(format = "%Y-%m-%d"),
158+
target_end_date = col_date(format = "%Y-%m-%d"),
159+
target = col_character(),
160+
location = col_character(),
161+
horizon = col_integer(),
162+
output_type = col_character(),
163+
output_type_id = col_character(),
164+
value = col_double(),
165+
forecaster = col_character(),
166+
forecast_date = col_date(format = "%Y-%m-%d")
167+
)) %>%
168+
add_state_info(geo_value_col = "location", old_geo_code = "state_code", new_geo_code = "state_id") %>%
169+
mutate(
170+
forecaster = row$forecaster,
171+
forecast_date = str_extract(row$filename, "\\d{4}-\\d{2}-\\d{2}"),
172+
geo_value = state_id
173+
)
174+
}
175+
})
176+
177+
return(bind_rows(forecast_data))
178+
}
179+
180+
get_latest_data <- function() {
181+
update_forecast_files(days_back = 120)
182+
read_all_forecasts()
183+
}
184+
185+
rlang::list2(
186+
tar_target(
187+
nhsn_latest_data,
188+
command = {
189+
if (wday(Sys.Date()) < 6 & wday(Sys.Date()) > 3) {
190+
# download from the preliminary data source from Wednesday to Friday
191+
most_recent_result <- readr::read_csv("https://data.cdc.gov/resource/mpgq-jmmr.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm")
192+
} else {
193+
most_recent_result <- readr::read_csv("https://data.cdc.gov/resource/ua7e-t2fy.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm")
194+
}
195+
most_recent_result %>%
196+
process_nhsn_data() %>%
197+
filter(disease == "nhsn_flu") %>%
198+
select(-disease) %>%
199+
filter(geo_value %nin% insufficient_data_geos) %>%
200+
mutate(
201+
source = "nhsn",
202+
geo_value = ifelse(geo_value == "usa", "us", geo_value),
203+
time_value = time_value - 3
204+
) %>%
205+
filter(version == max(version)) %>%
206+
select(-version) %>%
207+
data_substitutions(disease = "flu") %>%
208+
as_epi_df(other_keys = "source", as_of = Sys.Date())
209+
}
210+
),
211+
tar_target(
212+
name = nhsn_archive_data,
213+
command = {
214+
create_nhsn_data_archive(disease = "nhsn_flu")
215+
}
216+
),
217+
tar_target(download_forecasts, update_forecast_files(days_back = 120)),
218+
tar_target(all_forecasts, read_all_forecasts())
219+
)

0 commit comments

Comments
 (0)