|
| 1 | +library(tidyverse) |
| 2 | +library(httr) |
| 3 | +library(lubridate) |
| 4 | +library(progress) |
| 5 | +library(targets) |
| 6 | +source(here::here("R", "load_all.R")) |
| 7 | + |
# Silence readr's progress bars and column-type chatter for the whole run
options(readr.show_progress = FALSE)
options(readr.show_col_types = FALSE)

# Territories excluded from modeling because they report too little data
insufficient_data_geos <- c("as", "mp", "vi", "gu")

# Pipeline configuration: where forecasts live upstream and on disk
config <- list(
  # Raw-content root of the FluSight forecast hub's model-output folder
  base_url = "https://raw.githubusercontent.com/cdcepi/FluSight-forecast-hub/main/model-output",
  # Forecasters whose submissions we mirror locally
  forecasters = c("CMU-TimeSeries", "FluSight-baseline", "FluSight-ensemble", "FluSight-base_seasonal", "UMass-flusion"),
  # Local mirror of downloaded forecast CSVs
  local_storage = "data/forecasts",
  # CSV ledger of which files have been downloaded, and with what result
  tracking_file = "data/download_tracking.csv"
)
| 20 | + |
# Ensure the local storage tree exists: the base directory plus one
# subdirectory per forecaster. Existing directories are left untouched
# (showWarnings = FALSE suppresses the "already exists" warning).
#
# @param base_dir Root directory for downloaded forecasts.
# @param forecasters Character vector of per-forecaster subdirectories to
#   create; defaults to the configured forecaster list, so existing callers
#   are unaffected.
# @return Invisibly, NULL (called for its side effect).
setup_directories <- function(base_dir, forecasters = config$forecasters) {
  dir.create(base_dir, recursive = TRUE, showWarnings = FALSE)
  for (forecaster in forecasters) {
    dir.create(file.path(base_dir, forecaster), recursive = TRUE, showWarnings = FALSE)
  }
  invisible(NULL)
}
| 28 | + |
# Load the download-tracking ledger, or an empty one if it doesn't exist yet.
#
# The ledger records one row per attempted download: which forecaster, which
# file, when, and whether it succeeded. All four columns are character.
#
# @return A tibble with columns forecaster, filename, download_date, status.
load_tracking_data <- function() {
  if (file.exists(config$tracking_file)) {
    # Pin all four columns to character ("cccc") so the result always has
    # the same schema as the empty tibble below; otherwise readr may guess
    # e.g. a datetime for download_date and later bind_rows would mix types.
    read_csv(config$tracking_file, col_types = "cccc")
  } else {
    tibble(
      forecaster = character(),
      filename = character(),
      download_date = character(),
      status = character()
    )
  }
}
| 42 | + |
# Build the expected weekly submission filenames for one forecaster over a
# date range. The hub names files "<YYYY-MM-DD>-<forecaster>.csv", one per
# week starting at start_date.
#
# @param start_date,end_date Range endpoints (anything lubridate::as_date
#   accepts).
# @param forecaster Forecaster name as it appears in the hub.
# @return Character vector of candidate filenames.
generate_filenames <- function(start_date, end_date, forecaster) {
  week_starts <- seq(as_date(start_date), as_date(end_date), by = "week")
  sprintf("%s-%s.csv", format(week_starts, "%Y-%m-%d"), forecaster)
}
| 54 | + |
# Check whether <base_url>/<forecaster>/<filename> exists upstream (HTTP 200).
#
# A network failure (DNS, timeout, TLS) is reported as FALSE — "not
# available right now" — instead of aborting the whole download run; the
# file will simply be retried on the next run since it never enters the
# tracking ledger.
#
# @param forecaster Forecaster directory name in the hub.
# @param filename Candidate file name (see generate_filenames()).
# @return TRUE if the file is retrievable, FALSE otherwise.
check_github_file <- function(forecaster, filename) {
  url <- paste0(config$base_url, "/", forecaster, "/", filename)
  tryCatch(
    status_code(GET(url)) == 200,
    error = function(e) FALSE
  )
}
| 61 | + |
# Download one forecast file into the local mirror.
#
# @param forecaster Forecaster directory name (used both upstream and locally).
# @param filename File to fetch.
# @return "success" or "failed"; this string is stored in the tracking
#   ledger, and only "success" rows are skipped on later runs.
download_forecast_file <- function(forecaster, filename) {
  url <- paste0(config$base_url, "/", forecaster, "/", filename)
  local_path <- file.path(config$local_storage, forecaster, filename)

  tryCatch(
    {
      download.file(url, local_path, mode = "wb", quiet = TRUE)
      "success"
    },
    error = function(e) {
      # download.file can leave a partial file behind on error; remove it
      # so read_all_forecasts() never parses a truncated CSV.
      if (file.exists(local_path)) {
        unlink(local_path)
      }
      "failed"
    }
  )
}
| 77 | + |
# Download any forecast submissions from the last `days_back` days that are
# not already recorded as successfully downloaded, and append the outcomes
# to the tracking ledger.
#
# @param days_back How far back from today to look for weekly submissions.
# @return The full (updated) tracking tibble.
update_forecast_files <- function(days_back = 30) {
  # Setup: make sure the local directory tree and ledger exist
  setup_directories(config$local_storage)
  tracking_data <- load_tracking_data()

  # Generate date range. get_forecast_reference_date() is a project helper
  # (sourced via load_all.R); presumably it snaps the start date to the
  # hub's weekly reference day -- confirm against its definition.
  end_date <- Sys.Date()
  start_date <- get_forecast_reference_date(end_date - days_back)

  # Process each forecaster, accumulating one tracking row per attempt
  new_tracking_records <- list()

  pb_forecasters <- progress_bar$new(
    format = "Downloading forecasts from :forecaster [:bar] :percent :eta",
    total = length(config$forecasters),
    clear = FALSE,
    width = 60
  )

  for (forecaster in config$forecasters) {
    pb_forecasters$tick(tokens = list(forecaster = forecaster))

    # Get potential filenames (one per week in the range)
    filenames <- generate_filenames(start_date, end_date, forecaster)

    # Filter out already downloaded files. `!!forecaster` unquotes the loop
    # variable so dplyr compares the column against its value rather than
    # the column against itself.
    existing_files <- tracking_data %>%
      filter(forecaster == !!forecaster, status == "success") %>%
      pull(filename)

    new_files <- setdiff(filenames, existing_files)

    if (length(new_files) > 0) {
      # Create nested progress bar for this forecaster's files
      pb_files <- progress_bar$new(
        format = "  Downloading files [:bar] :current/:total :filename",
        total = length(new_files)
      )

      for (filename in new_files) {
        pb_files$tick(tokens = list(filename = filename))

        # Only attempt (and record) files that actually exist upstream;
        # absent weeks are silently skipped and retried on the next run.
        if (check_github_file(forecaster, filename)) {
          status <- download_forecast_file(forecaster, filename)

          new_tracking_records[[length(new_tracking_records) + 1]] <- tibble(
            forecaster = forecaster,
            filename = filename,
            download_date = as.character(Sys.time()),
            status = status
          )
        }
      }
    }
  }

  # Update tracking data: append this run's attempts and persist to disk
  if (length(new_tracking_records) > 0) {
    new_tracking_data <- bind_rows(new_tracking_records)
    tracking_data <- bind_rows(tracking_data, new_tracking_data)
    write_csv(tracking_data, config$tracking_file)
  }

  return(tracking_data)
}
| 144 | + |
# Read every successfully downloaded forecast CSV into a single tibble.
#
# Each file is parsed with an explicit column specification, joined to state
# metadata via the project helper add_state_info(), and tagged with its
# forecaster and the forecast date parsed from the filename.
#
# @return A tibble combining all available forecast files; zero rows when
#   nothing has been downloaded yet.
read_all_forecasts <- function() {
  tracking_data <- read_csv(config$tracking_file)

  successful_downloads <- tracking_data %>%
    filter(status == "success")

  # seq_len() rather than 1:nrow(): with zero successful rows, 1:0 would
  # yield c(1, 0) and index out of bounds; seq_len(0) yields an empty loop.
  forecast_data <- map(seq_len(nrow(successful_downloads)), function(i) {
    row <- successful_downloads[i, ]
    path <- file.path(config$local_storage, row$forecaster, row$filename)
    if (!file.exists(path)) {
      # Ledger says success but the file is gone (e.g. manually deleted);
      # skip it — NULL elements vanish in bind_rows().
      return(NULL)
    }
    read_csv(path, col_types = list(
      reference_date = col_date(format = "%Y-%m-%d"),
      target_end_date = col_date(format = "%Y-%m-%d"),
      target = col_character(),
      location = col_character(),
      horizon = col_integer(),
      output_type = col_character(),
      output_type_id = col_character(),
      value = col_double(),
      forecaster = col_character(),
      forecast_date = col_date(format = "%Y-%m-%d")
    )) %>%
      add_state_info(geo_value_col = "location", old_geo_code = "state_code", new_geo_code = "state_id") %>%
      mutate(
        forecaster = row$forecaster,
        # Parse to Date so the column keeps the type declared in the
        # col_types spec above; a bare str_extract() would silently turn
        # it into character and make later binds/comparisons inconsistent.
        forecast_date = as.Date(str_extract(row$filename, "\\d{4}-\\d{2}-\\d{2}")),
        geo_value = state_id
      )
  })

  bind_rows(forecast_data)
}
| 179 | + |
# Refresh the local forecast mirror (looking back 120 days) and return
# everything currently on disk as one combined tibble.
get_latest_data <- function() {
  # Side effect first: fetch any submissions we don't have yet
  invisible(update_forecast_files(days_back = 120))
  read_all_forecasts()
}
| 184 | + |
# Target definitions returned to the {targets} pipeline.
rlang::list2(
  # Latest NHSN hospital-admissions snapshot, cleaned into an epi_df
  tar_target(
    nhsn_latest_data,
    command = {
      # lubridate::wday(): Sunday == 1, so this condition selects wday 4-5,
      # i.e. Wednesday and Thursday only.
      # NOTE(review): the comment below says "Wednesday to Friday", but
      # Friday (wday 6) is excluded by `< 6` -- confirm which is intended.
      if (wday(Sys.Date()) < 6 & wday(Sys.Date()) > 3) {
        # download from the preliminary data source from Wednesday to Friday
        most_recent_result <- readr::read_csv("https://data.cdc.gov/resource/mpgq-jmmr.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm")
      } else {
        most_recent_result <- readr::read_csv("https://data.cdc.gov/resource/ua7e-t2fy.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm")
      }
      most_recent_result %>%
        # process_nhsn_data(), data_substitutions(), as_epi_df() and %nin%
        # come from the sourced project code / loaded packages
        process_nhsn_data() %>%
        filter(disease == "nhsn_flu") %>%
        select(-disease) %>%
        # Drop territories with too little data to model
        filter(geo_value %nin% insufficient_data_geos) %>%
        mutate(
          source = "nhsn",
          # Normalize the national geo code
          geo_value = ifelse(geo_value == "usa", "us", geo_value),
          # Shift time_value back 3 days -- presumably aligning week-ending
          # Saturdays to the Wednesday reference day; TODO confirm
          time_value = time_value - 3
        ) %>%
        # Keep only the most recent data version, then drop the version key
        filter(version == max(version)) %>%
        select(-version) %>%
        data_substitutions(disease = "flu") %>%
        as_epi_df(other_keys = "source", as_of = Sys.Date())
    }
  ),
  # Full revision history of NHSN flu data, built by a project helper
  tar_target(
    name = nhsn_archive_data,
    command = {
      create_nhsn_data_archive(disease = "nhsn_flu")
    }
  ),
  # Mirror external forecaster submissions from the FluSight hub
  tar_target(download_forecasts, update_forecast_files(days_back = 120)),
  # Combine all downloaded forecast files into one tibble
  tar_target(all_forecasts, read_all_forecasts())
)
0 commit comments