Merge pull request #3 from FluSightNetwork/master
update my files
tkcy authored Oct 27, 2017
2 parents dc594e5 + 5a38051 commit 568e55c
Showing 9 changed files with 3,948 additions and 3,829 deletions.
5 changes: 0 additions & 5 deletions .travis.yml
@@ -20,9 +20,4 @@ env:
global:
- ENCRYPTION_LABEL: "ae5ecd417952"

before_install:
- sudo Rscript -e "install.packages('devtools', repos='http://cran.us.r-project.org')"
- sudo Rscript -e "devtools::install_github('hrbrmstr/cdcfluview')"
- sudo Rscript -e "devtools::install_github('jarad/FluSight')"

script: bash ./travis-main.sh
2 changes: 1 addition & 1 deletion flusight-deploy/0-init-flusight.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# Script to downlaod and setup flusight directory structure
# Script to download and setup flusight directory structure
set -e

# Download flusight master
7,582 changes: 3,791 additions & 3,791 deletions scores/scores.csv

Large diffs are not rendered by default.

102 changes: 86 additions & 16 deletions scripts/check-ensemble-scores.R
@@ -14,7 +14,6 @@ truth_condense <- function(year) {
expand_truth(week_expand = 1, percent_expand = 5)
}


exp_truth_2010 <- truth_condense(2010)
exp_truth_2011 <- truth_condense(2011)
exp_truth_2012 <- truth_condense(2012)
@@ -23,6 +22,33 @@ exp_truth_2014 <- truth_condense(2014)
exp_truth_2015 <- truth_condense(2015)
exp_truth_2016 <- truth_condense(2016)

# Pull in truth based on week 28 values in given year
obs_truth <- read.csv("../scores/target-multivals.csv",
stringsAsFactors = F)

expand_old_truth <- function(season) {
obs_truth %>%
filter(Season == season & (Calendar.Week >= 43 | Calendar.Week <= 18)) %>%
mutate(forecast_week = ifelse(Target %in% c("Season onset", "Season peak week",
"Season peak percentage"),
NA,
Calendar.Week)) %>%
rename(location = Location, target = Target, bin_start_incl = Valid.Bin_start_incl) %>%
distinct(location, target, forecast_week, bin_start_incl) %>%
{if (season == "2014/2015") expand_truth(., week53 = T) else expand_truth(.) }

}

obs_exp_truth_2010 <- expand_old_truth("2010/2011")
obs_exp_truth_2011 <- expand_old_truth("2011/2012")
obs_exp_truth_2012 <- expand_old_truth("2012/2013")
obs_exp_truth_2013 <- expand_old_truth("2013/2014")
obs_exp_truth_2014 <- expand_old_truth("2014/2015")
obs_exp_truth_2015 <- expand_old_truth("2015/2016")
obs_exp_truth_2016 <- expand_old_truth("2016/2017")



# Pull in csvs from ensembles

read_all_entries <- function(model) {
@@ -54,20 +80,20 @@ ensemble_scores <- function(model) {
entries <- read_all_entries(model)

scores <- list()
scores[["2010/2011"]] <- purrr::map(entries[["2010/2011"]],
score_entry, exp_truth_2010)
scores[["2011/2012"]] <- purrr::map(entries[["2011/2012"]],
score_entry, exp_truth_2011)
scores[["2012/2013"]] <- purrr::map(entries[["2012/2013"]],
score_entry, exp_truth_2012)
scores[["2013/2014"]] <- purrr::map(entries[["2013/2014"]],
score_entry, exp_truth_2013)
scores[["2014/2015"]] <- purrr::map(entries[["2014/2015"]],
score_entry, exp_truth_2014)
scores[["2015/2016"]] <- purrr::map(entries[["2015/2016"]],
score_entry, exp_truth_2015)
scores[["2016/2017"]] <- purrr::map(entries[["2016/2017"]],
score_entry, exp_truth_2016)
scores[["2010/2011"]] <- map(entries[["2010/2011"]],
score_entry, obs_exp_truth_2010)
scores[["2011/2012"]] <- map(entries[["2011/2012"]],
score_entry, obs_exp_truth_2011)
scores[["2012/2013"]] <- map(entries[["2012/2013"]],
score_entry, obs_exp_truth_2012)
scores[["2013/2014"]] <- map(entries[["2013/2014"]],
score_entry, obs_exp_truth_2013)
scores[["2014/2015"]] <- map(entries[["2014/2015"]],
score_entry, obs_exp_truth_2014)
scores[["2015/2016"]] <- map(entries[["2015/2016"]],
score_entry, obs_exp_truth_2015)
scores[["2016/2017"]] <- map(entries[["2016/2017"]],
score_entry, obs_exp_truth_2016)

all_scores <- bind_rows(map(scores, bind_rows), .id = "season")

@@ -78,4 +104,48 @@ constant_weight_scores <- ensemble_scores("constant-weights")
equal_weight_scores <- ensemble_scores("equal-weights")
target_region_scores <- ensemble_scores("target-and-region-based-weights")
target_scores <- ensemble_scores("target-based-weights")
target_type_scores <- ensemble_scores("target-type-based-weights")
target_type_scores <- ensemble_scores("target-type-based-weights")

# Create boundaries for scores that we're interested in
all_ensemble_scores <- bind_rows(
constant_weight_scores %>% mutate(Model = "FSNetwork-CW"),
equal_weight_scores %>% mutate(Model = "FSNetwork-EW"),
target_region_scores %>% mutate(Model = "FSNetwork-TRW"),
target_scores %>% mutate(Model = "FSNetwork-TW"),
target_type_scores %>% mutate(Model = "FSNetwork-TTW")
) %>%
rename(Season = season, Location = location, Target = target,
FluSight_score = score, Epiweek = forecast_week) %>%
mutate(Model.Week = ifelse(Season == "2014/2015",
ifelse(Epiweek < 40, Epiweek + 53, Epiweek),
ifelse(Epiweek < 40, Epiweek + 52, Epiweek)))

# Compare Travis scores to FluSight scores
compare_scores <- all_ensemble_scores %>%
left_join(travis_scores, by = c("Season", "Location", "Target",
"Epiweek", "Model.Week", "Model")) %>%
select(Season, Location, Target, Epiweek, Model, FluSight_score, Multi.bin.score) %>%
mutate(diff = FluSight_score - Multi.bin.score)

# Print scores that differ by more than 1e-12
different_score <- compare_scores %>%
filter(diff > 1e-12)

table(different_score$Model)
table(different_score$Target)
table(different_score$Location)
table(different_score$Epiweek)
table(different_score$Season)

# All differences in 2014/15 peak week
obs_exp_truth_2014 %>%
filter(target == "Season peak week", location == "HHS Region 3") %>%
distinct

obs_truth %>%
filter(Season == "2014/2015", Target == "Season peak week") %>%
distinct(Season, Target, Location, Valid.Bin_start_incl)

compare_scores %>%
filter(Season == "2014/2015", Target == "Season peak week",
Location == "HHS Region 3", Epiweek == 1)
36 changes: 20 additions & 16 deletions scripts/generate-scores.js
100644 → 100755
@@ -51,22 +51,25 @@ const getTrueData = truthFile => {
}

/**
* Not exactly linspace
* Return a season string for given time data
*/
const arange = (start, end, gap) => {
let out = [start]
while (out[out.length - 1] !== end) {
out.push(out[out.length - 1] + gap)
}
return out
const getSeason = (year, epiweek) => {
return (epiweek < 40) ? `${year-1}/${year}` : `${year}/${year+1}`
}

/**
* Return a neighbouring region of 1 bin around a given week
* Tell the last week (52/53) for given time data
*/
const weekNeighbours = (binStart, year) => {
let lastWeek = (new mmwr.MMWRDate(year, 1)).nWeeks
const getLastWeek = (year, epiweek) => {
let seasonFirstYear = parseInt(getSeason(year, epiweek).split('/')[0])
return (new mmwr.MMWRDate(seasonFirstYear)).nWeeks
}

/**
* Return a neighbouring region of 1 bin around a given week
*/
const weekNeighbours = (binStart, year, epiweek) => {
let lastWeek = getLastWeek(year, epiweek)
// Handle edge cases
if (binStart === 40) {
// We are at the beginning of the season
@@ -76,7 +79,7 @@ const weekNeighbours = (binStart, year) => {
// The next bin is 1
return [binStart - 1, binStart, 1]
} else if (binStart === 1) {
return [(new mmwr.MMWRDate(year - 1, 1)).nWeeks, binStart, 2]
return [lastWeek, binStart, 2]
} else {
// This is regular case
return [binStart - 1, binStart, binStart + 1]
@@ -86,12 +89,12 @@
/**
* Return expanded set of binStarts for given bin value and target type
*/
const expandBinStarts = (binStarts, targetType, year) => {
const expandBinStarts = (binStarts, targetType, year, epiweek) => {
if (targetType.endsWith('ahead') || targetType.endsWith('percentage')) {
// This is a percentage target
return util.unique(binStarts.reduce((acc, binStart) => {
return acc.concat(
arange(-0.5, 0.5, 0.1)
util.arange(-0.5, 0.5, 0.1)
.map(diff => binStart + diff)
.map(bs => Math.round(bs * 10) / 10) // Round to get just one place decimal
.filter(bs => (bs >= 0.0 - Number.EPSILON) && (bs <= 13.0 + Number.EPSILON))
@@ -100,7 +103,7 @@ const expandBinStarts = (binStarts, targetType, year) => {
} else {
// This is a week target
let uniqueBinStarts = util.unique(binStarts.reduce((acc, binStart) => {
return acc.concat(weekNeighbours(binStart, year).map(bs => Math.round(bs)))
return acc.concat(weekNeighbours(binStart, year, epiweek).map(bs => Math.round(bs)))
}, []))

// If every one is NaN, then just return one NaN
@@ -156,6 +159,7 @@ let outputLines = [header.join(',')]
let errorLogLines = []
let errorBlacklistLines = []
let trueData = getTrueData(truthFile)
let csvData

// NOTE: For scores, we only consider these two directories
models.getModelDirs(
@@ -170,12 +174,12 @@
csvs.forEach(csvFile => {
let {year, epiweek} = models.getCsvTime(csvFile)
try {
let csvData = getCsvData(csvFile)
csvData = getCsvData(csvFile)
meta.regions.forEach(region => {
meta.targets.forEach(target => {
let trueTargets = trueData[year][epiweek][region][target]
let trueBinStarts = trueTargets.map(tt => parseFloat(tt[6]))
let expandedTrueBinStarts = expandBinStarts(trueBinStarts, target, year)
let expandedTrueBinStarts = expandBinStarts(trueBinStarts, target, parseInt(year), parseInt(epiweek))
let season = trueTargets[0][2]
let modelWeek = trueTargets[0][3]
let modelProbabilities = csvData[region][target]
9 changes: 9 additions & 0 deletions scripts/modules/util.js
@@ -44,7 +44,16 @@ const readYamlFile = fileName => {
return yaml.safeLoad(fs.readFileSync(fileName, 'utf8'))
}

const arange = (start, end, gap) => {
let len = 1 + ((end - start) / gap)
return [...Array(len).keys()].map(i => start + gap * i)
}

const isClose = (a, b, tol = Number.EPSILON) => Math.abs(a - b) < tol

module.exports.isSubset = isSubset
module.exports.unique = unique
module.exports.writeLines = writeLines
module.exports.readYamlFile = readYamlFile
module.exports.arange = arange
module.exports.isClose = isClose
30 changes: 30 additions & 0 deletions scripts/report.Rmd
@@ -0,0 +1,30 @@
---
title: 'Combining Mechanistic and Statistical Models to Forecast Influenza in the U.S.:
A Collaborative Ensemble from the FluSight Network'
author: Nicholas G Reich, Logan Brooks, Abhinav Tushar, Teresa Yamana, Craig McGowan,
Evan Ray, Dave Osthus, Roni Rosenfeld
date: "10/26/2017"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Overview
In the 2016/2017 influenza season, the CDC ran the 4th annual FluSight competition and received XX submissions from XX teams. During the season, analysts at the CDC built an ensemble model that combined all of the submitted models by taking the "average" forecast for each influenza target. This model was one of the top-performing models for the entire season.
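
The averaging step itself is simple: for each target, the ensemble probability assigned to a bin is the mean of the component models' probabilities for that bin. A minimal sketch, assuming two hypothetical component models and a three-bin target (illustrative values, not the CDC's code):

```r
# Minimal sketch of an equal-weight ("average") ensemble; hypothetical models
# and probabilities, not the CDC implementation.
library(dplyr)

component_forecasts <- data.frame(
  model          = rep(c("ModelA", "ModelB"), each = 3),
  bin_start_incl = rep(c(1.0, 1.1, 1.2), times = 2),
  value          = c(0.2, 0.5, 0.3,   # ModelA bin probabilities
                     0.4, 0.4, 0.2)   # ModelB bin probabilities
)

equal_weight_ensemble <- component_forecasts %>%
  group_by(bin_start_incl) %>%
  summarise(value = mean(value))

equal_weight_ensemble
# bin probabilities become 0.30, 0.45, 0.25 and still sum to 1
```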

In March 2017 the FluSight Network was founded to create a collaborative network of influenza forecasters. This group worked throughout 2017 to create a set of guidelines and an experimental design that would enable a publicly available, multi-team, real-time submission of an ensemble model with validated, performance-based weights for each model (i.e., not a simple average of models).
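
One way to make "performance-based weights" concrete, purely as an illustration and not the procedure used to fit the FluSight Network ensembles, is to weight each model in proportion to its historical forecast skill:

```r
# Illustrative sketch only: weights proportional to exp(mean log score) on
# previously scored seasons. Model names and scores are hypothetical, and this
# is not the FluSight Network's actual weight-estimation method.
library(dplyr)

past_skill <- data.frame(
  model          = c("ModelA", "ModelB", "ModelC"),
  mean_log_score = c(-1.2, -1.8, -2.5)   # less negative = better forecasts
)

weights <- past_skill %>%
  mutate(weight = exp(mean_log_score) / sum(exp(mean_log_score)))

weights
# weight ≈ 0.55, 0.30, 0.15 (sums to 1)
```

Combining component forecasts then proceeds as in the equal-weight sketch above, except with `summarise(value = sum(value * weight))` after joining the weights, instead of a plain mean.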

This document provides an executive summary of that effort, highlighting the results and documenting the chosen model that was designated for real-time submission during the 2017/2018 U.S. influenza season.

Institution | No. of models | Team leaders
----------- | ------------- | -------------
UMass-Amherst | 3 | Nicholas Reich, Abhinav Tushar, Evan Ray
Carnegie Mellon | XX | Logan Brooks, Roni Rosenfeld
Columbia University | XX | Teresa Yamana, Jeff Shaman
Los Alamos National Laboratories | 1 | Dave Osthus

## Selected Ensemble Model for Real-time Submissions

The model selected for real-time submissions is the model that performed
8 changes: 8 additions & 0 deletions setup-R-packages.sh
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

# Script to setup packages required for R code
set -e

sudo Rscript -e "install.packages('devtools', repos='http://cran.us.r-project.org')"
sudo Rscript -e "devtools::install_github('hrbrmstr/cdcfluview')"
sudo Rscript -e "devtools::install_github('jarad/FluSight')"
3 changes: 3 additions & 0 deletions travis-main.sh
@@ -50,6 +50,9 @@ git add ./model-forecasts/component-models/model-id-map.csv
git diff-index --quiet HEAD || git commit -m "autogenerated csvs"
git push $SSH_REPO HEAD:master

# Setup R now
bash ./setup-R-packages.sh

echo "> Building visualizer"
# Go back and build flusight
git checkout gh-pages || git checkout --orphan gh-pages