Push nflfastR 2.2.0.9000

nflverse · Aug 9, 2020 · 81f1b38 · 81f1b38
1 parent 1f15706
commit 81f1b38
Show file tree

Hide file tree

Showing 13 changed files with 550 additions and 15 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -17,3 +17,7 @@
 ^man/figures/header_github\.png$
 ^man/figures/header_twitter\.png$
 ^man/figures/nflfastR_logo_fillsize\.png$
+^cran-comments\.md$
+^CRAN-RELEASE$
+^man/figures/readme-cp-model-1\.png$
+^man/figures/readme-epa-model-1\.png$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: nflfastR
 Title: Functions to Efficiently Scrape NFL Play by Play Data
-Version: 2.1.3
+Version: 2.2.0.9000
 Authors@R: 
     c(person(given = "Sebastian",
              family = "Carl",
@@ -59,4 +59,3 @@ Suggests:
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.1.1
-
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_qb_epa)
+export(add_xyac)
 export(calculate_expected_points)
 export(calculate_win_probability)
 export(clean_pbp)
@@ -55,3 +56,4 @@ importFrom(tidyr,unnest)
 importFrom(tidyr,unnest_wider)
 importFrom(tidyselect,any_of)
 importFrom(tidyselect,matches)
+importFrom(xgboost,getinfo)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,16 @@
+# nflfastR (development version)
+
+* Fix `add_xyac()` breaking with some old packages
+* Fix `add_xyac()` calculations being wrong for some failed 4th downs
+* Updated Readme with ep and cp model plots
+* Updated `vignette("examples")` with the new `add_xyac()` function
+* Added xYAC model to `vignette("nflfastR-models")`
+
+# nflfastR 2.2.0
+
+* New function `add_xyac()` which adds the following columns associated with expected yards after
+the catch (xYAC): `xyac_epa`, `xyac_success`, `xyac_fd`, `xyac_mean_yardage`, `xyac_median_yardage`
+
 # nflfastR 2.1.3
 
 * Fixed a bug in `series_success` caused by bad `drive` information provided by NFL

diff --git a/R/helper_add_xyac.R b/R/helper_add_xyac.R
@@ -0,0 +1,217 @@
+################################################################################
+# Author: Ben Baldwin, Sebastian Carl
+# Purpose: Function to add expected yac variables.
+# Code Style Guide: styler::tidyverse_style()
+################################################################################
+#' Add expected yards after completion (xyac) variables
+#'
+#' @param pbp is a Data frame of play-by-play data scraped using \code{\link{fast_scraper}}.
+#' @details Build columns that capture what we should expect after the catch.
+#' @return The input Data Frame of the parameter 'pbp' with the following columns
+#' added:
+#' \describe{
+#' \item{xyac_epa}{Expected value of EPA gained after the catch, starting from where the catch was made. Zero yards after the catch would be listed as zero EPA.}
+#' \item{xyac_success}{Probability play earns positive EPA (relative to where play started) based on where ball was caught.}
+#' \item{xyac_fd}{Probability play earns a first down based on where the ball was caught.}
+#' \item{xyac_mean_yardage}{Average expected yards after the catch based on where the ball was caught.}
+#' \item{xyac_median_yardage}{Median expected yards after the catch based on where the ball was caught.}
+#' }
+#' @importFrom rlang .data
+#' @importFrom xgboost getinfo
+#' @export
+add_xyac <- function(pbp) {
+
+  # testing only
+  # pbp <- g
+
+  pbp <- pbp %>% dplyr::select(-tidyselect::any_of(drop.cols.xyac))
+
+  # for joining at the end
+  pbp <- pbp %>%
+    dplyr::mutate(index = 1:dplyr::n())
+
+  # prepare_xyac_data helper function shown below
+  passes <- prepare_xyac_data(pbp) %>%
+    filter(.data$valid_pass == 1, .data$distance_to_goal != 0)
+
+  if (!nrow(passes) == 0) {
+    # initialize xyac_model to avoid R CMD check note
+    xyac_model <- NULL
+    suppressWarnings(
+      # load the model from github because it is too big for the package
+      try(
+        load(url("https://github.com/guga31bb/nflfastR-data/blob/master/models/xyac_model.Rdata?raw=true")),
+        silent = TRUE
+      )
+    )
+
+    if (!is.null(xyac_model)) {
+      xyac_vars <-
+        stats::predict(
+          xyac_model,
+          as.matrix(passes %>% xyac_model_select())
+        ) %>%
+        tibble::as_tibble() %>%
+        dplyr::rename(prob = "value") %>%
+        dplyr::bind_cols(
+          purrr::map_dfr(seq_along(passes$index), function(x) {
+            tibble::tibble(
+              "yac" = -5:70,
+              "index" = passes$index[[x]],
+              "distance_to_goal" = passes$distance_to_goal[[x]],
+              "season" = passes$season[[x]],
+              "week" = passes$week[[x]],
+              "home_team" = passes$home_team[[x]],
+              "posteam" = passes$posteam[[x]],
+              "roof" = passes$roof[[x]],
+              "half_seconds_remaining" = dplyr::if_else(
+                passes$half_seconds_remaining[[x]] <= 6,
+                0,
+                passes$half_seconds_remaining[[x]] - 6
+              ),
+              "down" = as.integer(passes$down[[x]]),
+              "ydstogo" = as.integer(passes$ydstogo[[x]]),
+              "original_ydstogo" = as.integer(passes$ydstogo[[x]]),
+              "posteam_timeouts_remaining" = passes$posteam_timeouts_remaining[[x]],
+              "defteam_timeouts_remaining" = passes$defteam_timeouts_remaining[[x]],
+              "original_spot" = passes$yardline_100[[x]],
+              "original_ep" = passes$ep[[x]],
+              "air_epa" = passes$air_epa[[x]],
+              "air_yards" = passes$air_yards[[x]]
+            )
+          })
+        ) %>%
+        dplyr::group_by(.data$index) %>%
+        dplyr::mutate(
+          max_loss = dplyr::if_else(.data$distance_to_goal < 95, -5, .data$distance_to_goal - 99),
+          max_gain = dplyr::if_else(.data$distance_to_goal > 70, 70, .data$distance_to_goal),
+          cum_prob = cumsum(.data$prob),
+          prob = dplyr::case_when(
+            # truncate probs at loss greater than max loss
+            .data$yac == .data$max_loss ~ .data$cum_prob,
+            # same for gains bigger than possible
+            .data$yac == .data$max_gain ~ 1 - dplyr::lag(.data$cum_prob, 1),
+            TRUE ~ .data$prob
+          ),
+          # get end result for each possibility
+          yardline_100 = .data$distance_to_goal - .data$yac
+        ) %>%
+        dplyr::filter(.data$yac >= .data$max_loss, .data$yac <= .data$max_gain) %>%
+        dplyr::select(-.data$cum_prob) %>%
+        dplyr::mutate(
+          posteam_timeouts_pre = .data$posteam_timeouts_remaining,
+          defeam_timeouts_pre = .data$defteam_timeouts_remaining,
+          gain = .data$original_spot - .data$yardline_100,
+          turnover = dplyr::if_else(.data$down == 4 & .data$gain < .data$ydstogo, as.integer(1), as.integer(0)),
+          down = dplyr::if_else(.data$gain >= .data$ydstogo, 1, .data$down + 1),
+          ydstogo = dplyr::if_else(.data$gain >= .data$ydstogo, 10, .data$ydstogo - .data$gain),
+          # ydstogo can't be bigger than yardline
+          ydstogo = dplyr::if_else(.data$ydstogo >= .data$yardline_100, as.integer(.data$yardline_100), as.integer(.data$ydstogo)),
+          # possession change if 4th down failed
+          down = dplyr::if_else(.data$turnover == 1, as.integer(1), as.integer(.data$down)),
+          ydstogo = dplyr::if_else(.data$turnover == 1, as.integer(10), as.integer(.data$ydstogo)),
+          yardline_100 = dplyr::if_else(.data$turnover == 1, as.integer(100 - .data$yardline_100), as.integer(.data$yardline_100)),
+          posteam_timeouts_remaining = dplyr::if_else(.data$turnover == 1,
+                                                      .data$defeam_timeouts_pre,
+                                                      .data$posteam_timeouts_pre),
+          defteam_timeouts_remaining = dplyr::if_else(.data$turnover == 1,
+                                                      .data$posteam_timeouts_pre,
+                                                      .data$defeam_timeouts_pre)
+        ) %>%
+        dplyr::ungroup() %>%
+        nflfastR::calculate_expected_points() %>%
+        dplyr::group_by(.data$index) %>%
+        dplyr::mutate(
+          ep = dplyr::case_when(
+            .data$yardline_100 == 0 ~ 7,
+            .data$turnover == 1 ~ -1 * .data$ep,
+            TRUE ~ ep
+          ),
+          epa = .data$ep - .data$original_ep,
+          wt_epa = .data$epa * .data$prob,
+          wt_yardln = .data$yardline_100 * .data$prob,
+          med = dplyr::if_else(
+            cumsum(.data$prob) > .5 & dplyr::lag(cumsum(.data$prob) < .5), .data$yac, as.integer(0)
+          )
+        ) %>%
+        dplyr::summarise(
+          xyac_epa = sum(.data$wt_epa) - dplyr::first(.data$air_epa),
+          xyac_mean_yardage = (dplyr::first(.data$original_spot) - dplyr::first(.data$air_yards)) - sum(.data$wt_yardln),
+          xyac_median_yardage = max(.data$med),
+          xyac_success = sum((.data$ep > .data$original_ep) * .data$prob),
+          xyac_fd = sum((.data$gain >= .data$original_ydstogo) * .data$prob),
+          .groups = "drop_last"
+        ) %>%
+        dplyr::ungroup()
+
+      pbp <- pbp %>%
+        dplyr::left_join(xyac_vars, by = "index") %>%
+        dplyr::select(-.data$index)
+
+      message("added xyac variables")
+
+    } else {# means xyac_model isn't available
+      message("This function needs to download the model data from GitHub. Please check your Internet connection and try again!")
+      pbp <- pbp %>% dplyr::select(-.data$index)
+    }
+  } else {# means no valid pass plays in the pbp
+    pbp <- pbp %>%
+      dplyr::mutate(
+        xyac_epa = NA_real_,
+        xyac_mean_yardage = NA_real_,
+        xyac_median_yardage = NA_real_,
+        xyac_success = NA_real_,
+        xyac_fd = NA_real_
+      ) %>%
+      dplyr::select(-.data$index)
+    message("No non-NA values for xyac calculation detected. xyac variables set to NA")
+  }
+
+  # on old versions of dplyr, a .group column is created, which we don't want
+  pbp <- pbp %>% dplyr::select(-tidyselect::any_of(".group"))
+
+  return(pbp)
+}
+
+
+### helper function for getting the data ready
+prepare_xyac_data <- function(pbp) {
+
+  # valid pass play: at least -15 air yards, less than 70 air yards, has intended receiver, has pass location
+  passes <- pbp %>%
+    make_model_mutations() %>%
+    dplyr::mutate(
+      receiver_player_name =
+        stringr::str_extract(.data$desc, "(?<=((to)|(for))\\s[:digit:]{0,2}\\-{0,1})[A-Z][A-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?"),
+      pass_middle = dplyr::if_else(.data$pass_location == "middle", 1, 0),
+      air_is_zero = dplyr::if_else(.data$air_yards == 0, 1, 0),
+      distance_to_sticks = .data$air_yards - .data$ydstogo,
+      distance_to_goal = .data$yardline_100 - .data$air_yards,
+      valid_pass = dplyr::if_else(
+        (.data$complete_pass == 1 | .data$incomplete_pass == 1 | .data$interception == 1) &
+          !is.na(.data$air_yards) & .data$air_yards >= -15 & .data$air_yards < 70 &
+          !is.na(.data$receiver_player_name) & !is.na(.data$pass_location),
+        1, 0
+      )
+    )
+  return(passes)
+}
+
+### another helper function for getting the data ready
+xyac_model_select <- function(pbp) {
+  pbp %>%
+    dplyr::select(
+      "air_yards", "yardline_100", "ydstogo", "distance_to_goal",
+      "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
+      "era2", "era3", "era4", "qb_hit", "home",
+      "outdoors", "retractable", "dome", "distance_to_sticks"
+    )
+}
+
+# These columns are being generated by add_xyac and the function tries to drop
+# them in case it is being used on a pbp dataset where the columns already exist
+drop.cols.xyac <- c(
+  "xyac_epa", "xyac_mean_yardage", "xyac_median_yardage", "xyac_success", "xyac_fd", ".groups"
+)
+
+
diff --git a/R/helper_database_functions.R b/R/helper_database_functions.R
@@ -78,7 +78,8 @@ update_db <- function(dbdir = ".",
     message(glue::glue("Starting download of {length(missing)} games ..."))
     new_pbp <- fast_scraper(missing, pp = is_installed_furrr) %>%
       clean_pbp() %>%
-      add_qb_epa()
+      add_qb_epa() %>%
+      add_xyac()
 
     message("Appending new data to database...")
     RSQLite::dbWriteTable(connection, tblname, new_pbp, append = TRUE)

diff --git a/README.Rmd b/README.Rmd
@@ -7,7 +7,8 @@ output: github_document
 ```{r, include = FALSE}
 knitr::opts_chunk$set(
   collapse = TRUE,
-  comment = "#>"
+  comment = "#>",
+  fig.path = "man/figures/readme-"
 )
 ```
 
@@ -55,7 +56,65 @@ Even though `nflfastR` is very fast, **for historical games we recommend downloa
 
 ## nflfastR models
 
-`nflfastR` uses its own models for Expected Points, Win Probability, and Completion Probability. To read about the models, please see `vignette("nflfastR-models")`. For a more detailed description of Expected Points models, we highly recommend this paper [from the nflscrapR team located here](https://arxiv.org/pdf/1802.00998.pdf). 
+`nflfastR` uses its own models for Expected Points, Win Probability, Completion Probability, and Expected Yards After the Catch. To read about the models, please see `vignette("nflfastR-models")`. For a more detailed description of Expected Points models, we highly recommend this paper [from the nflscrapR team located here](https://arxiv.org/pdf/1802.00998.pdf). 
+
+Here is a visualization of the Expected Points model by down and yardline.
+
+``` {r epa-model, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 600, echo=FALSE}
+library(tidyverse)
+
+df <- map_df(2014:2019, ~{
+  readRDS(url(glue::glue('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{.x}.rds'))) %>%
+      filter(!is.na(posteam) & !is.na(ep), !is.na(down)) %>%
+      select(ep, down, yardline_100, air_yards, pass_location, cp)
+})
+
+df %>%
+  ggplot(aes(x = yardline_100, y = ep, color = as.factor(down))) + 
+  geom_smooth(size = 2) + 
+  labs(x = "Yards from opponent's end zone",
+       y = "Expected points value",
+       color = "Down",
+       title = "Expected Points by Yardline and Down") +
+  theme_bw() + 
+  scale_y_continuous(expand=c(0,0), breaks = scales::pretty_breaks(10)) + 
+  scale_x_continuous(expand=c(0,0), breaks = seq(from = 5, to = 95, by = 10)) +
+  theme(
+    plot.title = element_text(size = 18, hjust = 0.5),
+    plot.subtitle = element_text(size = 16, hjust = 0.5),
+    axis.title = element_text(size = 18),
+    axis.text = element_text(size = 16),
+    legend.text = element_text(size = 16),
+    legend.title = element_text(size = 16),
+    legend.position = c(.90, .80)) +
+    annotate("text", x = 14, y = -2.2, size = 3, label = "2014-2019 | Model: @nflfastR")
+```
+
+Here is a visualization of the Completion Probability model by air yards and pass direction.
+
+``` {r cp-model, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 600, echo=FALSE}
+df %>%
+  filter(!is.na(cp), between(air_yards, -5, 45)) %>%
+  mutate(pass_middle = if_else(pass_location == "middle", "Yes", "No")) %>%
+  ggplot(aes(x = air_yards, y = cp, color = as.factor(pass_middle))) + 
+  geom_smooth(size = 2) + 
+  labs(x = "Air yards",
+       y = "Expected completion %",
+       color = "Pass middle",
+       title = "Expected Completion % by Air Yards and Pass Direction") +
+  theme_bw() + 
+  scale_y_continuous(expand=c(0,0), breaks = scales::pretty_breaks(5)) + 
+  scale_x_continuous(expand=c(0,0)) +
+  theme(
+    plot.title = element_text(size = 18, hjust = 0.5),
+    plot.subtitle = element_text(size = 16, hjust = 0.5),
+    axis.title = element_text(size = 18),
+    axis.text = element_text(size = 16),
+    legend.text = element_text(size = 16),
+    legend.title = element_text(size = 16),
+    legend.position = c(.80, .80)) +
+    annotate("text", x = 2, y = .32, size = 3, label = "2014-2019 | Model: @nflfastR")
+```
 
 `nflfastR` includes two win probability models: one with and one without incorporating the pre-game spread.
 

diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ status](https://github.com/mrcaseb/nflfastR/workflows/R-CMD-check/badge.svg)](ht
 [![Travis build
 status](https://travis-ci.com/mrcaseb/nflfastR.svg?branch=master)](https://travis-ci.com/mrcaseb/nflfastR)
 [![Twitter
-Follow](https://img.shields.io/twitter/follow/nflfastR.svg?style=social)](https://twitter.com/nflfastR) 
+Follow](https://img.shields.io/twitter/follow/nflfastR.svg?style=social)](https://twitter.com/nflfastR)
 <!-- badges: end -->
 
 `nflfastR` is a set of functions to efficiently scrape NFL play-by-play
@@ -70,11 +70,22 @@ as .csv.gz, .parquet, or .rds.
 
 ## nflfastR models
 
-`nflfastR` uses its own models for Expected Points, Win Probability, and
-Completion Probability. To read about the models, please see
-`vignette("nflfastR-models")`. For a more detailed description of
-Expected Points models, we highly recommend this paper [from the
-nflscrapR team located here](https://arxiv.org/pdf/1802.00998.pdf).
+`nflfastR` uses its own models for Expected Points, Win Probability,
+Completion Probability, and Expected Yards After the Catch. To read
+about the models, please see `vignette("nflfastR-models")`. For a more
+detailed description of Expected Points models, we highly recommend this
+paper [from the nflscrapR team located
+here](https://arxiv.org/pdf/1802.00998.pdf).
+
+Here is a visualization of the Expected Points model by down and
+yardline.
+
+![](man/figures/readme-epa-model-1.png)<!-- -->
+
+Here is a visualization of the Completion Probability model by air yards
+and pass direction.
+
+![](man/figures/readme-cp-model-1.png)<!-- -->
 
 `nflfastR` includes two win probability models: one with and one without
 incorporating the pre-game spread.