diff --git a/.Rbuildignore b/.Rbuildignore index f1a3b1a0..02cce294 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -17,3 +17,7 @@ ^man/figures/header_github\.png$ ^man/figures/header_twitter\.png$ ^man/figures/nflfastR_logo_fillsize\.png$ +^cran-comments\.md$ +^CRAN-RELEASE$ +^man/figures/readme-cp-model-1\.png$ +^man/figures/readme-epa-model-1\.png$ diff --git a/DESCRIPTION b/DESCRIPTION index 4709e9ad..51d02e7a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: nflfastR Title: Functions to Efficiently Scrape NFL Play by Play Data -Version: 2.1.3 +Version: 2.2.0.9000 Authors@R: c(person(given = "Sebastian", family = "Carl", @@ -59,4 +59,3 @@ Suggests: Encoding: UTF-8 LazyData: true RoxygenNote: 7.1.1 - diff --git a/NAMESPACE b/NAMESPACE index 4761cbbd..3aa3a0db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(add_qb_epa) +export(add_xyac) export(calculate_expected_points) export(calculate_win_probability) export(clean_pbp) @@ -55,3 +56,4 @@ importFrom(tidyr,unnest) importFrom(tidyr,unnest_wider) importFrom(tidyselect,any_of) importFrom(tidyselect,matches) +importFrom(xgboost,getinfo) diff --git a/NEWS.md b/NEWS.md index ff703f1b..a7dd5ca7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +# nflfastR (development version) + +* Fix `add_xyac()` breaking with some old packages +* Fix `add_xyac()` calculations being wrong for some failed 4th downs +* Updated Readme with ep and cp model plots +* Updated `vignette("examples")` with the new `add_xyac()` function +* Added xYAC model to `vignette("nflfastR-models")` + +# nflfastR 2.2.0 + +* New function `add_xyac()` which adds the following columns associated with expected yards after +the catch (xYAC): `xyac_epa`, `xyac_success`, `xyac_fd`, `xyac_mean_yardage`, `xyac_median_yardage` + # nflfastR 2.1.3 * Fixed a bug in `series_success` caused by bad `drive` information provided by NFL diff --git a/R/helper_add_xyac.R b/R/helper_add_xyac.R new file mode 100644 index 00000000..cbdbda54 --- /dev/null +++ b/R/helper_add_xyac.R @@ -0,0 +1,217 @@ +################################################################################ +# Author: Ben Baldwin, Sebastian Carl +# Purpose: Function to add expected yac variables. +# Code Style Guide: styler::tidyverse_style() +################################################################################ +#' Add expected yards after completion (xyac) variables +#' +#' @param pbp is a Data frame of play-by-play data scraped using \code{\link{fast_scraper}}. +#' @details Build columns that capture what we should expect after the catch. +#' @return The input Data Frame of the parameter 'pbp' with the following columns +#' added: +#' \describe{ +#' \item{xyac_epa}{Expected value of EPA gained after the catch, starting from where the catch was made. Zero yards after the catch would be listed as zero EPA.} +#' \item{xyac_success}{Probability play earns positive EPA (relative to where play started) based on where ball was caught.} +#' \item{xyac_fd}{Probability play earns a first down based on where the ball was caught.} +#' \item{xyac_mean_yardage}{Average expected yards after the catch based on where the ball was caught.} +#' \item{xyac_median_yardage}{Median expected yards after the catch based on where the ball was caught.} +#' } +#' @importFrom rlang .data +#' @importFrom xgboost getinfo +#' @export +add_xyac <- function(pbp) { + + # testing only + # pbp <- g + + pbp <- pbp %>% dplyr::select(-tidyselect::any_of(drop.cols.xyac)) + + # for joining at the end + pbp <- pbp %>% + dplyr::mutate(index = 1:dplyr::n()) + + # prepare_xyac_data helper function shown below + passes <- prepare_xyac_data(pbp) %>% + filter(.data$valid_pass == 1, .data$distance_to_goal != 0) + + if (!nrow(passes) == 0) { + # initialize xyac_model to avoid R CMD check note + xyac_model <- NULL + suppressWarnings( + # load the model from github because it is too big for the package + try( + load(url("https://github.com/guga31bb/nflfastR-data/blob/master/models/xyac_model.Rdata?raw=true")), + silent = TRUE + ) + ) + + if (!is.null(xyac_model)) { + xyac_vars <- + stats::predict( + xyac_model, + as.matrix(passes %>% xyac_model_select()) + ) %>% + tibble::as_tibble() %>% + dplyr::rename(prob = "value") %>% + dplyr::bind_cols( + purrr::map_dfr(seq_along(passes$index), function(x) { + tibble::tibble( + "yac" = -5:70, + "index" = passes$index[[x]], + "distance_to_goal" = passes$distance_to_goal[[x]], + "season" = passes$season[[x]], + "week" = passes$week[[x]], + "home_team" = passes$home_team[[x]], + "posteam" = passes$posteam[[x]], + "roof" = passes$roof[[x]], + "half_seconds_remaining" = dplyr::if_else( + passes$half_seconds_remaining[[x]] <= 6, + 0, + passes$half_seconds_remaining[[x]] - 6 + ), + "down" = as.integer(passes$down[[x]]), + "ydstogo" = as.integer(passes$ydstogo[[x]]), + "original_ydstogo" = as.integer(passes$ydstogo[[x]]), + "posteam_timeouts_remaining" = passes$posteam_timeouts_remaining[[x]], + "defteam_timeouts_remaining" = passes$defteam_timeouts_remaining[[x]], + "original_spot" = passes$yardline_100[[x]], + "original_ep" = passes$ep[[x]], + "air_epa" = passes$air_epa[[x]], + "air_yards" = passes$air_yards[[x]] + ) + }) + ) %>% + dplyr::group_by(.data$index) %>% + dplyr::mutate( + max_loss = dplyr::if_else(.data$distance_to_goal < 95, -5, .data$distance_to_goal - 99), + max_gain = dplyr::if_else(.data$distance_to_goal > 70, 70, .data$distance_to_goal), + cum_prob = cumsum(.data$prob), + prob = dplyr::case_when( + # truncate probs at loss greater than max loss + .data$yac == .data$max_loss ~ .data$cum_prob, + # same for gains bigger than possible + .data$yac == .data$max_gain ~ 1 - dplyr::lag(.data$cum_prob, 1), + TRUE ~ .data$prob + ), + # get end result for each possibility + yardline_100 = .data$distance_to_goal - .data$yac + ) %>% + dplyr::filter(.data$yac >= .data$max_loss, .data$yac <= .data$max_gain) %>% + dplyr::select(-.data$cum_prob) %>% + dplyr::mutate( + posteam_timeouts_pre = .data$posteam_timeouts_remaining, + defeam_timeouts_pre = .data$defteam_timeouts_remaining, + gain = .data$original_spot - .data$yardline_100, + turnover = dplyr::if_else(.data$down == 4 & .data$gain < .data$ydstogo, as.integer(1), as.integer(0)), + down = dplyr::if_else(.data$gain >= .data$ydstogo, 1, .data$down + 1), + ydstogo = dplyr::if_else(.data$gain >= .data$ydstogo, 10, .data$ydstogo - .data$gain), + # ydstogo can't be bigger than yardline + ydstogo = dplyr::if_else(.data$ydstogo >= .data$yardline_100, as.integer(.data$yardline_100), as.integer(.data$ydstogo)), + # possession change if 4th down failed + down = dplyr::if_else(.data$turnover == 1, as.integer(1), as.integer(.data$down)), + ydstogo = dplyr::if_else(.data$turnover == 1, as.integer(10), as.integer(.data$ydstogo)), + yardline_100 = dplyr::if_else(.data$turnover == 1, as.integer(100 - .data$yardline_100), as.integer(.data$yardline_100)), + posteam_timeouts_remaining = dplyr::if_else(.data$turnover == 1, + .data$defeam_timeouts_pre, + .data$posteam_timeouts_pre), + defteam_timeouts_remaining = dplyr::if_else(.data$turnover == 1, + .data$posteam_timeouts_pre, + .data$defeam_timeouts_pre) + ) %>% + dplyr::ungroup() %>% + nflfastR::calculate_expected_points() %>% + dplyr::group_by(.data$index) %>% + dplyr::mutate( + ep = dplyr::case_when( + .data$yardline_100 == 0 ~ 7, + .data$turnover == 1 ~ -1 * .data$ep, + TRUE ~ ep + ), + epa = .data$ep - .data$original_ep, + wt_epa = .data$epa * .data$prob, + wt_yardln = .data$yardline_100 * .data$prob, + med = dplyr::if_else( + cumsum(.data$prob) > .5 & dplyr::lag(cumsum(.data$prob) < .5), .data$yac, as.integer(0) + ) + ) %>% + dplyr::summarise( + xyac_epa = sum(.data$wt_epa) - dplyr::first(.data$air_epa), + xyac_mean_yardage = (dplyr::first(.data$original_spot) - dplyr::first(.data$air_yards)) - sum(.data$wt_yardln), + xyac_median_yardage = max(.data$med), + xyac_success = sum((.data$ep > .data$original_ep) * .data$prob), + xyac_fd = sum((.data$gain >= .data$original_ydstogo) * .data$prob), + .groups = "drop_last" + ) %>% + dplyr::ungroup() + + pbp <- pbp %>% + dplyr::left_join(xyac_vars, by = "index") %>% + dplyr::select(-.data$index) + + message("added xyac variables") + + } else {# means xyac_model isn't available + message("This function needs to download the model data from GitHub. Please check your Internet connection and try again!") + pbp <- pbp %>% dplyr::select(-.data$index) + } + } else {# means no valid pass plays in the pbp + pbp <- pbp %>% + dplyr::mutate( + xyac_epa = NA_real_, + xyac_mean_yardage = NA_real_, + xyac_median_yardage = NA_real_, + xyac_success = NA_real_, + xyac_fd = NA_real_ + ) %>% + dplyr::select(-.data$index) + message("No non-NA values for xyac calculation detected. xyac variables set to NA") + } + + # on old versions of dplyr, a .group column is created, which we don't want + pbp <- pbp %>% dplyr::select(-tidyselect::any_of(".group")) + + return(pbp) +} + + +### helper function for getting the data ready +prepare_xyac_data <- function(pbp) { + + # valid pass play: at least -15 air yards, less than 70 air yards, has intended receiver, has pass location + passes <- pbp %>% + make_model_mutations() %>% + dplyr::mutate( + receiver_player_name = + stringr::str_extract(.data$desc, "(?<=((to)|(for))\\s[:digit:]{0,2}\\-{0,1})[A-Z][A-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?"), + pass_middle = dplyr::if_else(.data$pass_location == "middle", 1, 0), + air_is_zero = dplyr::if_else(.data$air_yards == 0, 1, 0), + distance_to_sticks = .data$air_yards - .data$ydstogo, + distance_to_goal = .data$yardline_100 - .data$air_yards, + valid_pass = dplyr::if_else( + (.data$complete_pass == 1 | .data$incomplete_pass == 1 | .data$interception == 1) & + !is.na(.data$air_yards) & .data$air_yards >= -15 & .data$air_yards < 70 & + !is.na(.data$receiver_player_name) & !is.na(.data$pass_location), + 1, 0 + ) + ) + return(passes) +} + +### another helper function for getting the data ready +xyac_model_select <- function(pbp) { + pbp %>% + dplyr::select( + "air_yards", "yardline_100", "ydstogo", "distance_to_goal", + "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle", + "era2", "era3", "era4", "qb_hit", "home", + "outdoors", "retractable", "dome", "distance_to_sticks" + ) +} + +# These columns are being generated by add_xyac and the function tries to drop +# them in case it is being used on a pbp dataset where the columns already exist +drop.cols.xyac <- c( + "xyac_epa", "xyac_mean_yardage", "xyac_median_yardage", "xyac_success", "xyac_fd", ".groups" +) + + diff --git a/R/helper_database_functions.R b/R/helper_database_functions.R index dcde9570..13b4b611 100644 --- a/R/helper_database_functions.R +++ b/R/helper_database_functions.R @@ -78,7 +78,8 @@ update_db <- function(dbdir = ".", message(glue::glue("Starting download of {length(missing)} games ...")) new_pbp <- fast_scraper(missing, pp = is_installed_furrr) %>% clean_pbp() %>% - add_qb_epa() + add_qb_epa() %>% + add_xyac() message("Appending new data to database...") RSQLite::dbWriteTable(connection, tblname, new_pbp, append = TRUE) diff --git a/README.Rmd b/README.Rmd index 0c4de9fb..689dc6eb 100644 --- a/README.Rmd +++ b/README.Rmd @@ -7,7 +7,8 @@ output: github_document ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, - comment = "#>" + comment = "#>", + fig.path = "man/figures/readme-" ) ``` @@ -55,7 +56,65 @@ Even though `nflfastR` is very fast, **for historical games we recommend downloa ## nflfastR models -`nflfastR` uses its own models for Expected Points, Win Probability, and Completion Probability. To read about the models, please see `vignette("nflfastR-models")`. For a more detailed description of Expected Points models, we highly recommend this paper [from the nflscrapR team located here](https://arxiv.org/pdf/1802.00998.pdf). +`nflfastR` uses its own models for Expected Points, Win Probability, Completion Probability, and Expected Yards After the Catch. To read about the models, please see `vignette("nflfastR-models")`. For a more detailed description of Expected Points models, we highly recommend this paper [from the nflscrapR team located here](https://arxiv.org/pdf/1802.00998.pdf). + +Here is a visualization of the Expected Points model by down and yardline. + +``` {r epa-model, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 600, echo=FALSE} +library(tidyverse) + +df <- map_df(2014:2019, ~{ + readRDS(url(glue::glue('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{.x}.rds'))) %>% + filter(!is.na(posteam) & !is.na(ep), !is.na(down)) %>% + select(ep, down, yardline_100, air_yards, pass_location, cp) +}) + +df %>% + ggplot(aes(x = yardline_100, y = ep, color = as.factor(down))) + + geom_smooth(size = 2) + + labs(x = "Yards from opponent's end zone", + y = "Expected points value", + color = "Down", + title = "Expected Points by Yardline and Down") + + theme_bw() + + scale_y_continuous(expand=c(0,0), breaks = scales::pretty_breaks(10)) + + scale_x_continuous(expand=c(0,0), breaks = seq(from = 5, to = 95, by = 10)) + + theme( + plot.title = element_text(size = 18, hjust = 0.5), + plot.subtitle = element_text(size = 16, hjust = 0.5), + axis.title = element_text(size = 18), + axis.text = element_text(size = 16), + legend.text = element_text(size = 16), + legend.title = element_text(size = 16), + legend.position = c(.90, .80)) + + annotate("text", x = 14, y = -2.2, size = 3, label = "2014-2019 | Model: @nflfastR") +``` + +Here is a visualization of the Completion Probability model by air yards and pass direction. + +``` {r cp-model, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 600, echo=FALSE} +df %>% + filter(!is.na(cp), between(air_yards, -5, 45)) %>% + mutate(pass_middle = if_else(pass_location == "middle", "Yes", "No")) %>% + ggplot(aes(x = air_yards, y = cp, color = as.factor(pass_middle))) + + geom_smooth(size = 2) + + labs(x = "Air yards", + y = "Expected completion %", + color = "Pass middle", + title = "Expected Completion % by Air Yards and Pass Direction") + + theme_bw() + + scale_y_continuous(expand=c(0,0), breaks = scales::pretty_breaks(5)) + + scale_x_continuous(expand=c(0,0)) + + theme( + plot.title = element_text(size = 18, hjust = 0.5), + plot.subtitle = element_text(size = 16, hjust = 0.5), + axis.title = element_text(size = 18), + axis.text = element_text(size = 16), + legend.text = element_text(size = 16), + legend.title = element_text(size = 16), + legend.position = c(.80, .80)) + + annotate("text", x = 2, y = .32, size = 3, label = "2014-2019 | Model: @nflfastR") +``` `nflfastR` includes two win probability models: one with and one without incorporating the pre-game spread. diff --git a/README.md b/README.md index a4a324bf..e07e7f22 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ status](https://github.com/mrcaseb/nflfastR/workflows/R-CMD-check/badge.svg)](ht [![Travis build status](https://travis-ci.com/mrcaseb/nflfastR.svg?branch=master)](https://travis-ci.com/mrcaseb/nflfastR) [![Twitter -Follow](https://img.shields.io/twitter/follow/nflfastR.svg?style=social)](https://twitter.com/nflfastR) +Follow](https://img.shields.io/twitter/follow/nflfastR.svg?style=social)](https://twitter.com/nflfastR) `nflfastR` is a set of functions to efficiently scrape NFL play-by-play @@ -70,11 +70,22 @@ as .csv.gz, .parquet, or .rds. ## nflfastR models -`nflfastR` uses its own models for Expected Points, Win Probability, and -Completion Probability. To read about the models, please see -`vignette("nflfastR-models")`. For a more detailed description of -Expected Points models, we highly recommend this paper [from the -nflscrapR team located here](https://arxiv.org/pdf/1802.00998.pdf). +`nflfastR` uses its own models for Expected Points, Win Probability, +Completion Probability, and Expected Yards After the Catch. To read +about the models, please see `vignette("nflfastR-models")`. For a more +detailed description of Expected Points models, we highly recommend this +paper [from the nflscrapR team located +here](https://arxiv.org/pdf/1802.00998.pdf). + +Here is a visualization of the Expected Points model by down and +yardline. + +![](man/figures/readme-epa-model-1.png) + +Here is a visualization of the Completion Probability model by air yards +and pass direction. + +![](man/figures/readme-cp-model-1.png) `nflfastR` includes two win probability models: one with and one without incorporating the pre-game spread. diff --git a/man/add_xyac.Rd b/man/add_xyac.Rd new file mode 100644 index 00000000..055fee12 --- /dev/null +++ b/man/add_xyac.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper_add_xyac.R +\name{add_xyac} +\alias{add_xyac} +\title{Add expected yards after completion (xyac) variables} +\usage{ +add_xyac(pbp) +} +\arguments{ +\item{pbp}{is a Data frame of play-by-play data scraped using \code{\link{fast_scraper}}.} +} +\value{ +The input Data Frame of the parameter 'pbp' with the following columns +added: +\describe{ +\item{xyac_epa}{Expected value of EPA gained after the catch, starting from where the catch was made. Zero yards after the catch would be listed as zero EPA.} +\item{xyac_success}{Probability play earns positive EPA (relative to where play started) based on where ball was caught.} +\item{xyac_fd}{Probability play earns a first down based on where the ball was caught.} +\item{xyac_mean_yardage}{Average expected yards after the catch based on where the ball was caught.} +\item{xyac_median_yardage}{Median expected yards after the catch based on where the ball was caught.} +} +} +\description{ +Add expected yards after completion (xyac) variables +} +\details{ +Build columns that capture what we should expect after the catch. +} diff --git a/man/figures/readme-cp-model-1.png b/man/figures/readme-cp-model-1.png new file mode 100644 index 00000000..bdc94a2b Binary files /dev/null and b/man/figures/readme-cp-model-1.png differ diff --git a/man/figures/readme-epa-model-1.png b/man/figures/readme-epa-model-1.png new file mode 100644 index 00000000..aa1ddf9b Binary files /dev/null and b/man/figures/readme-epa-model-1.png differ diff --git a/vignettes/examples.Rmd b/vignettes/examples.Rmd index 1ccc793b..9ca64056 100644 --- a/vignettes/examples.Rmd +++ b/vignettes/examples.Rmd @@ -236,7 +236,7 @@ There's exactly one function in `nflfastR` that works with databases: `update_db Let's say I just want to dump a database into the current working directory. Here we go! -``` {r} +``` {r create-db} update_db() ``` @@ -244,7 +244,7 @@ This created a database in the current directory called `pbp_db`. Wait, that's it? That's it! What if it's partway through the season and you want to make sure all the new games are added to the database? What do you run? `update_db()`! (just make sure you're in the directory the database is saved in or you supply the right file path) -``` {r} +``` {r update-db} update_db() ``` @@ -310,7 +310,50 @@ dbDisconnect(connection) For more details on using a database with `nflfastR`, see [Thomas Mock's life-changing post here](https://themockup.blog/posts/2019-04-28-nflfastr-dbplyr-rsqlite/). -# Example 9: Working with roster and position data +# Example 9: working with the expected yards after catch model + +The variables in `xyac` are as follows: + +* `xyac_epa`: The expected value of EPA gained after the catch, **starting from where the catch was made**. +* `xyac_success`: The probability the play earns positive EPA (relative to where play started) based on where ball was caught. +* `xyac_fd`: Probability play earns a first down based on where the ball was caught. +* `xyac_mean_yardage` and `xyac_median_yardage`: Average and median expected yards after the catch based on where the ball was caught. + +Some other notes: + +* `epa` = `air_epa` + `yac_epa`, where `air_epa` is the EPA associated with a catch at the target location. If a receiver loses a fumble, it is removed from his `yac_epa` +* Expected value of EPA at catch point = `air_epa` + `xyac_epa` +* So if we want to get YAC EPA over expected, we need to compare `yac_epa` to `xyac_epa`, as in the example below +* To get first downs over expected, we could compare `first_down` to `xyac_fd` +* These fields are populated for all pass attempts, whether caught or not, but restrict to completed passes when measuring, for example, YAC EPA over expected +* The expected YAC EPA model doesn't take receiver fumbles into account, so actual minus expected YAC is slightly negative due to fumbles happening + +Let's create measures for EPA and first downs over expected in 2015: + +``` {r ex9-xyac, warning = FALSE, message = FALSE} +games_2015 %>% + group_by(receiver, receiver_id, posteam) %>% + mutate(tgt = sum(complete_pass + incomplete_pass)) %>% + filter(tgt >= 50) %>% + filter(complete_pass == 1, air_yards < yardline_100, !is.na(xyac_epa)) %>% + summarize( + epa_oe = mean(yac_epa - xyac_epa), + actual_fd = mean(first_down), + expected_fd = mean(xyac_fd), + fd_oe = mean(first_down - xyac_fd), + rec = n() + ) %>% + ungroup() %>% + select(receiver, posteam, actual_fd, expected_fd, fd_oe, epa_oe, rec) %>% + arrange(-epa_oe) %>% + head(10) %>% + knitr::kable(digits = 3) + +``` + +The presence of so many running backs on this list suggests that even though it takes into account target depth and pass direction, the model doesn't do a great job capturing space. Alternatively, running backs might be better at generating yards after the catch since running with the football is their primary role. + +# Example 10: Working with roster and position data This section used to contain an example of working with roster data. Unfortunately, we have not found a way to obtain roster data that can be joined to the new play by play, so for now, it is empty. We would like to be able to get position data but haven't yet. diff --git a/vignettes/nflfastR-models.Rmd b/vignettes/nflfastR-models.Rmd index 51713e01..cd0ca6f3 100644 --- a/vignettes/nflfastR-models.Rmd +++ b/vignettes/nflfastR-models.Rmd @@ -46,7 +46,7 @@ The EP, WP, and CP models are trained using xgboost, which uses training data to * Whether possession team is at home * [Model with Vegas line only: point spread * log(3600 / (50 + (seconds elapsed in game))] -**CP model features** +**CP and expected yards after the catch model features** * Yard line * Whether possession team is at home @@ -59,6 +59,7 @@ The EP, WP, and CP models are trained using xgboost, which uses training data to * Whether air yards is 0 (probably unnecessary with tree-based method and a relic from earlier models where it was included because completion percentage is much lower for 0 air yard passes) * Pass location (binary: middle or not middle) * Whether quarterback was hit on the play +* For xyac model only: how far away the goal line is when the ball is caught ## EP Model Calibration Results @@ -705,3 +706,160 @@ cp_cv_cal_error <- cp_cv_loso_calibration_results %>% round(with(cp_cv_cal_error, weighted.mean(weight_cal_error, n_complete)), 4) ``` + + + +## xYAC Model Calibration Results + +By now, the process should be familiar. +``` {r xyac-setup, results = 'hide'} +# pbp <- readRDS(url('https://github.com/guga31bb/nflfastR-data/blob/master/models/cal_data.rds?raw=true')) +pbp_data <- readRDS('../../nflfastR-data/models/cal_data.rds') + +model_data <- pbp_data %>% + make_model_mutations() %>% + filter(season >= 2006, complete_pass == 1, !is.na(yards_after_catch), + yards_after_catch >= -20, air_yards < yardline_100) %>% + dplyr::mutate( + distance_to_goal = yardline_100 - air_yards, + pass_middle = dplyr::if_else(pass_location == 'middle', 1, 0), + air_is_zero= dplyr::if_else(air_yards == 0,1,0), + distance_to_sticks = air_yards - ydstogo, + yards_after_catch = dplyr::case_when( + yards_after_catch < -5 ~ -5, + yards_after_catch > 70 ~ 70, + TRUE ~ yards_after_catch + ), + label = yards_after_catch + 5 + ) %>% + dplyr::filter(!is.na(air_yards) & air_yards >= -15 & air_yards <70 & !is.na(pass_location)) %>% + dplyr::select( + season, label, air_yards, yardline_100, ydstogo, distance_to_goal, + down1, down2, down3, down4, air_is_zero, pass_middle, + era2, era3, era4, qb_hit, home, + outdoors, retractable, dome, distance_to_sticks + ) + + +# nrounds = 500 +nrounds = 500 +params <- + list( + booster = "gbtree", + objective = "multi:softprob", + eval_metric = c("mlogloss"), + num_class = 76, + eta = .025, + gamma = 2, + subsample=0.8, + colsample_bytree=0.8, + max_depth = 4, + min_child_weight = 1 + ) + + +cv_results <- map_dfr(2006:2019, function(x) { + + test_data <- model_data %>% + filter(season == x) %>% + select(-season) + train_data <- model_data %>% + filter(season != x) %>% + select(-season) + + full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-label)), label = train_data$label) + xyac_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) + + preds <- as.data.frame( + matrix(predict(xyac_model, as.matrix(test_data %>% select(-label))), ncol=76, byrow=TRUE) + ) + + cv_data <- bind_cols(test_data, preds) %>% + mutate(season = x) + return(cv_data) + +}) +``` + +``` {r xyac-bins} +plot <- cv_results %>% + select(label, air_yards, starts_with("V")) %>% + mutate( + loss = V1 + V2 + V3 + V4 + V5 + V6, + short_gain = V7 + V8 + V9 + V10 + V11, + med_gain = V12 + V13 + V14 + V15 + V16, + long_gain = select(., V17:V76) %>% rowSums(), + outcome = case_when( + label <= 5 ~ "loss", + between(label, 6, 10) ~ "short_gain", + between(label, 11, 15) ~ "med_gain", + label > 15 ~ "long_gain" + ), + distance = case_when( + air_yards < 5 ~ "1: Short", + air_yards >= 5 ~ "2: Long" + )) %>% + select(outcome, distance, loss, short_gain, med_gain, long_gain) %>% + pivot_longer(-c(outcome, distance), names_to = 'type', values_to = 'pred_prob') %>% + mutate(bin_pred_prob = round(pred_prob / 0.05) * .05) %>% + group_by(type, distance, bin_pred_prob) %>% + mutate(correct = if_else(outcome == type, 1, 0)) %>% + summarize(n_plays = n(), + n_outcome = sum(correct), + bin_actual_prob = n_outcome / n_plays) + +ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), + lab = c("More times\nthan expected", "Fewer times\nthan expected") + ) +``` + +```{r plot-xyac, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 700} + +plot %>% + ungroup() %>% + mutate(type = fct_relevel(type, + "loss", "short_gain", + "med_gain", "long_gain" + ), + type = fct_recode(type, + "Loss/ no gain" = "loss", + "1-5 yards" = "short_gain", + "6-10 yards" = "med_gain", + "11+ yards" = "long_gain")) %>% + filter(n_plays > 15) %>% + ggplot() + + geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + + geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + + geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + + coord_equal() + + scale_x_continuous(limits = c(0,1)) + + scale_y_continuous(limits = c(0,1)) + + labs(size = "Number of plays", + x = "Estimated yards after catch", + y = "Observed yards after catch") + + geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 2) + + theme_bw() + + theme(plot.title = element_text(hjust = 0.5), + strip.background = element_blank(), + strip.text = element_text(size = 12), + axis.title = element_text(size = 12), + axis.text.y = element_text(size = 12), + axis.text.x = element_text(size = 10, angle = 90), + legend.title = element_text(size = 12), + legend.text = element_text(size = 12), + legend.position = c(1.5, .05), legend.justification = c(1, 0)) + + facet_wrap(~ distance + type, ncol = 4) +``` + +Calibration error: +```{r xyac-error} +xyac_cv_cal_error <- plot %>% + ungroup() %>% + mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% + group_by(distance) %>% + summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), + n_outcome = sum(n_outcome, na.rm = TRUE)) + +round(with(xyac_cv_cal_error, weighted.mean(weight_cal_error, n_outcome)), 4) +``` +