Skip to content

Commit

Permalink
Push nflfastR 2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
mrcaseb committed Jun 13, 2020
1 parent df6c67d commit 9f70755
Show file tree
Hide file tree
Showing 43 changed files with 4,652 additions and 885 deletions.
14 changes: 6 additions & 8 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
Type: Package
Package: nflfastR
Title: Functions to Efficiently Scrape NFL Play by Play and
Roster Data
Version: 1.1.10
Title: Functions to Efficiently Scrape NFL Play by Play Data
Version: 2.0.0
Authors@R:
c(person(given = "Sebastian",
family = "Carl",
Expand All @@ -12,11 +11,10 @@ Authors@R:
role = c("cre", "aut"),
email = "[email protected]"))
Description: nflfastR is a set of functions to efficiently
scrape NFL play-by-play and roster data.
scrape NFL play-by-play data.
License: MIT + file LICENSE
Depends:
R (>= 3.5.0),
nflscrapR
R (>= 3.5.0)
Imports:
dplyr,
glue,
Expand All @@ -26,12 +24,12 @@ Imports:
lubridate,
magrittr,
mgcv,
progressr,
purrr,
stringr,
tibble,
tidyr,
tidyselect,
xml2
tidyselect
Suggests:
furrr,
future
Expand Down
6 changes: 2 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(add_qb_epa)
export(calculate_expected_points)
export(clean_pbp)
export(fast_scraper)
export(fast_scraper_clips)
export(fast_scraper_roster)
export(fast_scraper_schedules)
export(fix_fumbles)
importFrom(magrittr,"%>%")
121 changes: 121 additions & 0 deletions NEWS.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions R/data_documentation.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,8 @@
#' \item{team_logo_wikipedia}{Url to Team logo on wikipedia}
#' \item{team_logo_espn}{Url to higher quality logo on espn}
#' }
#' The colors are taken from Lee Sharpe's teamcolors.csv who has taken them from the
#' `teamcolors` package created by Ben Baumer and Gregory Matthews.
#' The Wikipeadia logo urls are taken from Lee Sharpe's logos.csv
#'
"teams_colors_logos"
68 changes: 68 additions & 0 deletions R/ep_calculator.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#' Compute expected points
#'
#' for provided plays. Returns the data with
#' probabilities of each scoring event and EP added. The following columns
#' must be present: season, week, home_team, posteam, roof (coded as 'open',
#' 'closed', or 'retractable'), half_seconds_remaining, yardline_100,
#' ydstogo, posteam_timeouts_remaining, defteam_timeouts_remaining
#'
#' @param pbp_data Play-by-play dataset to estimate expected points for.
#' @details Computes expected points for provided plays. Returns the data with
#' probabilities of each scoring event and EP added. The following columns
#' must be present: season, week, home_team, posteam, roof (coded as 'open',
#' 'closed', or 'retractable'), half_seconds_remaining, yardline_100,
#' ydstogo, posteam_timeouts_remaining, defteam_timeouts_remaining
#' @return The original pbp_data with the following columns appended to it:
#' \itemize{
#' \item{ep} - expected points.
#' \item{no_score_prob} - probability of no more scoring this half.
#' \item{opp_fg_prob} - probability next score opponent field goal this half.
#' \item{opp_safety_prob} - probability next score opponent safety this half.
#' \item{opp_td_prob} - probability of next score opponent touchdown this half.
#' \item{fg_prob} - probability next score field goal this half.
#' \item{safety_prob} - probability next score safety this half.
#' \item{td_prob} - probability text score touchdown this half.
#' }
#' @export

calculate_expected_points <- function(pbp_data) {

suppressWarnings(
model_data <- pbp_data %>%
#drop existing values of ep and the probs before making new ones
dplyr::select(-one_of(drop.cols)) %>%
make_model_mutations() %>%
ep_model_select()
)


preds <- as.data.frame(
matrix(stats::predict(ep_model, as.matrix(model_data)), ncol=7, byrow=TRUE)
)

colnames(preds) <- c("td_prob","opp_td_prob","fg_prob","opp_fg_prob",
"safety_prob","opp_safety_prob","no_score_prob")

preds <- preds %>%
dplyr::mutate(
ep =
(-3 * opp_fg_prob) +
(-2 * opp_safety_prob) +
(-7 * opp_td_prob) +
(3 * fg_prob) +
(2 * safety_prob) +
(7 * td_prob)
) %>%
dplyr::bind_cols(pbp_data)

return(preds)

}


drop.cols <- c(
'ep', 'td_prob', 'opp_td_prob', 'fg_prob', 'opp_fg_prob',
'safety_prob', 'opp_safety_prob', 'no_score_prob'
)


80 changes: 51 additions & 29 deletions R/helper_add_cp_cpoe.R
Original file line number Diff line number Diff line change
@@ -1,42 +1,35 @@
################################################################################
# Author: Ben Baldwin, Sebastian Carl
# Purpose: Function to add cp and cpoe variables
# Purpose: Function to add cp and cpoe variables.
# CP model created by Zach Feldman: https://github.com/z-feldman/Expected_Completion_NFL
# Code Style Guide: styler::tidyverse_style()
################################################################################

add_cp <- function(pbp) {
passes <- pbp %>%
dplyr::filter(complete_pass == 1 | incomplete_pass == 1 | interception == 1) %>%
#this step is necessary because NFL doesn't record receiver on incomplete passes prior to 2009
dplyr::mutate(receiver_player_name =
stringr::str_extract(desc, "(?<=((to)|(for))\\s[:digit:]{0,2}\\-{0,1})[A-Z][A-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?")) %>%
dplyr::filter(
!is.na(air_yards) & air_yards >= -15 & air_yards < 70 & !is.na(receiver_player_name) &
!is.na(pass_location) & season >= 2006
) %>%
dplyr::mutate(air_is_zero = dplyr::if_else(air_yards == 0, 1, 0)) %>%
dplyr::select(
game_id, play_id, complete_pass, air_yards, pass_location,
air_is_zero, yardline_100, ydstogo, down, season
) %>%
dplyr::mutate(
#since cpoe model uses season as a factor and there's no 2020 data to estimate the model
#pretend that seasons > 2019 are 2019
#if anyone has better ideas here, please get in touch!
season = dplyr::if_else(season > 2019, 2019, season)
)

if (!nrow(passes) == 0) {
passes$cp <- mgcv::predict.gam(cp_models, passes, type = "response")
passes_with_cp <- passes %>%
dplyr::select(game_id, play_id, cp)

#testing only
#pbp <- g

passes <- prepare_cp_data(pbp)

if (!nrow(passes %>% dplyr::filter(valid_pass == 1)) == 0) {

pred <- stats::predict(cp_model, as.matrix(passes %>% dplyr::select(-complete_pass, -valid_pass))) %>%
tibble::as_tibble() %>%
dplyr::rename(cp = value) %>%
dplyr::bind_cols(passes) %>%
dplyr::select(cp, valid_pass)

pbp <- pbp %>%
dplyr::left_join(passes_with_cp, by = c("game_id", "play_id")) %>%
dplyr::bind_cols(pred) %>%
dplyr::mutate(
cp = dplyr::if_else(!is.na(cp), cp, NA_real_),
cp = dplyr::if_else(
valid_pass == 1, cp, NA_real_
),
cpoe = dplyr::if_else(!is.na(cp), 100 * (complete_pass - cp), NA_real_)
)
) %>%
dplyr::select(-valid_pass)

message("added cp and cpoe")
} else {
pbp <- pbp %>%
Expand All @@ -51,3 +44,32 @@ add_cp <- function(pbp) {
}


### helper function for getting the data ready
prepare_cp_data <- function(pbp) {

# valid pass play: at least -15 air yards, less than 70 air yards, has intended receiver, has pass location
passes <- pbp %>%
dplyr::mutate(receiver_player_name =
stringr::str_extract(desc, "(?<=((to)|(for))\\s[:digit:]{0,2}\\-{0,1})[A-Z][A-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?"),
pass_middle = dplyr::if_else(pass_location == 'middle', 1, 0),
air_is_zero= dplyr::if_else(air_yards == 0,1,0),
distance_to_sticks = air_yards - ydstogo,
valid_pass = dplyr::if_else(
(complete_pass == 1 | incomplete_pass == 1 | interception == 1) &
!is.na(air_yards) & air_yards >= -15 & air_yards <70 & !is.na(receiver_player_name) & !is.na(pass_location),
1, 0
)
) %>%
dplyr::select(
complete_pass, air_yards, yardline_100, ydstogo,
down1, down2, down3, down4, air_is_zero, pass_middle,
era2, era3, era4, qb_hit, home, model_week,
outdoors, retractable, dome, distance_to_sticks, valid_pass
)

}





Loading

0 comments on commit 9f70755

Please sign in to comment.