diff --git a/DESCRIPTION b/DESCRIPTION index da3dbaa1..8fb1187e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: nflfastR Title: Functions to Efficiently Scrape NFL Play by Play Data -Version: 2.0.2 +Version: 2.0.3 Authors@R: c(person(given = "Sebastian", family = "Carl", diff --git a/NEWS.md b/NEWS.md index af36e36a..4cae0b6f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# nflfastR 2.0.3 + +* Fix for NFL providing plays out of order +* Fix for series not incrementing following defensive TD + # nflfastR 2.0.2 * Fixed a bug in the series and series success calculations caused by timeouts diff --git a/R/helper_add_nflscrapr_mutations.R b/R/helper_add_nflscrapr_mutations.R index d517b47d..ec460230 100644 --- a/R/helper_add_nflscrapr_mutations.R +++ b/R/helper_add_nflscrapr_mutations.R @@ -11,13 +11,23 @@ add_nflscrapr_mutations <- function(pbp) { out <- pbp %>% - dplyr::mutate(index = 1 : dplyr::n()) %>% # to re-sort after removing duplicates + dplyr::mutate(index = 1 : dplyr::n()) %>% # remove duplicate plays. can't do this with play_id because duplicate plays # sometimes have different play_ids dplyr::group_by(game_id, quarter, time, play_description) %>% dplyr::slice(1) %>% dplyr::ungroup() %>% - dplyr::arrange(index) %>% + dplyr::mutate( + # Modify the time column for the quarter end: + time = dplyr::if_else(quarter_end == 1, "00:00", time), + time = dplyr::if_else(play_description == 'GAME', "15:00", time), + # Create a column with the time in seconds remaining for the quarter: + quarter_seconds_remaining = lubridate::period_to_seconds(lubridate::ms(time)) + ) %>% + #put plays in the right order + dplyr::group_by(game_id) %>% + dplyr::arrange(quarter, -quarter_seconds_remaining, index) %>% + dplyr::ungroup() %>% dplyr::mutate( # Fill in the rows with missing posteam with the lag: posteam = dplyr::if_else( @@ -74,10 +84,6 @@ add_nflscrapr_mutations <- function(pbp) { yardline_side == posteam | yardline == "MID 50", 100 - yardline_number, yardline_number ), - # Modify the time column for the quarter end: - time = dplyr::if_else(quarter_end == 1, "00:00", time), - # Create a column with the time in seconds remaining for the quarter: - quarter_seconds_remaining = lubridate::period_to_seconds(lubridate::ms(time)), # Create a column with the time in seconds remaining for each half: half_seconds_remaining = dplyr::if_else( quarter %in% c(1, 3), diff --git a/R/helper_add_series_data.R b/R/helper_add_series_data.R index 720f5b4e..984fcbf9 100644 --- a/R/helper_add_series_data.R +++ b/R/helper_add_series_data.R @@ -23,8 +23,11 @@ add_series_data <- function(pbp) { # AND first down after change of possesion (-> drivenumber increases) # we don't want a first down being indicated for XP, 2P, KO first_down = dplyr::if_else( - (first_down_rush == 1 | first_down_pass == 1 | - first_down_penalty == 1 | + #earn first down + (first_down_rush == 1 | first_down_pass == 1 | first_down_penalty == 1 | + #defensive TD + (touchdown == 1 & td_team != posteam) | + #drive changes (drive < dplyr::lead(drive) | (drive < dplyr::lead(drive, 2) & is.na(dplyr::lead(drive)))) ) & (extra_point_attempt == 0 & two_point_attempt == 0 & kickoff_attempt == 0), @@ -49,7 +52,7 @@ add_series_data <- function(pbp) { ), series_success = dplyr::case_when( is.na(series) | qb_kneel == 1 | qb_spike == 1 ~ NA_real_, - touchdown == 1 | first_down_rush == 1 | first_down_pass == 1 | + (touchdown == 1 & td_team == posteam) | first_down_rush == 1 | first_down_pass == 1 | first_down_penalty == 1 ~ 1, punt_attempt == 1 | interception == 1 | fumble_lost == 1 | fourth_down_failed == 1 | field_goal_attempt == 1 ~ 0, diff --git a/README.Rmd b/README.Rmd index c6dfefc6..0b4ae41a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -17,9 +17,9 @@ knitr::opts_chunk$set( ) ``` - - +![GitHub release (latest by date)](https://img.shields.io/github/v/release/mrcaseb/nflfastR?label=latest%20release) +[![Twitter Follow](https://img.shields.io/twitter/follow/nflfastR.svg?style=social)](https://twitter.com/nflfastR) `nflfastR` is a set of functions to efficiently scrape NFL play-by-play data. `nflfastR` expands upon the features of nflscrapR: @@ -56,7 +56,7 @@ library(tidyverse) The functionality of `nflscrapR` can be duplicated by using `fast_scraper` This obtains the same information contained in `nflscrapR` (plus some extra) but much more quickly. To compare to `nflscrapR`, we use their data repository as the program no longer functions now that the NFL has taken down the old Gamecenter feed. Note that EP differs from nflscrapR as we use a newer era-adjusted model (more on this below). -This example also uses the built-in function `clean_pbp` to create a "name' column for the primary player involved (the QB on pass play or ball-carrier on run play). +This example also uses the built-in function `clean_pbp` to create a 'name' column for the primary player involved (the QB on pass play or ball-carrier on run play). ``` {r ex1-nflscrapR, warning = FALSE, message = FALSE} read_csv(url('https://github.com/ryurko/nflscrapR-data/blob/master/play_by_play_data/regular_season/reg_pbp_2019.csv?raw=true')) %>% @@ -104,7 +104,7 @@ games_2009 %>% filter(!is.na(cpoe)) %>% group_by(passer_player_name) %>% When scraping from the default RS feed, drive results are automatically included. Let's look at how much more likely teams were to score starting from 1st & 10 at their own 20 yard line in 2015 (the last year before touchbacks on kickoffs changed to the 25) than in 2000. ``` {r ex4, warning = FALSE, message = FALSE} games_2000 <- readRDS(url('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_2000.rds')) -games_2015 <-readRDS(url('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_2015.rds')) +games_2015 <- readRDS(url('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_2015.rds')) pbp <- bind_rows(games_2000, games_2015) @@ -159,7 +159,7 @@ The `clean_pbp()` function does a lot of work cleaning up player names and IDs f ## `nflfastR` models -`nflfastR` uses its own models for Expected Points, Win Probability, and Completion Percentage. To read about the models, [please see here](https://github.com/mrcaseb/nflfastR/blob/master/data-raw/MODEL-README.md). For a more detailed description of Expected Points models, we highly recommend this paper [from the nflscrapR team located here](https://arxiv.org/pdf/1802.00998.pdf). +`nflfastR` uses its own models for Expected Points, Win Probability, and Completion Probability. To read about the models, [please see here](https://github.com/mrcaseb/nflfastR/blob/master/data-raw/MODEL-README.md). For a more detailed description of Expected Points models, we highly recommend this paper [from the nflscrapR team located here](https://arxiv.org/pdf/1802.00998.pdf). `nflfastR` includes two win probability models: one with and one without incorporating the pre-game spread. @@ -184,10 +184,10 @@ Even though `nflfastR` is very fast, **for historical games we recommend downloa ## Special thanks * To [Nick Shoemaker](https://twitter.com/WeightRoomShoe) for [finding and making available JSON-formatted NFL play-by-play back to 1999](https://github.com/CroppedClamp/nfl_pbps) (`nflfastR` uses this source for 1999-2010) +* To [Lau Sze Yui](https://twitter.com/903124S) for developing a scraping function to access JSON-formatted NFL play-by-play beginning in 2011. * To [Lee Sharpe](https://twitter.com/LeeSharpeNFL) for curating a resource for game information * To [Timo Riske](https://twitter.com/PFF_Moo), [Lau Sze Yui](https://twitter.com/903124S), [Sean Clement](https://twitter.com/SeanfromSeabeck), and [Daniel Houston](https://twitter.com/CowboysStats) for many helpful discussions regarding the development of the new `nflfastR` models -* To [Zach Feldman](https://twitter.com/ZachFeldman3) and [Josh Hermsmeyer](https://twitter.com/friscojosh) for many helpful discussions about CPOE models +* To [Zach Feldman](https://twitter.com/ZachFeldman3) and [Josh Hermsmeyer](https://twitter.com/friscojosh) for many helpful discussions about CPOE models as well as [Peter Owen](https://twitter.com/JSmoovesBrekkie) for [many helpful suggestions for the CP model](https://twitter.com/JSmoovesBrekkie/status/1268885950626623490) * To [Florian Schmitt](https://twitter.com/Flosch1006) for the logo design -* To [Peter Owen](https://twitter.com/JSmoovesBrekkie) for [many helpful suggestions for the CP model](https://twitter.com/JSmoovesBrekkie/status/1268885950626623490) * The many users who found and reported bugs in `nflfastR` 1.0 * And of course, the original [`nflscrapR`](https://github.com/maksimhorowitz/nflscrapR) team, Maksim Horowitz, Ronald Yurko, and Samuel Ventura, whose work represented a dramatic step forward for the state of public NFL research diff --git a/README.md b/README.md index bb937d19..833e8c2e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,13 @@ nflfastR ================ + + +![GitHub release (latest by +date)](https://img.shields.io/github/v/release/mrcaseb/nflfastR?label=latest%20release) +[![Twitter Follow](https://img.shields.io/twitter/follow/nflfastR.svg?style=social)](https://twitter.com/nflfastR) + + - [Installation](#installation) - [Usage](#usage) - [Example 1: replicate `nflscrapR` with @@ -24,9 +31,6 @@ nflfastR - - - `nflfastR` is a set of functions to efficiently scrape NFL play-by-play data. `nflfastR` expands upon the features of nflscrapR: @@ -78,7 +82,7 @@ that EP differs from nflscrapR as we use a newer era-adjusted model (more on this below). This example also uses the built-in function `clean_pbp` to create a -"name’ column for the primary player involved (the QB on pass play or +‘name’ column for the primary player involved (the QB on pass play or ball-carrier on run play). ``` r @@ -171,7 +175,7 @@ before touchbacks on kickoffs changed to the 25) than in 2000. ``` r games_2000 <- readRDS(url('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_2000.rds')) -games_2015 <-readRDS(url('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_2015.rds')) +games_2015 <- readRDS(url('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_2015.rds')) pbp <- bind_rows(games_2000, games_2015) @@ -252,7 +256,7 @@ as the NFL changed their system for IDs in the underlying data. ## `nflfastR` models `nflfastR` uses its own models for Expected Points, Win Probability, and -Completion Percentage. To read about the models, [please see +Completion Probability. To read about the models, [please see here](https://github.com/mrcaseb/nflfastR/blob/master/data-raw/MODEL-README.md). For a more detailed description of Expected Points models, we highly recommend this paper [from the nflscrapR team located @@ -306,6 +310,9 @@ Baldwin](https://twitter.com/benbbaldwin). and making available JSON-formatted NFL play-by-play back to 1999](https://github.com/CroppedClamp/nfl_pbps) (`nflfastR` uses this source for 1999-2010) + - To [Lau Sze Yui](https://twitter.com/903124S) for developing a + scraping function to access JSON-formatted NFL play-by-play + beginning in 2011. - To [Lee Sharpe](https://twitter.com/LeeSharpeNFL) for curating a resource for game information - To [Timo Riske](https://twitter.com/PFF_Moo), [Lau Sze @@ -315,12 +322,12 @@ Baldwin](https://twitter.com/benbbaldwin). discussions regarding the development of the new `nflfastR` models - To [Zach Feldman](https://twitter.com/ZachFeldman3) and [Josh Hermsmeyer](https://twitter.com/friscojosh) for many helpful - discussions about CPOE models + discussions about CPOE models as well as [Peter + Owen](https://twitter.com/JSmoovesBrekkie) for [many helpful + suggestions for the CP + model](https://twitter.com/JSmoovesBrekkie/status/1268885950626623490) - To [Florian Schmitt](https://twitter.com/Flosch1006) for the logo design - - To [Peter Owen](https://twitter.com/JSmoovesBrekkie) for [many - helpful suggestions for the CP - model](https://twitter.com/JSmoovesBrekkie/status/1268885950626623490) - The many users who found and reported bugs in `nflfastR` 1.0 - And of course, the original [`nflscrapR`](https://github.com/maksimhorowitz/nflscrapR) team, diff --git a/man/figures/README-ex5-1.png b/man/figures/README-ex5-1.png index 7fff88d9..248ff17b 100644 Binary files a/man/figures/README-ex5-1.png and b/man/figures/README-ex5-1.png differ