Skip to content

Commit

Permalink
Merge pull request #420 from nflverse/fix-driveSequenceNumber
Browse files Browse the repository at this point in the history
Fix drive sequence number
  • Loading branch information
mrcaseb authored Aug 30, 2023
2 parents 723aa58 + f397cb6 commit 597c668
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 52 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: nflfastR
Title: Functions to Efficiently Access NFL Play by Play Data
Version: 4.5.1.9005
Version: 4.5.1.9006
Authors@R:
c(person(given = "Sebastian",
family = "Carl",
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
- The function `calculate_player_stats_def()` no longer errors when small subsets of pbp data are missing stats. (#415)
- The function `calculate_series_conversion_rates()` no longer returns `NA` values if a small subset of pbp data is missing series on offense or defense. (#417)
- `fixed_drive` now correctly increments on plays where posteam lost a fumble but remains posteam because defteam also lost a fumble during the same play. (#419)
- nflfastR now fixes missing drive number counts in raw pbp data in order to provide accurate drive information. (#420)


# nflfastR 4.5.1

Expand Down
51 changes: 0 additions & 51 deletions R/helper_additional_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,57 +55,6 @@ clean_pbp <- function(pbp, ...) {
} else {
user_message("Cleaning up play-by-play...", "todo")

if(any(pbp$season >= 2022)){

# user_message("Loading pbp player ID patch files", "info")

patch_seasons <- unique(pbp$season[pbp$season >= 2022])

patch_ids <- nflreadr::load_from_url(
glue::glue("https://github.com/nflverse/nflverse-data/releases/download/misc/pbp_patch_ids_{patch_seasons}.rds")
) %>% suppressMessages()

patchable_ids <- pbp %>%
dplyr::select(
dplyr::any_of(c(
"game_id", "play_id",
"passer_id", "passer_name" = "passer",
"receiver_id", "receiver_name" = "receiver",
"rusher_id", "rusher_name" = "rusher",
"fantasy_id", "fantasy_name" = "fantasy",
"fantasy_player_name"
)),
dplyr::matches("player_id|player_name")
) %>%
tidyr::pivot_longer(
cols = -c("game_id","play_id"),
names_to = c("stat",".value"),
names_pattern = c("(.+)_(id|name)"),
values_drop_na = TRUE
) %>%
dplyr::filter(is.na(.data$id)) %>%
dplyr::left_join(patch_ids, by = c("game_id","play_id","name")) %>%
dplyr::mutate(
id = dplyr::coalesce(.data$id,.data$gsis_id),
gsis_id = NULL,
club_code = NULL,
name = NULL
) %>%
tidyr::pivot_wider(
names_from = "stat",
values_from = "id",
names_glue = "{stat}_id"
)

if(nrow(patchable_ids) > 0){
pbp <- tibble::tibble(pbp) %>%
dplyr::rows_patch(patchable_ids, by = c("game_id","play_id"))
}

# cli::cli_alert_success("{my_time()} | Patched {nrow(patchable_ids)} missing gsis_id field{?s}")

}

# drop existing values of clean_pbp
pbp <- pbp %>% dplyr::select(-tidyselect::any_of(drop.cols))

Expand Down
26 changes: 26 additions & 0 deletions R/helper_scrape_nfl.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,32 @@ get_pbp_nfl <- function(id, dir = NULL, qs = FALSE, ...) {

plays <- raw_data$data$viewer$gameDetail$plays %>% dplyr::mutate(game_id = as.character(game_id))

# We have this issue https://github.com/nflverse/nflfastR/issues/309 with 2013 postseason games
# where the driveSequenceNumber in the plays df is NA for all plays. That prevents drive information
# from being joined.
# In this case, we compute our own driveSequenceNumber by incrementing a counter depending on the
# value of driveTimeOfPossession.
# driveTimeOfPossession will be a constant value during a drive so this should actually be accurate
# Fallback: only runs when the raw plays carry no drive counter at all.
# The counter is rebuilt from driveTimeOfPossession, which is compared play by
# play against the previous row.
# NOTE(review): two back-to-back drives with an identical
# driveTimeOfPossession would not be told apart by this comparison, and a
# missing value in the middle of a drive starts a new number on the next
# non-missing play -- confirm neither occurs in the affected games.
if (all(is.na(plays$driveSequenceNumber))) {
  plays <- dplyr::mutate(
    plays,
    # Helper flag for cumsum(): 1 on a play that opens a new drive, else 0
    drive_trigger = as.numeric(
      dplyr::coalesce(
        # a known possession time right after a missing one; this covers the
        # first play of the first drive because lag() yields NA on row one
        (is.na(dplyr::lag(driveTimeOfPossession)) & !is.na(driveTimeOfPossession)) |
          # or the possession time differs from the previous play's value
          # (NA whenever the current value is missing, hence the coalesce)
          dplyr::lag(driveTimeOfPossession) != driveTimeOfPossession,
        FALSE
      )
    ),
    # Accumulating the flags yields the drive number; plays without a
    # driveTimeOfPossession keep an NA drive number
    driveSequenceNumber = ifelse(
      !is.na(driveTimeOfPossession),
      cumsum(drive_trigger),
      NA_real_
    ),
    # drop the helper
    drive_trigger = NULL
  )
}

#fill missing posteam info for this
if (
((home_team %in% c("JAC", "JAX") | away_team %in% c("JAC", "JAX")) & season <= 2015) |
Expand Down

0 comments on commit 597c668

Please sign in to comment.