From 6f4e6bcac0852193e1ea0d4348a7ca68fe1d4d95 Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 10:03:35 +0200 Subject: [PATCH 01/13] put our version of str_split_i in utils to avoid stringr 1.5 dep --- R/utils.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/utils.R b/R/utils.R index 8e00f259..2710bbe7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -187,3 +187,7 @@ make_nflverse_data <- function(data, type = c("play by play")){ class(data) <- c("nflverse_data", "tbl_df", "tbl", "data.table", "data.frame") data } + +str_split_and_extract <- function(string, pattern, i){ + split_list <- stringr::str_split(string, pattern, simplify = TRUE, n = i + 1) + split_list[, i] From 2af8faf0067c57209f2f1927e7350cc64965c691 Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 10:04:28 +0200 Subject: [PATCH 02/13] no more mapping -> significantly faster --- R/helper_scrape_nfl.R | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/R/helper_scrape_nfl.R b/R/helper_scrape_nfl.R index 67b434a9..6d2c6d6a 100644 --- a/R/helper_scrape_nfl.R +++ b/R/helper_scrape_nfl.R @@ -223,14 +223,8 @@ get_pbp_nfl <- function(id, dir = NULL, qs = FALSE, ...) { dplyr::mutate( posteam_id = .data$posteam, # have to do all this nonsense to make goal_to_go and yardline_side for compatibility with later functions - yardline_side = furrr::future_map_chr( - stringr::str_split(.data$yardline, " "), - function(x) x[1] - ), - yardline_number = as.numeric(furrr::future_map_chr( - stringr::str_split(.data$yardline, " "), - function(x) x[2] - )), + yardline_side = str_split_and_extract(.data$yardline, " ", 1), + yardline_number = as.numeric(str_split_and_extract(.data$yardline, " ", 2)), quarter_end = dplyr::if_else(stringr::str_detect(.data$play_description, "END QUARTER"), 1, 0), game_year = as.integer(season), season = as.integer(season), From 23883db943b74a323375b146f2c2b9626b53c46e Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 10:05:01 +0200 Subject: [PATCH 03/13] update unexported helpers for debugging --- R/utils.R | 80 +++++++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/R/utils.R b/R/utils.R index 2710bbe7..9eb5524c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -76,46 +76,50 @@ maybe_valid <- function(id) { is_installed <- function(pkg) requireNamespace(pkg, quietly = TRUE) # load raw game files esp. for debugging -load_raw_game <- function(game_id, qs = FALSE){ +load_raw_game <- function(game_id, + dir = getOption("nflfastR.raw_directory", default = NULL), + skip_local = FALSE){ - if (isTRUE(qs) && !is_installed("qs")) { - cli::cli_abort("Package {.val qs} required for argument {.val qs = TRUE}. Please install it.") - } + # game_id <- "2022_19_LAC_JAX" season <- substr(game_id, 1, 4) - path <- "https://raw.githubusercontent.com/guga31bb/nflfastR-raw/master/raw" - - if(isFALSE(qs)) fetched <- curl::curl_fetch_memory(glue::glue("{path}/{season}/{game_id}.rds")) - if(isTRUE(qs)) fetched <- curl::curl_fetch_memory(glue::glue("{path}/{season}/{game_id}.qs")) + local_file <- file.path( + dir, + season, + paste0(game_id, ".rds") + ) - if (fetched$status_code == 404 & maybe_valid(game_id)) { - cli::cli_abort("The requested GameID {game_id} is not loaded yet, please try again later!") - } else if (fetched$status_code == 500) { - cli::cli_abort("The data hosting servers are down, please try again later!") - } else if (fetched$status_code == 404) { - cli::cli_abort("The requested GameID {game_id} is invalid!") + if (length(local_file) == 1 && file.exists(local_file) && isFALSE(skip_local)) { + # cli::cli_progress_step("Load locally from {.path {local_file}}") + raw <- readRDS(local_file) + } else { + to_load <- file.path( + "https://raw.githubusercontent.com/nflverse/nflfastR-raw/master/raw", + season, + paste0(game_id, ".rds"), + fsep = "/" + ) + raw <- nflreadr::rds_from_url(to_load) } - if(isFALSE(qs)) raw_data <- read_raw_rds(fetched$content) - - if(isTRUE(qs)) raw_data <- qs::qdeserialize(fetched$content) - - return(raw_data) + raw } # Identify sessions with sequential future resolving is_sequential <- function() inherits(future::plan(), "sequential") -check_stat_ids <- function(seasons, stat_ids){ +check_stat_ids <- function(seasons, + stat_ids = 1:500, + dir = getOption("nflfastR.raw_directory", default = NULL), + skip_local = FALSE){ if (is_sequential()) { - cli::cli_alert_info(c( - "It is recommended to use parallel processing when using this function.", - "Please consider running {.code future::plan(\"multisession\")}!", - "Will go on sequentially..." - )) + cli::cli_alert_info( + "It is recommended to use parallel processing when using this function. \\ + Please consider running {.code future::plan(\"multisession\")}! \\ + Will go on sequentially...", wrap = TRUE) } games <- nflreadr::load_schedules() %>% @@ -124,10 +128,10 @@ check_stat_ids <- function(seasons, stat_ids){ p <- progressr::progressor(along = games) - furrr::future_map_dfr(games, function(id, stats, p){ - raw_data <- load_raw_game(id) + furrr::future_map_dfr(games, function(id, stats, p, dir, skip_local){ + raw_data <- load_raw_game(id, dir = dir, skip_local = skip_local) plays <- janitor::clean_names(raw_data$data$viewer$gameDetail$plays) %>% - dplyr::select(.data$play_id, .data$play_stats) + dplyr::select("play_id", "play_stats", "desc" = play_description_with_jersey_numbers) p(sprintf("ID=%s", as.character(id))) @@ -142,23 +146,10 @@ check_stat_ids <- function(seasons, stat_ids){ "yards", "team_abbr" = "team_abbreviation", "player_name", - "gsis_player_id" + "gsis_player_id", + "desc" ) - }, stat_ids, p) -} - -# compute most recent season -most_recent_season <- function(roster = FALSE) { - today <- Sys.Date() - current_year <- as.integer(format(today, format = "%Y")) - current_month <- as.integer(format(today, format = "%m")) - - if ((isFALSE(roster) && current_month >= 9) || - (isTRUE(roster) && current_month >= 3)) { - return(current_year) - } - - return(current_year - 1) + }, stats = stat_ids, p = p, dir = dir, skip_local = skip_local) } # take a time string of the format "MM:SS" and convert it to seconds @@ -191,3 +182,4 @@ make_nflverse_data <- function(data, type = c("play by play")){ str_split_and_extract <- function(string, pattern, i){ split_list <- stringr::str_split(string, pattern, simplify = TRUE, n = i + 1) split_list[, i] +} From 820d24247758aa4e89d0c7cd4096d2f6d8639785 Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 10:59:56 +0200 Subject: [PATCH 04/13] better user messaging function for future usage --- R/utils.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/utils.R b/R/utils.R index 9eb5524c..2ee2b9b4 100644 --- a/R/utils.R +++ b/R/utils.R @@ -21,6 +21,13 @@ user_message <- function(x, type) { } } +cli_message <- function(msg, + ..., + .cli_fct = cli::cli_alert_info, + .envir = parent.frame()) { + .cli_fct(c(my_time(), " | ", msg), ..., .envir = .envir) +} + my_time <- function() strftime(Sys.time(), format = "%H:%M:%S") # custom mode function from https://stackoverflow.com/questions/2547402/is-there-a-built-in-function-for-finding-the-mode/8189441 From 240202167eba75c9bb458d2a8b08d7bd8873bdaf Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 11:14:10 +0200 Subject: [PATCH 05/13] adjust some messages --- R/build_nflfastR_pbp.R | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/R/build_nflfastR_pbp.R b/R/build_nflfastR_pbp.R index 0a67836c..b401e835 100644 --- a/R/build_nflfastR_pbp.R +++ b/R/build_nflfastR_pbp.R @@ -61,10 +61,10 @@ build_nflfastR_pbp <- function(game_ids, if (!is.vector(game_ids) && is.data.frame(game_ids)) game_ids <- game_ids$game_id - if (!is.vector(game_ids)) cli::cli_abort("Param {.code game_ids} is not a valid vector!") + if (!is.vector(game_ids)) cli::cli_abort("Param {.arg game_ids} is not a valid vector!") if (isTRUE(decode) && !is_installed("gsisdecoder")) { - cli::cli_abort("Package {.val gsisdecoder} required for decoding. Please install it with {.code install.packages(\"gsisdecoder\")}.") + cli::cli_abort("Package {.pkg gsisdecoder} required for decoding. Please install it with {.code install.packages(\"gsisdecoder\")}.") } if (isTRUE(rules)) rule_header("Build nflfastR Play-by-Play Data") @@ -72,11 +72,7 @@ build_nflfastR_pbp <- function(game_ids, game_count <- ifelse(is.vector(game_ids), length(game_ids), nrow(game_ids)) builder <- TRUE - if (game_count > 1) { - cli::cli_ul("{my_time()} | Start download of {game_count} games...") - } else { - cli::cli_ul("{my_time()} | Start download of {game_count} game...") - } + cli::cli_ul("{my_time()} | Start download of {game_count} game{?s}...") ret <- fast_scraper(game_ids = game_ids, ..., in_builder = builder) %>% clean_pbp(in_builder = builder) %>% From 8b4c7f82c04595f618f91c0d59b2794b58e939fa Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 14:11:53 +0200 Subject: [PATCH 06/13] centralize raw game download and better implement local files --- R/utils.R | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/R/utils.R b/R/utils.R index 2ee2b9b4..4bd4bd87 100644 --- a/R/utils.R +++ b/R/utils.R @@ -63,8 +63,8 @@ qs_from_url <- function(url) qs::qdeserialize(curl::curl_fetch_memory(url)$conte read_raw_rds <- function(raw) { con <- gzcon(rawConnection(raw)) ret <- readRDS(con) - close(con) - return(ret) + on.exit(close(con)) + ret } # helper to make sure the output of the @@ -190,3 +190,61 @@ str_split_and_extract <- function(string, pattern, i){ split_list <- stringr::str_split(string, pattern, simplify = TRUE, n = i + 1) split_list[, i] } + +# slightly modified version of purrr::possibly +please_work <- function(.f, otherwise = data.frame(), quiet = FALSE){ + function(...){ + tryCatch( + expr = .f(...), + error = function(e){ + if(isFALSE(quiet)) cli::cli_alert_warning(conditionMessage(e)) + otherwise + } + ) + } +} + + +fetch_raw <- function(game_id, + dir = getOption("nflfastR.raw_directory", default = NULL)){ + + season <- substr(id, 1, 4) + + if (is.null(dir)) { + + to_load <- file.path( + "https://raw.githubusercontent.com/nflverse/nflfastR-raw/master/raw", + season, + paste0(id, ".rds"), + fsep = "/" + ) + + fetched <- curl::curl_fetch_memory(to_load) + + if (fetched$status_code == 404 & maybe_valid(id)) { + cli::cli_abort("The requested GameID {.val {id}} is not loaded yet, please try again later!") + } else if (fetched$status_code == 500) { + cli::cli_abort("The data hosting servers are down, please try again later!") + } else if (fetched$status_code == 404) { + cli::cli_abort("The requested GameID {.val {id}} is invalid!") + } + + out <- read_raw_rds(fetched$content) + + } else { + # build path to locally stored game files + local_file <- file.path( + dir, + season, + paste0(id, ".rds") + ) + + if (!file.exists(local_file)) { + cli::cli_abort("File {.path {local_file}} doesn't exist!") + } + + out <- readRDS(local_file) + } + + out +} From 2a9f23228f30d973f8543b93aa8f53496ada9a41 Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 14:12:43 +0200 Subject: [PATCH 07/13] remove annoying tryCatch stuff and use our custom implementation of purrr::possibly --- R/helper_scrape_gc.R | 508 +++++++++++++++------------------- R/helper_scrape_nfl.R | 630 +++++++++++++++++++----------------------- R/top-level_scraper.R | 4 +- 3 files changed, 508 insertions(+), 634 deletions(-) diff --git a/R/helper_scrape_gc.R b/R/helper_scrape_gc.R index 1cfbd9d5..9eb14736 100644 --- a/R/helper_scrape_gc.R +++ b/R/helper_scrape_gc.R @@ -8,302 +8,240 @@ # # @param gameId Specifies the game -get_pbp_gc <- function(gameId, dir = NULL, qs = FALSE, ...) { - - warn <- 0 - - if (isTRUE(qs) && !is_installed("qs")) { - cli::cli_abort("Package {.val qs} required for argument {.val qs = TRUE}. Please install it.") +get_pbp_gc <- function(gameId, + dir = getOption("nflfastR.raw_directory", default = NULL), + ...) { + + # testing only + # gameId = '2013120812' + # gameId = '2019_01_GB_CHI' + # gameId = '2009_18_NYJ_CIN' + # gameId = '2007_01_ARI_SF' + # gameId = '1999_01_BAL_STL' + + if (gameId %in% c("2000_03_SD_KC", "2000_06_BUF_MIA", "1999_01_BAL_STL")) { + cli::cli_abort("You asked for GameID {.val {gameId}} is broken. Skipping.") } - combined <- data.frame() - tryCatch( - expr = { - - # testing only - # gameId = '2013120812' - # gameId = '2019_01_GB_CHI' - # gameId = '2009_18_NYJ_CIN' - # gameId = '2007_01_ARI_SF' - # gameId = '1999_01_BAL_STL' - - if (gameId %in% c("2000_03_SD_KC", "2000_06_BUF_MIA", "1999_01_BAL_STL")) { - warning(warn <- 1) - } - - season <- as.integer(substr(gameId, 1, 4)) - - if (is.null(dir)) { - path <- "https://raw.githubusercontent.com/nflverse/nflfastR-raw/master/raw" - - if(isFALSE(qs)) fetched <- curl::curl_fetch_memory(glue::glue("{path}/{season}/{gameId}.rds")) - - if(isTRUE(qs)) fetched <- curl::curl_fetch_memory(glue::glue("{path}/{season}/{gameId}.qs")) + season <- as.integer(substr(gameId, 1, 4)) - if (fetched$status_code == 404) { - warning(warn <- 3) - } else if (fetched$status_code == 500) { - warning(warn <- 4) - } + raw <- fetch_raw(game_id = gameId, dir = dir) - if(isFALSE(qs)) raw <- read_raw_rds(fetched$content) + game_json <- raw[[1]] - if(isTRUE(qs)) raw <- qs::qdeserialize(fetched$content) - - } else { - # build path to locally stored game files. This functionality is primarily - # for the data repo maintainer - if(isFALSE(qs)) p <- glue::glue("{dir}/{season}/{gameId}.rds") - if(isTRUE(qs)) p <- glue::glue("{dir}/{season}/{gameId}.qs") - - if (file.exists(p) == FALSE) { - warning(warn <- 5) - } - - if(isFALSE(qs)) raw <- readRDS(p) - if(isTRUE(qs)) raw <- qs::qread(p) - } - - game_json <- raw[[1]] - - date_parse <- names(raw)[1] %>% stringr::str_extract(pattern = "[0-9]{8}") - date_year <- stringr::str_sub(date_parse, 1, 4) - date_month <- stringr::str_sub(date_parse, 5, 6) - date_day <- stringr::str_sub( - date_parse, nchar(date_parse) - 1, - nchar(date_parse) - ) + date_parse <- names(raw)[1] %>% stringr::str_extract(pattern = "[0-9]{8}") + date_year <- stringr::str_sub(date_parse, 1, 4) + date_month <- stringr::str_sub(date_parse, 5, 6) + date_day <- stringr::str_sub( + date_parse, nchar(date_parse) - 1, + nchar(date_parse) + ) - week <- as.integer(substr(gameId, 6, 7)) - if (week <= 17) { - season_type <- "REG" - } else { - season_type <- "POST" - } + week <- as.integer(substr(gameId, 6, 7)) + if (week <= 17) { + season_type <- "REG" + } else { + season_type <- "POST" + } - if (date_year < 1999) { - warning(warn <- 2) - } + if (date_year < 1999) { + cli::cli_abort("You asked a game from {date_year}, but data only goes back to 1999.") + } - # excluding last element since it's "crntdrv" and not an actual - drives <- game_json$drives[-length(game_json$drives)] + # excluding last element since it's "crntdrv" and not an actual + drives <- game_json$drives[-length(game_json$drives)] + + # list of plays + # each play has "players" column which is a list of player stats from the play + plays <- suppressWarnings(furrr::future_map_dfr(seq_along(drives), function(x) { + cbind( + "drive" = x, + data.frame(do.call( + rbind, + drives[[x]]$plays + ))[, c(1:11)] + ) %>% dplyr::mutate(play_id = names(drives[[x]]$plays), play_id = as.integer(.data$play_id)) + } + )) - # list of plays - # each play has "players" column which is a list of player stats from the play - plays <- suppressWarnings(furrr::future_map_dfr(seq_along(drives), function(x) { - cbind( - "drive" = x, - data.frame(do.call( - rbind, - drives[[x]]$plays - ))[, c(1:11)] - ) %>% dplyr::mutate(play_id = names(drives[[x]]$plays), play_id = as.integer(.data$play_id)) - } - )) + plays$quarter_end <- dplyr::if_else( + stringr::str_detect(plays$desc, "(END QUARTER)|(END GAME)|(End of quarter)"), 1, 0 + ) + plays$home_team <- game_json$home$abbr + plays$away_team <- game_json$away$abbr - plays$quarter_end <- dplyr::if_else( - stringr::str_detect(plays$desc, "(END QUARTER)|(END GAME)|(End of quarter)"), 1, 0 + # get df with 1 line per statId + stats <- furrr::future_map_dfr(seq_along(plays$play_id), function(x) { + dplyr::bind_rows(plays[x, ]$players[[1]], .id = "player_id") %>% + dplyr::mutate(play_id = plays[x, ]$play_id) + } + ) %>% + dplyr::mutate( + sequence = as.numeric(.data$sequence), + statId = as.numeric(.data$statId), + play_id = as.character(.data$play_id), + yards = as.integer(.data$yards) + ) %>% + dplyr::arrange(.data$play_id, .data$sequence) %>% + dplyr::rename( + playId = "play_id", + teamAbbr = "clubcode", + player.esbId = "player_id", + player.displayName = "playerName", + playStatSeq = "sequence" + ) + + pbp_stats <- lapply(unique(stats$playId), sum_play_stats, stats) + pbp_stats <- data.table::rbindlist(pbp_stats) %>% tibble::as_tibble() + + # drive info + d <- tibble::tibble(drives) %>% + tidyr::unnest_wider(drives) %>% + # dplyr::select(-plays) %>% + tidyr::unnest_wider("start", names_sep = "_") %>% + tidyr::unnest_wider("end", names_sep = "_") %>% + dplyr::mutate(drive = 1:dplyr::n()) %>% + dplyr::rename( + drive_play_count = "numplays", + drive_time_of_possession = "postime", + drive_first_downs = "fds", + drive_inside20 = "redzone", + drive_quarter_start = "start_qtr", + drive_quarter_end = "end_qtr", + drive_end_transition = "result", + drive_game_clock_start = "start_time", + drive_game_clock_end = "end_time", + drive_start_yard_line = "start_yrdln", + drive_end_yard_line = "end_yrdln" + ) %>% + dplyr::mutate( + drive_inside20 = dplyr::if_else(.data$drive_inside20, 1, 0), + drive_how_ended_description = .data$drive_end_transition, + drive_ended_with_score = dplyr::if_else(.data$drive_how_ended_description == "Touchdown" | .data$drive_how_ended_description == "Field Goal", 1, 0), + drive_start_transition = dplyr::lag(.data$drive_how_ended_description, 1), + drive_how_started_description = .data$drive_start_transition + ) %>% + dplyr::select( + "drive", "drive_play_count", "drive_time_of_possession", + "drive_first_downs", "drive_inside20", "drive_ended_with_score", + "drive_quarter_start", "drive_quarter_end", + "drive_end_transition", "drive_how_ended_description", + "drive_game_clock_start", "drive_game_clock_end", + "drive_start_yard_line", "drive_end_yard_line", + "drive_start_transition", "drive_how_started_description" + ) + + combined <- plays %>% + dplyr::left_join(pbp_stats, by = "play_id") %>% + dplyr::mutate_if(is.logical, as.numeric) %>% + dplyr::mutate_if(is.integer, as.numeric) %>% + dplyr::select(-"players", -"note") %>% + #Weirdly formatted and missing anyway + dplyr::mutate(note = NA_character_) %>% + dplyr::rename(yardline = "yrdln", quarter = "qtr", play_description = "desc", yards_to_go = "ydstogo") %>% + tidyr::unnest(cols = c("sp", "quarter", "down", "time", "yardline", "yards_to_go", "ydsnet", "posteam", "play_description", "note")) %>% + dplyr::left_join(d, by = "drive") %>% + dplyr::mutate( + posteam_id = .data$posteam, + game_id = gameId, + game_year = as.integer(date_year), + game_month = as.integer(date_month), + game_date = as.Date(paste(date_month, + date_day, + date_year, + sep = "/" + ), + format = "%m/%d/%Y" + ), + season = season, + + # fix up yardline before doing stuff. from nflscrapr + yardline = dplyr::if_else(.data$yardline == "50", "MID 50", .data$yardline), + yardline = dplyr::if_else( + nchar(.data$yardline) == 0 | is.null(.data$yardline) | + .data$yardline == "NULL", + dplyr::lag(.data$yardline), .data$yardline + ), + + # have to do all this nonsense to make goal_to_go and yardline_side for compatibility with later functions + yardline_side = furrr::future_map_chr( + stringr::str_split(.data$yardline, " "), + function(x) x[1] + ), + yardline_number = as.numeric(furrr::future_map_chr( + stringr::str_split(.data$yardline, " "), + function(x) x[2] + )), + goal_to_go = dplyr::if_else( + .data$yardline_side != .data$posteam & + ((.data$yards_to_go == .data$yardline_number) | + (.data$yards_to_go <= 1 & .data$yardline_number == 1)), + 1, 0 + ), + down = as.double(.data$down), + quarter = as.double(.data$quarter), + week = week, + season_type = season_type, + # missing from older gc data + drive_real_start_time = NA_character_, + start_time = NA_character_, + stadium = NA_character_, + weather = NA_character_, + nfl_api_id = NA_character_, + play_clock = NA_character_, + play_deleted = NA_real_, + play_type_nfl = NA_character_, + drive_yards_penalized = NA_real_, + end_clock_time = NA_character_, + end_yard_line = NA_character_, + order_sequence = NA_real_, + time_of_day = NA_character_, + special_teams_play = NA_real_, + st_play_type = NA_character_, + # there seems to be no easy way to find the safety scoring team. Will hard code the plays + # as there are only 6 of them in the game center data + safety_team = dplyr::case_when( + .data$safety == 1 & .data$game_id == "1999_04_PHI_NYG" & .data$play_id == 827 ~ .data$posteam, + .data$safety == 1 & .data$game_id == "2000_03_ATL_CAR" & .data$play_id == 3423 ~ .data$posteam, + .data$safety == 1 & .data$game_id == "2000_16_OAK_SEA" & .data$play_id == 3590 ~ .data$posteam, + .data$safety == 1 & .data$game_id == "2001_14_DAL_SEA" & .data$play_id == 2552 ~ .data$posteam, + .data$safety == 1 & .data$game_id == "2003_03_NO_TEN" & .data$play_id == 416 ~ .data$posteam, + .data$safety == 1 & .data$game_id == "2009_08_STL_DET" & .data$play_id == 987 ~ .data$posteam, + .data$safety == 1 & .data$posteam == .data$home_team ~ .data$away_team, + .data$safety == 1 & .data$posteam == .data$away_team ~ .data$home_team, + TRUE ~ NA_character_ ) - plays$home_team <- game_json$home$abbr - plays$away_team <- game_json$away$abbr - - # get df with 1 line per statId - stats <- furrr::future_map_dfr(seq_along(plays$play_id), function(x) { - dplyr::bind_rows(plays[x, ]$players[[1]], .id = "player_id") %>% - dplyr::mutate(play_id = plays[x, ]$play_id) - } - ) %>% - dplyr::mutate( - sequence = as.numeric(.data$sequence), - statId = as.numeric(.data$statId), - play_id = as.character(.data$play_id), - yards = as.integer(.data$yards) - ) %>% - dplyr::arrange(.data$play_id, .data$sequence) %>% - dplyr::rename( - playId = "play_id", - teamAbbr = "clubcode", - player.esbId = "player_id", - player.displayName = "playerName", - playStatSeq = "sequence" - ) - - pbp_stats <- lapply(unique(stats$playId), sum_play_stats, stats) - pbp_stats <- data.table::rbindlist(pbp_stats) %>% tibble::as_tibble() - - # drive info - d <- tibble::tibble(drives) %>% - tidyr::unnest_wider(drives) %>% - # dplyr::select(-plays) %>% - tidyr::unnest_wider("start", names_sep = "_") %>% - tidyr::unnest_wider("end", names_sep = "_") %>% - dplyr::mutate(drive = 1:dplyr::n()) %>% - dplyr::rename( - drive_play_count = "numplays", - drive_time_of_possession = "postime", - drive_first_downs = "fds", - drive_inside20 = "redzone", - drive_quarter_start = "start_qtr", - drive_quarter_end = "end_qtr", - drive_end_transition = "result", - drive_game_clock_start = "start_time", - drive_game_clock_end = "end_time", - drive_start_yard_line = "start_yrdln", - drive_end_yard_line = "end_yrdln" - ) %>% - dplyr::mutate( - drive_inside20 = dplyr::if_else(.data$drive_inside20, 1, 0), - drive_how_ended_description = .data$drive_end_transition, - drive_ended_with_score = dplyr::if_else(.data$drive_how_ended_description == "Touchdown" | .data$drive_how_ended_description == "Field Goal", 1, 0), - drive_start_transition = dplyr::lag(.data$drive_how_ended_description, 1), - drive_how_started_description = .data$drive_start_transition - ) %>% - dplyr::select( - "drive", "drive_play_count", "drive_time_of_possession", - "drive_first_downs", "drive_inside20", "drive_ended_with_score", - "drive_quarter_start", "drive_quarter_end", - "drive_end_transition", "drive_how_ended_description", - "drive_game_clock_start", "drive_game_clock_end", - "drive_start_yard_line", "drive_end_yard_line", - "drive_start_transition", "drive_how_started_description" + ) %>% + dplyr::group_by(.data$drive) %>% + dplyr::mutate( + drive_play_id_started = min(.data$play_id, na.rm = TRUE), + drive_play_seq_started = min(.data$play_id, na.rm = TRUE), + drive_play_id_ended = max(.data$play_id, na.rm = TRUE), + drive_play_seq_ended = max(.data$play_id, na.rm = TRUE) + ) %>% + dplyr::ungroup() + + # missing space in side of field breaks parser + if (gameId %in% c('2000_01_CAR_WAS', '2000_02_NE_NYJ', '2000_03_ATL_CAR')) { + combined <- combined %>% + dplyr::mutate( + yardline_number = case_when( + .data$yardline %in% c("WAS20", "NYJ20", "ATL20") ~ 20, + TRUE ~ .data$yardline_number + ), + yardline = case_when( + .data$yardline == "WAS20" ~ "WAS 20", + .data$yardline == "NYJ20" ~ "NYJ 20", + .data$yardline == "ATL20" ~ "ATL 20", + TRUE ~ .data$yardline + ), + yardline_side = case_when( + .data$yardline_side == "WAS20" ~ "WAS", + .data$yardline_side == "NYJ20" ~ "NYJ", + .data$yardline_side == "ATL20" ~ "ATL", + TRUE ~ .data$yardline_side ) - - combined <- plays %>% - dplyr::left_join(pbp_stats, by = "play_id") %>% - dplyr::mutate_if(is.logical, as.numeric) %>% - dplyr::mutate_if(is.integer, as.numeric) %>% - dplyr::select(-"players", -"note") %>% - #Weirdly formatted and missing anyway - dplyr::mutate(note = NA_character_) %>% - dplyr::rename(yardline = "yrdln", quarter = "qtr", play_description = "desc", yards_to_go = "ydstogo") %>% - tidyr::unnest(cols = c("sp", "quarter", "down", "time", "yardline", "yards_to_go", "ydsnet", "posteam", "play_description", "note")) %>% - dplyr::left_join(d, by = "drive") %>% - dplyr::mutate( - posteam_id = .data$posteam, - game_id = gameId, - game_year = as.integer(date_year), - game_month = as.integer(date_month), - game_date = as.Date(paste(date_month, - date_day, - date_year, - sep = "/" - ), - format = "%m/%d/%Y" - ), - season = season, - - # fix up yardline before doing stuff. from nflscrapr - yardline = dplyr::if_else(.data$yardline == "50", "MID 50", .data$yardline), - yardline = dplyr::if_else( - nchar(.data$yardline) == 0 | is.null(.data$yardline) | - .data$yardline == "NULL", - dplyr::lag(.data$yardline), .data$yardline - ), - - # have to do all this nonsense to make goal_to_go and yardline_side for compatibility with later functions - yardline_side = furrr::future_map_chr( - stringr::str_split(.data$yardline, " "), - function(x) x[1] - ), - yardline_number = as.numeric(furrr::future_map_chr( - stringr::str_split(.data$yardline, " "), - function(x) x[2] - )), - goal_to_go = dplyr::if_else( - .data$yardline_side != .data$posteam & - ((.data$yards_to_go == .data$yardline_number) | - (.data$yards_to_go <= 1 & .data$yardline_number == 1)), - 1, 0 - ), - down = as.double(.data$down), - quarter = as.double(.data$quarter), - week = week, - season_type = season_type, - # missing from older gc data - drive_real_start_time = NA_character_, - start_time = NA_character_, - stadium = NA_character_, - weather = NA_character_, - nfl_api_id = NA_character_, - play_clock = NA_character_, - play_deleted = NA_real_, - play_type_nfl = NA_character_, - drive_yards_penalized = NA_real_, - end_clock_time = NA_character_, - end_yard_line = NA_character_, - order_sequence = NA_real_, - time_of_day = NA_character_, - special_teams_play = NA_real_, - st_play_type = NA_character_, - # there seems to be no easy way to find the safety scoring team. Will hard code the plays - # as there are only 6 of them in the game center data - safety_team = dplyr::case_when( - .data$safety == 1 & .data$game_id == "1999_04_PHI_NYG" & .data$play_id == 827 ~ .data$posteam, - .data$safety == 1 & .data$game_id == "2000_03_ATL_CAR" & .data$play_id == 3423 ~ .data$posteam, - .data$safety == 1 & .data$game_id == "2000_16_OAK_SEA" & .data$play_id == 3590 ~ .data$posteam, - .data$safety == 1 & .data$game_id == "2001_14_DAL_SEA" & .data$play_id == 2552 ~ .data$posteam, - .data$safety == 1 & .data$game_id == "2003_03_NO_TEN" & .data$play_id == 416 ~ .data$posteam, - .data$safety == 1 & .data$game_id == "2009_08_STL_DET" & .data$play_id == 987 ~ .data$posteam, - .data$safety == 1 & .data$posteam == .data$home_team ~ .data$away_team, - .data$safety == 1 & .data$posteam == .data$away_team ~ .data$home_team, - TRUE ~ NA_character_ - ) - ) %>% - dplyr::group_by(.data$drive) %>% - dplyr::mutate( - drive_play_id_started = min(.data$play_id, na.rm = TRUE), - drive_play_seq_started = min(.data$play_id, na.rm = TRUE), - drive_play_id_ended = max(.data$play_id, na.rm = TRUE), - drive_play_seq_ended = max(.data$play_id, na.rm = TRUE) - ) %>% - dplyr::ungroup() - - # missing space in side of field breaks parser - if (gameId %in% c('2000_01_CAR_WAS', '2000_02_NE_NYJ', '2000_03_ATL_CAR')) { - combined <- combined %>% - dplyr::mutate( - yardline_number = case_when( - .data$yardline %in% c("WAS20", "NYJ20", "ATL20") ~ 20, - TRUE ~ .data$yardline_number - ), - yardline = case_when( - .data$yardline == "WAS20" ~ "WAS 20", - .data$yardline == "NYJ20" ~ "NYJ 20", - .data$yardline == "ATL20" ~ "ATL 20", - TRUE ~ .data$yardline - ), - yardline_side = case_when( - .data$yardline_side == "WAS20" ~ "WAS", - .data$yardline_side == "NYJ20" ~ "NYJ", - .data$yardline_side == "ATL20" ~ "ATL", - TRUE ~ .data$yardline_side - ) - ) - } - }, - error = function(e) { - message("The following error has occured:") - message(e) - }, - warning = function(w) { - if (warn == 1) { - message(glue::glue("You asked for {gameId}, which is broken. Skipping.")) - } else if (warn == 2) { - message(glue::glue("You asked a game from {date_year}, but data only goes back to 1999.")) - } else if (warn == 3) { - message(glue::glue("Warning: The requested GameID {gameId} is invalid!")) - } else if (warn == 4) { - message(glue::glue("Warning: The data hosting servers are down, please try again later!")) - } else if (warn == 5) { - message(glue::glue("Warning: Either the requested GameID {gameId} is missing or you've passed an invalid path!")) - } else { - message("The following warning has occured:") - message(w) - } - }, - finally = { - } - ) - + ) + } return(combined) } diff --git a/R/helper_scrape_nfl.R b/R/helper_scrape_nfl.R index 6d2c6d6a..320a0c7d 100644 --- a/R/helper_scrape_nfl.R +++ b/R/helper_scrape_nfl.R @@ -7,375 +7,311 @@ # Build a tidy version of scraped NFL data # # @param id Specifies the game -get_pbp_nfl <- function(id, dir = NULL, qs = FALSE, ...) { +get_pbp_nfl <- function(id, + dir = getOption("nflfastR.raw_directory", default = NULL), + ...) { - warn <- 0 + #testing + #id = '2019_01_GB_CHI' + # id = '2015_01_CAR_JAX' + #id = '2011_01_NO_GB' - if (isTRUE(qs) && !is_installed("qs")) { - cli::cli_abort("Package {.val qs} required for argument {.val qs = TRUE}. Please install it.") - } - - combined <- data.frame() - tryCatch( - expr = { - - #testing - #id = '2019_01_GB_CHI' - # id = '2015_01_CAR_JAX' - #id = '2011_01_NO_GB' + season <- substr(id, 1, 4) + week <- as.integer(substr(id, 6, 7)) - season <- substr(id, 1, 4) - week <- as.integer(substr(id, 6, 7)) + raw_data <- fetch_raw(game_id = id, dir = dir) - if (is.null(dir)) { - path <- "https://raw.githubusercontent.com/guga31bb/nflfastR-raw/master/raw" - - if(isFALSE(qs)) fetched <- curl::curl_fetch_memory(glue::glue("{path}/{season}/{id}.rds")) + season_type <- dplyr::case_when( + season <= 2020 & week <= 17 ~ "REG", + season >= 2021 & week <= 18 ~ "REG", + TRUE ~ "POST" + ) - if(isTRUE(qs)) fetched <- curl::curl_fetch_memory(glue::glue("{path}/{season}/{id}.qs")) + # game_info <- raw_data$data$viewer$gameDetail - if (fetched$status_code == 404 & maybe_valid(id)) { - warning(warn <- 3) - } else if (fetched$status_code == 500) { - warning(warn <- 2) - } else if (fetched$status_code == 404) { - warning(warn <- 1) - } + game_id <- raw_data$data$viewer$gameDetail$id + home_team <- raw_data$data$viewer$gameDetail$homeTeam$abbreviation + away_team <- raw_data$data$viewer$gameDetail$visitorTeam$abbreviation - if(isFALSE(qs)) raw_data <- read_raw_rds(fetched$content) + # if home team and away team are the same, the game is messed up and needs fixing + if (home_team == away_team) { - if(isTRUE(qs)) raw_data <- qs::qdeserialize(fetched$content) + # get correct home and away from the game ID + id_parts <- stringr::str_split(id, "_") + away_team <- id_parts[[1]][3] + home_team <- id_parts[[1]][4] + bad_game <- 1 - } else { - # build path to locally stored game files. This functionality is primarily - # for the data repo maintainer - if(isFALSE(qs)) p <- glue::glue("{dir}/{season}/{id}.rds") - if(isTRUE(qs)) p <- glue::glue("{dir}/{season}/{id}.qs") + } else { + bad_game <- 0 + } - if (!file.exists(p)) { - warning(warn <- 4) - } + weather <- ifelse( + is.null(raw_data$data$viewer$gameDetail$weather$shortDescription), + NA_character_, + raw_data$data$viewer$gameDetail$weather$shortDescription + ) + stadium <- ifelse( + is.null(raw_data$data$viewer$gameDetail$stadium), + NA_character_, + raw_data$data$viewer$gameDetail$stadium + ) + start_time <- raw_data$data$viewer$gameDetail$startTime + + game_info <- tibble::tibble( + game_id = as.character(game_id), + home_team, + away_team, + weather, + stadium, + start_time + ) - if(isFALSE(qs)) raw_data <- readRDS(p) - if(isTRUE(qs)) raw_data <- qs::qread(p) - } + plays <- raw_data$data$viewer$gameDetail$plays %>% dplyr::mutate(game_id = as.character(game_id)) + + # We have this issue https://github.com/nflverse/nflfastR/issues/309 with 2013 postseason games + # where the driveSequenceNumber in the plays df is NA for all plays. That prevents drive information + # from being joined. + # In this case, we compute our own driveSequenceNumber by incrementing a counter depending on the + # value of driveTimeOfPossession. + # driveTimeOfPossession will be a constant value during a drive so this should actually be accurate + if (all(is.na(plays$driveSequenceNumber))){ + plays <- plays %>% + dplyr::mutate( + # First, create a trigger for cumsum + drive_trigger = dplyr::case_when( + # this is the first play of the first drive + is.na(dplyr::lag(driveTimeOfPossession)) & !is.na(driveTimeOfPossession) ~ 1, + # if driveTimeOfPossession changes, there is a new drive + dplyr::lag(driveTimeOfPossession) != driveTimeOfPossession ~ 1, + TRUE ~ 0 + ), + # Now create the drive number by accumulationg triggers + driveSequenceNumber = cumsum(drive_trigger), + # driveSequenceNumber should be NA on plays where driveTimeOfPossession is NA + driveSequenceNumber = ifelse(is.na(driveTimeOfPossession), NA_real_, driveSequenceNumber), + # drop the helper + drive_trigger = NULL + ) + } - season_type <- dplyr::case_when( - season <= 2020 & week <= 17 ~ "REG", - season >= 2021 & week <= 18 ~ "REG", - TRUE ~ "POST" + #fill missing posteam info for this + if ( + ((home_team %in% c("JAC", "JAX") | away_team %in% c("JAC", "JAX")) & season <= 2015) | + bad_game == 1 + ) { + plays <- plays %>% + dplyr::mutate( + possessionTeam.abbreviation = stringr::str_extract(plays$prePlayByPlay, '[A-Z]{2,3}(?=\\s)'), + possessionTeam.abbreviation = dplyr::if_else( + .data$possessionTeam.abbreviation %in% c('OUT', 'END', 'NA'), + NA_character_, .data$possessionTeam.abbreviation + ), + possessionTeam.abbreviation = dplyr::if_else( + .data$possessionTeam.abbreviation == 'JAX', 'JAC', .data$possessionTeam.abbreviation + ) ) - # game_info <- raw_data$data$viewer$gameDetail + # for these old games, we're making everything JAC instead of JAX + home_team <- dplyr::if_else(home_team == "JAX", "JAC", home_team) + away_team <- dplyr::if_else(away_team == "JAX", "JAC", away_team) + } - game_id <- raw_data$data$viewer$gameDetail$id - home_team <- raw_data$data$viewer$gameDetail$homeTeam$abbreviation - away_team <- raw_data$data$viewer$gameDetail$visitorTeam$abbreviation + drives <- raw_data$data$viewer$gameDetail$drives %>% + dplyr::mutate(ydsnet = .data$yards + .data$yardsPenalized) %>% + # these are already in plays + dplyr::select( + -"possessionTeam.abbreviation", + -"possessionTeam.nickName", + -"possessionTeam.franchise.currentLogo.url" + ) %>% + janitor::clean_names() + colnames(drives) <- paste0("drive_", colnames(drives)) + + stats <- tidyr::unnest(plays %>% dplyr::select(-"yards"), cols = c("playStats")) %>% + dplyr::mutate( + yards = as.integer(.data$yards), + statId = as.numeric(.data$statId), + team.abbreviation = as.character(.data$team.abbreviation) + ) %>% + dplyr::rename( + player.esbId = "gsisPlayer.id", + player.displayName = "playerName", + teamAbbr = "team.abbreviation" + ) %>% + dplyr::select( + "playId", + "statId", + "yards", + "teamAbbr", + "player.displayName", + "player.esbId" + ) - # if home team and away team are the same, the game is messed up and needs fixing - if (home_team == away_team) { + # there was a penalty on this play so these stat IDs shouldn't exist + if (id == "2020_10_DEN_LV") { + stats <- stats %>% + dplyr::filter(!(.data$playId == 979 & .data$statId %in% c(8, 10, 79))) + } - # get correct home and away from the game ID - id_parts <- stringr::str_split(id, "_") - away_team <- id_parts[[1]][3] - home_team <- id_parts[[1]][4] - bad_game <- 1 + pbp_stats <- lapply(unique(stats$playId), sum_play_stats, stats) + pbp_stats <- data.table::rbindlist(pbp_stats) %>% tibble::as_tibble() + + combined <- game_info %>% + dplyr::bind_cols(plays %>% dplyr::select(-"playStats", -"game_id")) %>% + dplyr::left_join(drives, by = c("driveSequenceNumber" = "drive_order_sequence")) %>% + dplyr::left_join(pbp_stats, by = c("playId" = "play_id")) %>% + dplyr::mutate_if(is.logical, as.numeric) %>% + dplyr::mutate_if(is.integer, as.numeric) %>% + dplyr::mutate_if(is.factor, as.character) %>% + janitor::clean_names() %>% + dplyr::select(-"drive_play_count", -"drive_time_of_possession", -"next_play_type") %>% + dplyr::rename( + time = "clock_time", + play_type_nfl = "play_type", + posteam = "possession_team_abbreviation", + yardline = "yard_line", + sp = "scoring_play", + drive = "drive_sequence_number", + nfl_api_id = "game_id", + drive_play_count = "drive_play_count_2", + drive_time_of_possession = "drive_time_of_possession_2", + ydsnet = "drive_ydsnet" + ) %>% + dplyr::mutate( + posteam_id = .data$posteam, + # have to do all this nonsense to make goal_to_go and yardline_side for compatibility with later functions + yardline_side = str_split_and_extract(.data$yardline, " ", 1), + yardline_number = as.numeric(str_split_and_extract(.data$yardline, " ", 2)), + quarter_end = dplyr::if_else(stringr::str_detect(.data$play_description, "END QUARTER"), 1, 0), + game_year = as.integer(season), + season = as.integer(season), + # this is only needed for epa and dropped later + game_month = as.integer(11), + game_id = id, + play_description = .data$play_description_with_jersey_numbers, + week = week, + season_type = season_type, + play_clock = as.character(.data$play_clock), + st_play_type = as.character(.data$st_play_type), + #if JAC has the ball and scored, make them the scoring team + td_team = dplyr::if_else( + .data$season <= 2015 & .data$posteam %in% c("JAC", "JAX") & + .data$drive_how_ended_description == 'Touchdown' & !is.na(.data$td_team), + 'JAC', .data$td_team + ), + #if JAC involved in a game and defensive team score, fill in the right team + td_team = dplyr::if_else( + #game involving the jags + .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")) & + #defensive TD + .data$drive_how_ended_description != 'Touchdown' & !is.na(.data$td_team), + #if home team has ball, then away team scored, otherwise home team scored + dplyr::if_else(.data$posteam == .data$home_team, .data$away_team, .data$home_team), + .data$td_team + ), + # fix muffed punt td in JAC game + td_team = dplyr::if_else(id == "2011_14_TB_JAX" & .data$play_id == 1343, 'JAC', .data$td_team), - } else { - bad_game <- 0 - } + # kickoff return TDs in old JAC games + td_team = dplyr::if_else(id == "2006_14_IND_JAX" & .data$play_id == 2078, 'JAC', .data$td_team), + td_team = dplyr::if_else(id == "2007_17_JAX_HOU" & .data$play_id %in% c(1907, 2042), 'HOU', .data$td_team), + td_team = dplyr::if_else(id == "2008_09_JAX_CIN" & .data$play_id == 3145, 'JAC', .data$td_team), + td_team = dplyr::if_else(id == "2009_15_IND_JAX" & .data$play_id == 1088, 'IND', .data$td_team), + td_team = dplyr::if_else(id == "2010_15_JAX_IND" & .data$play_id == 3848, 'IND', .data$td_team), - weather <- ifelse( - is.null(raw_data$data$viewer$gameDetail$weather$shortDescription), - NA_character_, - raw_data$data$viewer$gameDetail$weather$shortDescription - ) - stadium <- ifelse( - is.null(raw_data$data$viewer$gameDetail$stadium), - NA_character_, - raw_data$data$viewer$gameDetail$stadium - ) - start_time <- raw_data$data$viewer$gameDetail$startTime - - game_info <- tibble::tibble( - game_id = as.character(game_id), - home_team, - away_team, - weather, - stadium, - start_time + # fill in return team for the JAX games + return_team = dplyr::if_else( + !is.na(.data$return_team) & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), + dplyr::if_else( + # if the home team has the ball, return team is away team (this is before we flip posteam for kickoffs) + .data$posteam == .data$home_team, .data$away_team, .data$home_team + ), + .data$return_team + ), + fumble_recovery_1_team = dplyr::if_else( + !is.na(.data$fumble_recovery_1_team) & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), + # assign possession based on fumble_lost + dplyr::case_when( + .data$fumble_lost == 1 & .data$posteam == .data$home_team ~ .data$away_team, + .data$fumble_lost == 1 & .data$posteam == .data$away_team ~ .data$home_team, + .data$fumble_lost == 0 & .data$posteam == .data$home_team ~ .data$home_team, + .data$fumble_lost == 0 & .data$posteam == .data$away_team ~ .data$away_team + ), + .data$fumble_recovery_1_team + ), + timeout_team = dplyr::if_else( + # if there's a timeout in the affected seasons + !is.na(.data$timeout_team) & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), + # extract from play description + # make it JAC instead of JAX to be consistent with everything else + dplyr::if_else( + stringr::str_extract(.data$play_description, "(?<=Timeout #[1-3] by )[:upper:]+") == "JAX", "JAC", stringr::str_extract(.data$play_description, "(?<=Timeout #[1-3] by )[:upper:]+") + ), + .data$timeout_team + ), + # Also fix penalty team for JAC games + penalty_team = dplyr::if_else( + # if there's a penalty_team in the affected seasons + !is.na(.data$penalty_team) & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), + # extract from play description + # make it JAC instead of JAX to be consistent with everything else + dplyr::if_else( + stringr::str_extract(.data$play_description, "(?<=PENALTY on )[:upper:]{2,3}") == "JAX", + "JAC", + stringr::str_extract(.data$play_description, "(?<=PENALTY on )[:upper:]{2,3}") + ), + .data$penalty_team + ), + yardline_side = dplyr::if_else( + .data$season <= 2015 & .data$yardline_side == 'JAX', + 'JAC', .data$yardline_side + ), + time = dplyr::case_when( + id == '2012_04_NO_GB' & .data$play_id == 1085 ~ '3:34', + id == '2012_16_BUF_MIA' & .data$play_id == 2571 ~ '8:31', + TRUE ~ .data$time + ), + drive_real_start_time = as.character(.data$drive_real_start_time), + # get the safety team to ensure the correct team gets the points + # usage of base ifelse is important here for non-scoring games (i.e. early live games) + safety_team = ifelse(.data$safety == 1, .data$scoring_team_abbreviation, NA_character_), + + # scoring_team_abbreviation messed up on old Jags games so just assume it's defense team + safety_team = ifelse( + .data$safety == 1 & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), + ifelse(.data$posteam == .data$home_team, .data$away_team, .data$home_team), .data$safety_team ) - plays <- raw_data$data$viewer$gameDetail$plays %>% dplyr::mutate(game_id = as.character(game_id)) - - # We have this issue https://github.com/nflverse/nflfastR/issues/309 with 2013 postseason games - # where the driveSequenceNumber in the plays df is NA for all plays. That prevents drive information - # from being joined. - # In this case, we compute our own driveSequenceNumber by incrementing a counter depending on the - # value of driveTimeOfPossession. - # driveTimeOfPossession will be a constant value during a drive so this should actually be accurate - if (all(is.na(plays$driveSequenceNumber))){ - plays <- plays %>% - dplyr::mutate( - # First, create a trigger for cumsum - drive_trigger = dplyr::case_when( - # this is the first play of the first drive - is.na(dplyr::lag(driveTimeOfPossession)) & !is.na(driveTimeOfPossession) ~ 1, - # if driveTimeOfPossession changes, there is a new drive - dplyr::lag(driveTimeOfPossession) != driveTimeOfPossession ~ 1, - TRUE ~ 0 - ), - # Now create the drive number by accumulationg triggers - driveSequenceNumber = cumsum(drive_trigger), - # driveSequenceNumber should be NA on plays where driveTimeOfPossession is NA - driveSequenceNumber = ifelse(is.na(driveTimeOfPossession), NA_real_, driveSequenceNumber), - # drop the helper - drive_trigger = NULL - ) - } - - #fill missing posteam info for this - if ( - ((home_team %in% c("JAC", "JAX") | away_team %in% c("JAC", "JAX")) & season <= 2015) | - bad_game == 1 - ) { - plays <- plays %>% - dplyr::mutate( - possessionTeam.abbreviation = stringr::str_extract(plays$prePlayByPlay, '[A-Z]{2,3}(?=\\s)'), - possessionTeam.abbreviation = dplyr::if_else( - .data$possessionTeam.abbreviation %in% c('OUT', 'END', 'NA'), - NA_character_, .data$possessionTeam.abbreviation - ), - possessionTeam.abbreviation = dplyr::if_else( - .data$possessionTeam.abbreviation == 'JAX', 'JAC', .data$possessionTeam.abbreviation - ) - ) - - # for these old games, we're making everything JAC instead of JAX - home_team <- dplyr::if_else(home_team == "JAX", "JAC", home_team) - away_team <- dplyr::if_else(away_team == "JAX", "JAC", away_team) - } - - drives <- raw_data$data$viewer$gameDetail$drives %>% - dplyr::mutate(ydsnet = .data$yards + .data$yardsPenalized) %>% - # these are already in plays - dplyr::select( - -"possessionTeam.abbreviation", - -"possessionTeam.nickName", - -"possessionTeam.franchise.currentLogo.url" - ) %>% - janitor::clean_names() - colnames(drives) <- paste0("drive_", colnames(drives)) - - stats <- tidyr::unnest(plays %>% dplyr::select(-"yards"), cols = c("playStats")) %>% - dplyr::mutate( - yards = as.integer(.data$yards), - statId = as.numeric(.data$statId), - team.abbreviation = as.character(.data$team.abbreviation) - ) %>% - dplyr::rename( - player.esbId = "gsisPlayer.id", - player.displayName = "playerName", - teamAbbr = "team.abbreviation" - ) %>% - dplyr::select( - "playId", - "statId", - "yards", - "teamAbbr", - "player.displayName", - "player.esbId" - ) + ) %>% + dplyr::mutate_if( + .predicate = is.character, + .funs = ~dplyr::na_if(.x, "") + ) - # there was a penalty on this play so these stat IDs shouldn't exist - if (id == "2020_10_DEN_LV") { - stats <- stats %>% - dplyr::filter(!(.data$playId == 979 & .data$statId %in% c(8, 10, 79))) - } + # fix for games where home_team == away_team and fields are messed up + if (bad_game == 1) { + combined <- combined %>% + fix_bad_games() + } + + # nfl didn't fill in first downs on this game + if (id == '2018_01_ATL_PHI') { + combined <- combined %>% + dplyr::mutate( + first_down_pass = dplyr::if_else(.data$pass_attempt == 1 & .data$first_down == 1, 1, .data$first_down_pass), + first_down_rush = dplyr::if_else(.data$rush_attempt == 1 & .data$first_down == 1, 1, .data$first_down_rush), + + third_down_converted = dplyr::if_else(.data$first_down == 1 & .data$down == 3, 1, .data$third_down_converted), + fourth_down_converted = dplyr::if_else(.data$first_down == 1 & .data$down == 4, 1, .data$fourth_down_converted), - # if I don't put this here it breaks - suppressWarnings( - pbp_stats <- lapply(unique(stats$playId), sum_play_stats, stats) + third_down_failed = dplyr::if_else(.data$first_down == 0 & .data$down == 3, 1, .data$third_down_failed), + fourth_down_failed = dplyr::if_else(.data$first_down == 0 & .data$down == 4 & + .data$play_type_nfl != "FIELD_GOAL" & .data$play_type_nfl != "PUNT" & .data$play_type_nfl != "PENALTY", + 1, .data$fourth_down_failed) ) - pbp_stats <- data.table::rbindlist(pbp_stats) %>% tibble::as_tibble() - - combined <- game_info %>% - dplyr::bind_cols(plays %>% dplyr::select(-"playStats", -"game_id")) %>% - dplyr::left_join(drives, by = c("driveSequenceNumber" = "drive_order_sequence")) %>% - dplyr::left_join(pbp_stats, by = c("playId" = "play_id")) %>% - dplyr::mutate_if(is.logical, as.numeric) %>% - dplyr::mutate_if(is.integer, as.numeric) %>% - dplyr::mutate_if(is.factor, as.character) %>% - janitor::clean_names() %>% - dplyr::select(-"drive_play_count", -"drive_time_of_possession", -"next_play_type") %>% - dplyr::rename( - time = "clock_time", - play_type_nfl = "play_type", - posteam = "possession_team_abbreviation", - yardline = "yard_line", - sp = "scoring_play", - drive = "drive_sequence_number", - nfl_api_id = "game_id", - drive_play_count = "drive_play_count_2", - drive_time_of_possession = "drive_time_of_possession_2", - ydsnet = "drive_ydsnet" - ) %>% - dplyr::mutate( - posteam_id = .data$posteam, - # have to do all this nonsense to make goal_to_go and yardline_side for compatibility with later functions - yardline_side = str_split_and_extract(.data$yardline, " ", 1), - yardline_number = as.numeric(str_split_and_extract(.data$yardline, " ", 2)), - quarter_end = dplyr::if_else(stringr::str_detect(.data$play_description, "END QUARTER"), 1, 0), - game_year = as.integer(season), - season = as.integer(season), - # this is only needed for epa and dropped later - game_month = as.integer(11), - game_id = id, - play_description = .data$play_description_with_jersey_numbers, - week = week, - season_type = season_type, - play_clock = as.character(.data$play_clock), - st_play_type = as.character(.data$st_play_type), - #if JAC has the ball and scored, make them the scoring team - td_team = dplyr::if_else( - .data$season <= 2015 & .data$posteam %in% c("JAC", "JAX") & - .data$drive_how_ended_description == 'Touchdown' & !is.na(.data$td_team), - 'JAC', .data$td_team - ), - #if JAC involved in a game and defensive team score, fill in the right team - td_team = dplyr::if_else( - #game involving the jags - .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")) & - #defensive TD - .data$drive_how_ended_description != 'Touchdown' & !is.na(.data$td_team), - #if home team has ball, then away team scored, otherwise home team scored - dplyr::if_else(.data$posteam == .data$home_team, .data$away_team, .data$home_team), - .data$td_team - ), - # fix muffed punt td in JAC game - td_team = dplyr::if_else(id == "2011_14_TB_JAX" & .data$play_id == 1343, 'JAC', .data$td_team), - - # kickoff return TDs in old JAC games - td_team = dplyr::if_else(id == "2006_14_IND_JAX" & .data$play_id == 2078, 'JAC', .data$td_team), - td_team = dplyr::if_else(id == "2007_17_JAX_HOU" & .data$play_id %in% c(1907, 2042), 'HOU', .data$td_team), - td_team = dplyr::if_else(id == "2008_09_JAX_CIN" & .data$play_id == 3145, 'JAC', .data$td_team), - td_team = dplyr::if_else(id == "2009_15_IND_JAX" & .data$play_id == 1088, 'IND', .data$td_team), - td_team = dplyr::if_else(id == "2010_15_JAX_IND" & .data$play_id == 3848, 'IND', .data$td_team), - - # fill in return team for the JAX games - return_team = dplyr::if_else( - !is.na(.data$return_team) & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), - dplyr::if_else( - # if the home team has the ball, return team is away team (this is before we flip posteam for kickoffs) - .data$posteam == .data$home_team, .data$away_team, .data$home_team - ), - .data$return_team - ), - fumble_recovery_1_team = dplyr::if_else( - !is.na(.data$fumble_recovery_1_team) & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), - # assign possession based on fumble_lost - dplyr::case_when( - .data$fumble_lost == 1 & .data$posteam == .data$home_team ~ .data$away_team, - .data$fumble_lost == 1 & .data$posteam == .data$away_team ~ .data$home_team, - .data$fumble_lost == 0 & .data$posteam == .data$home_team ~ .data$home_team, - .data$fumble_lost == 0 & .data$posteam == .data$away_team ~ .data$away_team - ), - .data$fumble_recovery_1_team - ), - timeout_team = dplyr::if_else( - # if there's a timeout in the affected seasons - !is.na(.data$timeout_team) & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), - # extract from play description - # make it JAC instead of JAX to be consistent with everything else - dplyr::if_else( - stringr::str_extract(.data$play_description, "(?<=Timeout #[1-3] by )[:upper:]+") == "JAX", "JAC", stringr::str_extract(.data$play_description, "(?<=Timeout #[1-3] by )[:upper:]+") - ), - .data$timeout_team - ), - # Also fix penalty team for JAC games - penalty_team = dplyr::if_else( - # if there's a penalty_team in the affected seasons - !is.na(.data$penalty_team) & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), - # extract from play description - # make it JAC instead of JAX to be consistent with everything else - dplyr::if_else( - stringr::str_extract(.data$play_description, "(?<=PENALTY on )[:upper:]{2,3}") == "JAX", - "JAC", - stringr::str_extract(.data$play_description, "(?<=PENALTY on )[:upper:]{2,3}") - ), - .data$penalty_team - ), - yardline_side = dplyr::if_else( - .data$season <= 2015 & .data$yardline_side == 'JAX', - 'JAC', .data$yardline_side - ), - time = dplyr::case_when( - id == '2012_04_NO_GB' & .data$play_id == 1085 ~ '3:34', - id == '2012_16_BUF_MIA' & .data$play_id == 2571 ~ '8:31', - TRUE ~ .data$time - ), - drive_real_start_time = as.character(.data$drive_real_start_time), - # get the safety team to ensure the correct team gets the points - # usage of base ifelse is important here for non-scoring games (i.e. early live games) - safety_team = ifelse(.data$safety == 1, .data$scoring_team_abbreviation, NA_character_), - - # scoring_team_abbreviation messed up on old Jags games so just assume it's defense team - safety_team = ifelse( - .data$safety == 1 & .data$season <= 2015 & (.data$home_team %in% c("JAC", "JAX") | .data$away_team %in% c("JAC", "JAX")), - ifelse(.data$posteam == .data$home_team, .data$away_team, .data$home_team), .data$safety_team - ) - - ) %>% - dplyr::mutate_if( - .predicate = is.character, - .funs = ~dplyr::na_if(.x, "") - ) + } - # fix for games where home_team == away_team and fields are messed up - if (bad_game == 1) { - combined <- combined %>% - fix_bad_games() - } - - # nfl didn't fill in first downs on this game - if (id == '2018_01_ATL_PHI') { - combined <- combined %>% - dplyr::mutate( - first_down_pass = dplyr::if_else(.data$pass_attempt == 1 & .data$first_down == 1, 1, .data$first_down_pass), - first_down_rush = dplyr::if_else(.data$rush_attempt == 1 & .data$first_down == 1, 1, .data$first_down_rush), - - third_down_converted = dplyr::if_else(.data$first_down == 1 & .data$down == 3, 1, .data$third_down_converted), - fourth_down_converted = dplyr::if_else(.data$first_down == 1 & .data$down == 4, 1, .data$fourth_down_converted), - - third_down_failed = dplyr::if_else(.data$first_down == 0 & .data$down == 3, 1, .data$third_down_failed), - fourth_down_failed = dplyr::if_else(.data$first_down == 0 & .data$down == 4 & - .data$play_type_nfl != "FIELD_GOAL" & .data$play_type_nfl != "PUNT" & .data$play_type_nfl != "PENALTY", - 1, .data$fourth_down_failed) - ) - } - - }, - error = function(e) { - message("The following error has occured:") - message(e) - }, - warning = function(w) { - if (warn == 1) { - message(glue::glue("Warning: The requested GameID {id} is invalid!")) - } else if (warn == 2) { - message(glue::glue("Warning: The data hosting servers are down, please try again later!")) - } else if (warn == 3) { - message(glue::glue("Warning: The requested GameID {id} is not loaded yet, please try again later!")) - } else if (warn == 4) { - message(glue::glue("Warning: Either the requested GameID {id} is missing or you've passed an invalid path!")) - } else { - message("The following warning has occured:") - message(w) - } - }, - finally = { - } - ) return(combined) } @@ -386,14 +322,14 @@ fix_bad_games <- function(pbp) { dplyr::mutate( #if team has the ball and scored, make them the scoring team td_team = dplyr::if_else( - .data$drive_how_ended_description == 'Touchdown' & !is.na(.data$td_team), + .data$drive_how_ended_description == 'Touchdown' & !is.na(.data$td_team), .data$posteam, .data$td_team ), #if team defensive team score, fill in the right team td_team = dplyr::if_else( #game involving the jags - #defensive TD - .data$drive_how_ended_description != 'Touchdown' & !is.na(.data$td_team), + #defensive TD + .data$drive_how_ended_description != 'Touchdown' & !is.na(.data$td_team), #if home team has ball, then away team scored, otherwise home team scored dplyr::if_else(.data$posteam == .data$home_team, .data$away_team, .data$home_team), .data$td_team diff --git a/R/top-level_scraper.R b/R/top-level_scraper.R index 8e0b0954..1cf01e19 100644 --- a/R/top-level_scraper.R +++ b/R/top-level_scraper.R @@ -425,9 +425,9 @@ fast_scraper <- function(game_ids, p <- progressr::progressor(along = game_ids) pbp <- furrr::future_map_dfr(game_ids, function(x, p, ...) { if (substr(x, 1, 4) < 2001) { - plays <- get_pbp_gc(x, ...) + plays <- please_work(get_pbp_gc)(x, ...) } else { - plays <- get_pbp_nfl(x, ...) + plays <- please_work(get_pbp_nfl)(x, ...) } p(sprintf("ID=%s", as.character(x))) return(plays) From e08d4c80fa6d6c5a45c3413ebe932c99164dd52a Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 14:28:58 +0200 Subject: [PATCH 08/13] global var check note --- R/helper_scrape_nfl.R | 8 ++++---- R/utils.R | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/helper_scrape_nfl.R b/R/helper_scrape_nfl.R index 320a0c7d..493db8ea 100644 --- a/R/helper_scrape_nfl.R +++ b/R/helper_scrape_nfl.R @@ -81,15 +81,15 @@ get_pbp_nfl <- function(id, # First, create a trigger for cumsum drive_trigger = dplyr::case_when( # this is the first play of the first drive - is.na(dplyr::lag(driveTimeOfPossession)) & !is.na(driveTimeOfPossession) ~ 1, + is.na(dplyr::lag(.data$driveTimeOfPossession)) & !is.na(.data$driveTimeOfPossession) ~ 1, # if driveTimeOfPossession changes, there is a new drive - dplyr::lag(driveTimeOfPossession) != driveTimeOfPossession ~ 1, + dplyr::lag(.data$driveTimeOfPossession) != .data$driveTimeOfPossession ~ 1, TRUE ~ 0 ), # Now create the drive number by accumulationg triggers - driveSequenceNumber = cumsum(drive_trigger), + driveSequenceNumber = cumsum(.data$drive_trigger), # driveSequenceNumber should be NA on plays where driveTimeOfPossession is NA - driveSequenceNumber = ifelse(is.na(driveTimeOfPossession), NA_real_, driveSequenceNumber), + driveSequenceNumber = ifelse(is.na(.data$driveTimeOfPossession), NA_real_, .data$driveSequenceNumber), # drop the helper drive_trigger = NULL ) diff --git a/R/utils.R b/R/utils.R index 4bd4bd87..7223efd0 100644 --- a/R/utils.R +++ b/R/utils.R @@ -138,7 +138,7 @@ check_stat_ids <- function(seasons, furrr::future_map_dfr(games, function(id, stats, p, dir, skip_local){ raw_data <- load_raw_game(id, dir = dir, skip_local = skip_local) plays <- janitor::clean_names(raw_data$data$viewer$gameDetail$plays) %>% - dplyr::select("play_id", "play_stats", "desc" = play_description_with_jersey_numbers) + dplyr::select("play_id", "play_stats", "desc" = .data$play_description_with_jersey_numbers) p(sprintf("ID=%s", as.character(id))) From 435b580bd04f1fc578ab504721f52ed73b98dad1 Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Fri, 1 Sep 2023 14:29:13 +0200 Subject: [PATCH 09/13] forgot to change id naming --- R/utils.R | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/R/utils.R b/R/utils.R index 7223efd0..6b5b112e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -204,29 +204,30 @@ please_work <- function(.f, otherwise = data.frame(), quiet = FALSE){ } } - +# THIS IS CALLED FROM INSIDE get_pbp_gc AND get_pbp_nfl +# MODIFY WITH CAUTION fetch_raw <- function(game_id, dir = getOption("nflfastR.raw_directory", default = NULL)){ - season <- substr(id, 1, 4) + season <- substr(game_id, 1, 4) if (is.null(dir)) { to_load <- file.path( "https://raw.githubusercontent.com/nflverse/nflfastR-raw/master/raw", season, - paste0(id, ".rds"), + paste0(game_id, ".rds"), fsep = "/" ) fetched <- curl::curl_fetch_memory(to_load) - if (fetched$status_code == 404 & maybe_valid(id)) { - cli::cli_abort("The requested GameID {.val {id}} is not loaded yet, please try again later!") + if (fetched$status_code == 404 & maybe_valid(game_id)) { + cli::cli_abort("The requested GameID {.val {game_id}} is not loaded yet, please try again later!") } else if (fetched$status_code == 500) { cli::cli_abort("The data hosting servers are down, please try again later!") } else if (fetched$status_code == 404) { - cli::cli_abort("The requested GameID {.val {id}} is invalid!") + cli::cli_abort("The requested GameID {.val {game_id}} is invalid!") } out <- read_raw_rds(fetched$content) @@ -236,7 +237,7 @@ fetch_raw <- function(game_id, local_file <- file.path( dir, season, - paste0(id, ".rds") + paste0(game_id, ".rds") ) if (!file.exists(local_file)) { From 93f06e19e3727c6dd55d83d49b9d38e0f7a9698b Mon Sep 17 00:00:00 2001 From: mrcaseb Date: Sun, 3 Sep 2023 13:20:00 +0200 Subject: [PATCH 10/13] add raw pbp functions --- NAMESPACE | 2 + R/save_raw_pbp.R | 154 +++++++++++++++++++++++++++++++++++++++++ man/missing_raw_pbp.Rd | 38 ++++++++++ man/save_raw_pbp.Rd | 61 ++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 R/save_raw_pbp.R create mode 100644 man/missing_raw_pbp.Rd create mode 100644 man/save_raw_pbp.Rd diff --git a/NAMESPACE b/NAMESPACE index c64c9002..3247233d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,8 +18,10 @@ export(fast_scraper_roster) export(fast_scraper_schedules) export(load_pbp) export(load_player_stats) +export(missing_raw_pbp) export(nflverse_sitrep) export(report) +export(save_raw_pbp) export(update_db) import(dplyr) import(fastrmodels) diff --git a/R/save_raw_pbp.R b/R/save_raw_pbp.R new file mode 100644 index 00000000..c5ae2b4a --- /dev/null +++ b/R/save_raw_pbp.R @@ -0,0 +1,154 @@ +#' Download Raw PBP Data to Local Filesystem +#' +#' The functions [build_nflfastR_pbp()] and [fast_scraper()] support loading +#' raw pbp data from local file systems instead of Github servers. +#' This function is intended to help setting this up. It loads raw pbp data +#' and saves it in the given directory split by season in subdirectories. +#' +#' @param game_ids A vector of nflverse game IDs. +#' @param dir Path to local directory (defaults to option "nflfastR.raw_directory"). +#' nflfastR will download the raw game files split by season into one sub +#' directory per season. +#' +#' @returns The function returns a data frame with one row for each downloaded file and +#' the following columns: +#' - `success` if the HTTP request was successfully performed, regardless of the +#' response status code. This is `FALSE` in case of a network error, or in case +#' you tried to resume from a server that did not support this. A value of `NA` +#' means the download was interrupted while in progress. +#' - `status_code` the HTTP status code from the request. A successful download is +#' usually `200` for full requests or `206` for resumed requests. Anything else +#' could indicate that the downloaded file contains an error page instead of the +#' requested content. +#' - `resumefrom` the file size before the request, in case a download was resumed. +#' - `url` final url (after redirects) of the request. +#' - `destfile` downloaded file on disk. +#' - `error` if `success == FALSE` this column contains an error message. +#' - `type` the `Content-Type` response header value. +#' - `modified` the `Last-Modified` response header value. +#' - `time` total elapsed download time for this file in seconds. +#' - `headers` vector with http response headers for the request. +#' @export +#' +#' @seealso [build_nflfastR_pbp()], [missing_raw_pbp()] +#' +#' @examples +#' \donttest{ +#' # CREATE LOCAL TEMP DIRECTORY +#' local_dir <- tempdir() +#' +#' # LOAD AND SAVE A GAME TO TEMP DIRECTORY +#' save_raw_pbp("2021_20_BUF_KC", dir = local_dir) +#' +#' # REMOVE THE DIRECTORY +#' unlink(file.path(local_dir, 2021)) +#' } +save_raw_pbp <- function(game_ids, + dir = getOption("nflfastR.raw_directory", default = NULL)){ + verify_game_ids(game_ids = game_ids) + if(is.null(dir)){ + cli::cli_abort("Invalid argument {.arg dir}. Do you need to set \\ + {.code options(nflfastR.raw_directory)}?") + } else if (!dir.exists(dir)){ + cli::cli_abort("You've asked to save raw pbp to {.path {dir}} which \\ + doesn't exist. Please create it.") + } + seasons <- substr(game_ids, 1, 4) + season_folders <- file.path(dir, unique(seasons)) %>% sort() + missing_season_folders <- season_folders[!dir.exists(season_folders)] + created_folders <- vapply(missing_season_folders, dir.create, FUN.VALUE = logical(1L)) + to_load <- file.path( + "https://raw.githubusercontent.com/nflverse/nflfastR-raw/master/raw", + seasons, + paste0(game_ids, ".rds"), + fsep = "/" + ) + save_to <- file.path( + dir, seasons, paste0(game_ids, ".rds") + ) + curl::multi_download(to_load, save_to) +} + +#' Compute Missing Raw PBP Data on Local Filesystem +#' +#' Uses [nflreadr::load_schedules()] to load game IDs of finished games and +#' compares these IDs to all files saved under `dir`. +#' This function is intended to serve as input for [save_raw_pbp()]. +#' +#' @inheritParams save_raw_pbp +#' @param verbose If `TRUE`, will print number of missing game files as well as +#' oldest and most recent missing ID to console. +#' +#' @return A character vector of missing game IDs. If no files are missing, +#' returns `NULL` invisibly. +#' @export +#' +#' @seealso [save_raw_pbp()] +#' +#' @examples +#' \donttest{ +#' try( +#' missing <- missing_raw_pbp(tempdir()) +#' ) +#' } +missing_raw_pbp <- function(dir = getOption("nflfastR.raw_directory", default = NULL), + verbose = TRUE){ + if(is.null(dir)){ + cli::cli_abort("Invalid argument {.arg dir}. Do you need to set \\ + {.code options(nflfastR.raw_directory)}?") + } else if (!dir.exists(dir)){ + cli::cli_abort("You've asked to check raw pbp in {.path {dir}} which \\ + doesn't exist. Please create it.") + } + local_games <- sapply(list.files(dir, full.names = TRUE), list.files) %>% + unlist(use.names = FALSE) %>% + tools::file_path_sans_ext() + + finished_games <- nflreadr::load_schedules() %>% + dplyr::filter(!is.na(result)) %>% + dplyr::pull(game_id) + + local_missing_games <- finished_games[!finished_games %in% local_games] + + if (length(local_missing_games) == 0){ + cli::cli_alert_success("No missing games!") + return(invisible(NULL)) + } + + if (isTRUE(verbose)){ + cli::cli_alert_info( + "You are missing {length(local_missing_games)} game file{?s}. \\ + The oldest missing game is {.val {local_missing_games[[1]]}}. \\ + The most recent missing game is \\ + {.val {local_missing_games[length(local_missing_games)]}}." + ) + } + + local_missing_games +} + + +verify_game_ids <- function(game_ids){ + # game_ids <- c( + # "2021_02_LAC_KC", + # "Hello World", + # "2028_01_LAC_JAX", + # "2022_27_LAC_BUF", + # "2021_02_LAC_KAC" + # ) + season_check <- substr(game_ids, 1, 4) %in% seq.int(1999, as.integer(format(Sys.Date(), "%Y")) + 1, 1) + week_check <- as.integer(substr(game_ids, 6, 7)) %in% seq_len(22) + team_name_check <- + vapply( + str_extract_all(game_ids, "(?<=_)[:upper:]{2,3}"), + function(t) all(t %in% nflfastR::teams_colors_logos$team_abbr), + FUN.VALUE = logical(1L) + ) + combined_check <- season_check & week_check & team_name_check + + if (any(combined_check == FALSE)){ + cli::cli_abort("The game IDs {.val {game_ids[!combined_check]}} seem to be invalid!") + } + + invisible(NULL) +} diff --git a/man/missing_raw_pbp.Rd b/man/missing_raw_pbp.Rd new file mode 100644 index 00000000..af940616 --- /dev/null +++ b/man/missing_raw_pbp.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/save_raw_pbp.R +\name{missing_raw_pbp} +\alias{missing_raw_pbp} +\title{Compute Missing Raw PBP Data on Local Filesystem} +\usage{ +missing_raw_pbp( + dir = getOption("nflfastR.raw_directory", default = NULL), + verbose = TRUE +) +} +\arguments{ +\item{dir}{Path to local directory (defaults to option "nflfastR.raw_directory"). +nflfastR will download the raw game files split by season into one sub +directory per season.} + +\item{verbose}{If \code{TRUE}, will print number of missing game files as well as +oldest and most recent missing ID to console.} +} +\value{ +A character vector of missing game IDs. If no files are missing, +returns \code{NULL} invisibly. +} +\description{ +Uses \code{\link[nflreadr:load_schedules]{nflreadr::load_schedules()}} to load game IDs of finished games and +compares these IDs to all files saved under \code{dir}. +This function is intended to serve as input for \code{\link[=save_raw_pbp]{save_raw_pbp()}}. +} +\examples{ +\donttest{ +try( +missing <- missing_raw_pbp(tempdir()) +) +} +} +\seealso{ +\code{\link[=save_raw_pbp]{save_raw_pbp()}} +} diff --git a/man/save_raw_pbp.Rd b/man/save_raw_pbp.Rd new file mode 100644 index 00000000..794c9e79 --- /dev/null +++ b/man/save_raw_pbp.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/save_raw_pbp.R +\name{save_raw_pbp} +\alias{save_raw_pbp} +\title{Download Raw PBP Data to Local Filesystem} +\usage{ +save_raw_pbp( + game_ids, + dir = getOption("nflfastR.raw_directory", default = NULL) +) +} +\arguments{ +\item{game_ids}{A vector of nflverse game IDs.} + +\item{dir}{Path to local directory (defaults to option "nflfastR.raw_directory"). +nflfastR will download the raw game files split by season into one sub +directory per season.} +} +\value{ +The function returns a data frame with one row for each downloaded file and +the following columns: +\itemize{ +\item \code{success} if the HTTP request was successfully performed, regardless of the +response status code. This is \code{FALSE} in case of a network error, or in case +you tried to resume from a server that did not support this. A value of \code{NA} +means the download was interrupted while in progress. +\item \code{status_code} the HTTP status code from the request. A successful download is +usually \code{200} for full requests or \code{206} for resumed requests. Anything else +could indicate that the downloaded file contains an error page instead of the +requested content. +\item \code{resumefrom} the file size before the request, in case a download was resumed. +\item \code{url} final url (after redirects) of the request. +\item \code{destfile} downloaded file on disk. +\item \code{error} if \code{success == FALSE} this column contains an error message. +\item \code{type} the \code{Content-Type} response header value. +\item \code{modified} the \code{Last-Modified} response header value. +\item \code{time} total elapsed download time for this file in seconds. +\item \code{headers} vector with http response headers for the request. +} +} +\description{ +The functions \code{\link[=build_nflfastR_pbp]{build_nflfastR_pbp()}} and \code{\link[=fast_scraper]{fast_scraper()}} support loading +raw pbp data from local file systems instead of Github servers. +This function is intended to help setting this up. It loads raw pbp data +and saves it in the given directory split by season in subdirectories. +} +\examples{ +\donttest{ +# CREATE LOCAL TEMP DIRECTORY +local_dir <- tempdir() + +# LOAD AND SAVE A GAME TO TEMP DIRECTORY +save_raw_pbp("2021_20_BUF_KC", dir = local_dir) + +# REMOVE THE DIRECTORY +unlink(file.path(local_dir, 2021)) +} +} +\seealso{ +\code{\link[=build_nflfastR_pbp]{build_nflfastR_pbp()}}, \code{\link[=missing_raw_pbp]{missing_raw_pbp()}} +} From a18cf6f2d43f1a085579a1e5aee75e437939f3ae Mon Sep 17 00:00:00 2001 From: mrcaseb Date: Sun, 3 Sep 2023 13:22:43 +0200 Subject: [PATCH 11/13] update pkgdown index --- pkgdown/_pkgdown.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index dbc69f29..eea40969 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -128,6 +128,8 @@ reference: - load_player_stats - title: Utility Functions contents: + - save_raw_pbp + - missing_raw_pbp - calculate_expected_points - calculate_win_probability - calculate_player_stats From f6fb1366ab35d54c8d7cffaf9f7bcf48172479ea Mon Sep 17 00:00:00 2001 From: mrcaseb Date: Sun, 3 Sep 2023 19:35:43 +0200 Subject: [PATCH 12/13] document dir argument across functions --- R/build_nflfastR_pbp.R | 3 ++- R/top-level_scraper.R | 13 +++++++++---- man/build_nflfastR_pbp.Rd | 12 +++++++++++- man/fast_scraper.Rd | 13 ++++++++++++- man/load_player_stats.Rd | 2 +- 5 files changed, 35 insertions(+), 8 deletions(-) diff --git a/R/build_nflfastR_pbp.R b/R/build_nflfastR_pbp.R index b401e835..ab4f451c 100644 --- a/R/build_nflfastR_pbp.R +++ b/R/build_nflfastR_pbp.R @@ -55,6 +55,7 @@ #' } #' } build_nflfastR_pbp <- function(game_ids, + dir = getOption("nflfastR.raw_directory", default = NULL), ..., decode = TRUE, rules = TRUE) { @@ -74,7 +75,7 @@ build_nflfastR_pbp <- function(game_ids, cli::cli_ul("{my_time()} | Start download of {game_count} game{?s}...") - ret <- fast_scraper(game_ids = game_ids, ..., in_builder = builder) %>% + ret <- fast_scraper(game_ids = game_ids, dir = dir, ..., in_builder = builder) %>% clean_pbp(in_builder = builder) %>% add_qb_epa(in_builder = builder) %>% add_xyac(in_builder = builder) %>% diff --git a/R/top-level_scraper.R b/R/top-level_scraper.R index 1cf01e19..c191445a 100644 --- a/R/top-level_scraper.R +++ b/R/top-level_scraper.R @@ -16,6 +16,9 @@ #' #' @param game_ids Vector of character ids or a data frame including the variable #' `game_id` (see details for further information). +#' @param dir Path to local directory (defaults to option "nflfastR.raw_directory") +#' where nflfastR searches for raw game play-by-play data. +#' See [save_raw_pbp()] for additional information. #' @param ... Additional arguments passed to the scraping functions (for internal use) #' @param in_builder If \code{TRUE}, the final message will be suppressed (for usage inside of \code{\link{build_nflfastR_pbp}}). #' @details To load valid game_ids please use the package function @@ -23,6 +26,7 @@ #' output of that function) #' @seealso For information on parallel processing and progress updates please #' see [nflfastR]. +#' @seealso [build_nflfastR_pbp()], [save_raw_pbp()] #' @return Data frame where each individual row represents a single play for #' all passed game_ids containing the following #' detailed information (description partly extracted from nflscrapR): @@ -404,6 +408,7 @@ #' } #' } fast_scraper <- function(game_ids, + dir = getOption("nflfastR.raw_directory", default = NULL), ..., in_builder = FALSE) { @@ -423,15 +428,15 @@ fast_scraper <- function(game_ids, suppressWarnings({ p <- progressr::progressor(along = game_ids) - pbp <- furrr::future_map_dfr(game_ids, function(x, p, ...) { + pbp <- furrr::future_map_dfr(game_ids, function(x, p, dir, ...) { if (substr(x, 1, 4) < 2001) { - plays <- please_work(get_pbp_gc)(x, ...) + plays <- please_work(get_pbp_gc)(x, dir = dir, ...) } else { - plays <- please_work(get_pbp_nfl)(x, ...) + plays <- please_work(get_pbp_nfl)(x, dir = dir, ...) } p(sprintf("ID=%s", as.character(x))) return(plays) - }, p, ...) + }, p, dir = dir, ...) if (length(pbp) != 0) { user_message("Download finished. Adding variables...", "done") diff --git a/man/build_nflfastR_pbp.Rd b/man/build_nflfastR_pbp.Rd index 83f993b2..fe4c7a35 100644 --- a/man/build_nflfastR_pbp.Rd +++ b/man/build_nflfastR_pbp.Rd @@ -4,12 +4,22 @@ \alias{build_nflfastR_pbp} \title{Build a Complete nflfastR Data Set} \usage{ -build_nflfastR_pbp(game_ids, ..., decode = TRUE, rules = TRUE) +build_nflfastR_pbp( + game_ids, + dir = getOption("nflfastR.raw_directory", default = NULL), + ..., + decode = TRUE, + rules = TRUE +) } \arguments{ \item{game_ids}{Vector of character ids or a data frame including the variable \code{game_id} (see details for further information).} +\item{dir}{Path to local directory (defaults to option "nflfastR.raw_directory") +where nflfastR searches for raw game play-by-play data. +See \code{\link[=save_raw_pbp]{save_raw_pbp()}} for additional information.} + \item{...}{Additional arguments passed to the scraping functions (for internal use)} \item{decode}{If \code{TRUE}, the function \code{\link[=decode_player_ids]{decode_player_ids()}} will be executed.} diff --git a/man/fast_scraper.Rd b/man/fast_scraper.Rd index ec07bb64..473cdf4d 100644 --- a/man/fast_scraper.Rd +++ b/man/fast_scraper.Rd @@ -4,12 +4,21 @@ \alias{fast_scraper} \title{Get NFL Play by Play Data} \usage{ -fast_scraper(game_ids, ..., in_builder = FALSE) +fast_scraper( + game_ids, + dir = getOption("nflfastR.raw_directory", default = NULL), + ..., + in_builder = FALSE +) } \arguments{ \item{game_ids}{Vector of character ids or a data frame including the variable \code{game_id} (see details for further information).} +\item{dir}{Path to local directory (defaults to option "nflfastR.raw_directory") +where nflfastR searches for raw game play-by-play data. +See \code{\link[=save_raw_pbp]{save_raw_pbp()}} for additional information.} + \item{...}{Additional arguments passed to the scraping functions (for internal use)} \item{in_builder}{If \code{TRUE}, the final message will be suppressed (for usage inside of \code{\link{build_nflfastR_pbp}}).} @@ -410,4 +419,6 @@ future::plan("sequential") \seealso{ For information on parallel processing and progress updates please see \link{nflfastR}. + +\code{\link[=build_nflfastR_pbp]{build_nflfastR_pbp()}}, \code{\link[=save_raw_pbp]{save_raw_pbp()}} } diff --git a/man/load_player_stats.Rd b/man/load_player_stats.Rd index 9ac25071..42cc6479 100644 --- a/man/load_player_stats.Rd +++ b/man/load_player_stats.Rd @@ -11,7 +11,7 @@ load_player_stats(...) Arguments passed on to \code{\link[nflreadr:load_player_stats]{nflreadr::load_player_stats}} \describe{ \item{\code{seasons}}{a numeric vector of seasons to return, defaults to most recent season. If set to \code{TRUE}, returns all available data.} - \item{\code{stat_type}}{one of \code{offense} or \code{kicking}} + \item{\code{stat_type}}{one of \code{"offense"}, \code{"defense"}, or \code{"kicking"}} \item{\code{file_type}}{One of \code{c("rds", "qs", "csv", "parquet")}. Can also be set globally with \code{options(nflreadr.prefer)}} }} From ab7ab10eef20699d2d1c1b7b95a6db64193f6428 Mon Sep 17 00:00:00 2001 From: mrcaseb Date: Sun, 3 Sep 2023 19:48:35 +0200 Subject: [PATCH 13/13] version and news bullet --- DESCRIPTION | 2 +- NEWS.md | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3a91583e..fa870c65 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: nflfastR Title: Functions to Efficiently Access NFL Play by Play Data -Version: 4.5.1.9008 +Version: 4.5.1.9009 Authors@R: c(person(given = "Sebastian", family = "Carl", diff --git a/NEWS.md b/NEWS.md index 495dd703..10b96fd4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,9 @@ - `fixed_drive` now correctly increments on plays where posteam lost a fumble but remains posteam because defteam also lost a fumble during the same play. (#419) - nflfastR now fixes missing drive number counts in raw pbp data in order to provide accurate drive information. (#420) - nflfastR now returns correct `kick_distance` on all punts and kickoffs. (#422) +- nflfastR now fully supports loading raw pbp data from local file system. The best way to use this feature is to set `options("nflfastR.raw_directory" = {"your/local/directory"})`. Alternatively, both `build_nflfastR_pbp()` and `fast_scraper()` support the argument `dir` which defaults to the above option. (#423) +- Added the new function `save_raw_pbp()` which efficiently downloads raw play-by-play data and saves it to the local file system. This serves as a helper to setup the system for faster play-by-play parsing via the above functionality. (#423) +- Added the new function `missing_raw_pbp()` that computes a vector of game IDs missing in the local raw play-by-play directory. (#423) # nflfastR 4.5.1