diff --git a/.github/workflows/update_advanced_stats.yaml b/.github/workflows/update_advanced_stats.yaml
index b720459e..c680cd40 100644
--- a/.github/workflows/update_advanced_stats.yaml
+++ b/.github/workflows/update_advanced_stats.yaml
@@ -24,7 +24,7 @@ jobs:
       R_KEEP_PKG_SOURCE: yes
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - uses: r-lib/actions/setup-r@v2
         with:
@@ -45,12 +45,3 @@ jobs:
 
       - name: Run update script
         run: Rscript -e 'source("auto/update_adv_season_stats.R")'
-
-      # - name: Commit updated data
-      #   run: |
-      #     git config --local user.email "actions@github.com"
-      #     git config --local user.name "GitHub Actions"
-      #     git pull
-      #     git add data/adv_stats
-      #     git commit -m "Automated advanced stats scrape `date`" || echo "No changes to commit"
-      #     git push || echo "No changes to commit"
diff --git a/DESCRIPTION b/DESCRIPTION
index c8364cd7..de6eab3b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -37,4 +37,4 @@ Remotes:
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.3
diff --git a/NAMESPACE b/NAMESPACE
index dc8b7525..6ed508bb 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,9 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(pfr_advanced_defense_season)
+export(pfr_advanced_receiving_season)
+export(pfr_advanced_rushing_season)
+export(pfr_advanced_stat_season)
 export(pfr_game_adv_stats)
 importFrom(magrittr,"%>%")
diff --git a/R/pfr_advanced_stats.R b/R/pfr_advanced_stats.R
new file mode 100644
index 00000000..6694f862
--- /dev/null
+++ b/R/pfr_advanced_stats.R
@@ -0,0 +1,104 @@
+#' Scrape PFR Advanced Stats on Season Level
+#'
+#' @param s Season to scrape
+#' @param type Stat type
+#'
+#' @return A tibble
+#' @export
+pfr_advanced_stat_season <- function(s, type = c("receiving", "rushing", "defense")) {
+
+  # Restrict `type` to one of the supported choices (default: the first one)
+  type <- rlang::arg_match(type)
+
+  cli::cli_progress_step("Load advanced {.val {type}} {.val {s}}")
+
+  # Load html and extract relevant part from comments -----------------------
+
+  # The table is taken from the comment node inside the `all_advanced_{type}`
+  # div, so the comment text is re-parsed as HTML before it is queried
+  raw_url <- glue::glue("https://www.pro-football-reference.com/years/{s}/{type}_advanced.htm")
+  raw_html <- rvest::read_html(raw_url)
+  tbl_html <- xml2::xml_find_all(raw_html, xpath = glue::glue("//div[@id='all_advanced_{type}']/comment()")) |>
+    rvest::html_text() |>
+    xml2::read_html()
+
+
+  # Extract Player Information ----------------------------------------------
+
+  player_elements <- tbl_html |>
+    xml2::xml_find_all("//td[@data-append-csv]")
+
+  # Keep one lookup row per (pfr_id, player) pair: a player that occupies
+  # several table rows would otherwise be listed repeatedly and multiply the
+  # matching rows in the name-based join below.
+  # NOTE(review): different players sharing a name still match each other.
+  players <- tibble::tibble(
+    pfr_id = xml2::xml_attr(player_elements, "data-append-csv"),
+    player = xml2::xml_text(player_elements)
+  ) |>
+    dplyr::distinct()
+
+
+  # Extract actual Data -----------------------------------------------------
+
+  out <- rvest::html_table(tbl_html) |>
+    purrr::pluck(1) |>
+    .check_pfr_stats_names() |>
+    janitor::clean_names() |>
+    # drop rows whose rank cell merely repeats the "Rk" header label
+    dplyr::filter(rk != "Rk") |>
+    # attach the pfr ids through the (still unmodified) player name
+    dplyr::left_join(players, by = "player") |>
+    dplyr::mutate(
+      tm = nflreadr::clean_team_abbrs(tm),
+      season = s,
+      loaded = lubridate::today()
+    ) |>
+    dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |>
+    dplyr::mutate(
+      dplyr::across(
+        .cols = tidyselect::where(is.character),
+        .fns = ~ dplyr::na_if(., "")
+      ),
+      dplyr::across(
+        .cols = tidyselect::contains("percent"),
+        .fns = function(x) as.numeric(sub("%","",x)) / 100
+      ),
+      # every column except the id/label columns below is coerced to numeric
+      dplyr::across(
+        .cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")),
+        .fns = as.numeric
+      ),
+      # strip the "+" and "*" markers before normalising the player names
+      player = stringr::str_remove_all(player, "\\+|\\*"),
+      player = nflreadr::clean_player_names(player),
+      pos = toupper(pos)
+    )
+
+  cli::cli_progress_done()
+
+  out
+}
+
+# Normalise column names. If the parsed table has no `Rk` column, the first
+# data row is promoted to column names before cleaning.
+.check_pfr_stats_names <- function(df){
+  if ("Rk" %in% names(df)){
+    janitor::clean_names(df)
+  } else {
+    janitor::row_to_names(df, 1) |>
+      janitor::clean_names()
+  }
+}
+
+#' @export
+#' @rdname pfr_advanced_stat_season
+pfr_advanced_receiving_season <- function(s) pfr_advanced_stat_season(s = s, type = "receiving")
+
+#' @export
+#' @rdname pfr_advanced_stat_season
+pfr_advanced_rushing_season <- function(s) pfr_advanced_stat_season(s = s, type = "rushing") + +#' @export +#' @rdname pfr_advanced_stat_season +pfr_advanced_defense_season <- function(s) pfr_advanced_stat_season(s = s, type = "defense") diff --git a/auto/defense.R b/auto/defense.R index c52269a3..edc05ad1 100644 --- a/auto/defense.R +++ b/auto/defense.R @@ -1,59 +1,37 @@ -library(rvest) - -get_def_season <- function(s) { - - cli::cli_process_start("Load DEF {.val {s}}") - - raw_url <- glue::glue("https://widgets.sports-reference.com/wg.fcgi?css=1&site", - "=pfr&url=%2Fyears%2F{s}%2Fdefense_advanced.htm&div=div_advanced_defense") - - raw_html <- read_html(raw_url) - tbl_html <- html_element(raw_html, xpath = '//*[@id="advanced_defense"]') - - # The "data-append-csv" attribut of the dt tags inherits the pfr player ids - ids <- tbl_html |> - html_elements("td") |> - html_attr("data-append-csv") |> - na.omit() - - df <- html_table(tbl_html) - names(df) <- as.character(df[1, ]) - - suppressWarnings({ - out <- df |> - janitor::clean_names() |> - dplyr::filter(rk != "Rk") |> - dplyr::mutate( - pfr_id = ids, - tm = nflreadr::clean_team_abbrs(tm), - season = s, - loaded = lubridate::today() - ) |> - dplyr::na_if("") |> - dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |> - dplyr::mutate( - dplyr::across( - .cols = tidyselect::contains("percent"), - .fns = function(x) as.numeric(sub("%","",x)) / 100 - ), - dplyr::across( - .cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")), - .fns = as.numeric - ), - player = stringr::str_remove_all(player, "\\+|\\*"), - pos = toupper(pos) - ) - }) +if(Sys.getenv("NFLVERSE_REBUILD", "false") == "true"){ + seasons_to_update <- 2018:nflreadr::most_recent_season() +} else { + seasons_to_update <- nflreadr::most_recent_season() +} - cli::cli_process_done() +purrr::walk( + seasons_to_update, + purrr::possibly(function(season){ + nflversedata::nflverse_save( + data_frame = 
pfr_advanced_defense_season(season), + file_name = glue::glue("advstats_season_def_{season}"), + nflverse_type = "advanced defense season stats via PFR", + release_tag = "pfr_advstats", + file_types = "rds" + ) + }, quiet = FALSE + ) +) - out -} +## NOW COMBINE ALL SEASONS FOR THE FILE nflreadr IS LOADING -df_advstats <- purrr::map_df(2018:nflreadr:::most_recent_season(), get_def_season) +combined_advstats <- purrr::map( + 2018:nflreadr::most_recent_season(), + purrr::possibly(function(season){ + load_from <- glue::glue("https://github.com/nflverse/nflverse-data/releases/download/pfr_advstats/advstats_season_def_{season}.rds") + nflreadr::rds_from_url(load_from) + }, tibble::tibble(), quiet = FALSE), + .progress = TRUE +) |> + purrr::list_rbind() nflversedata::nflverse_save( - data_frame = df_advstats, + data_frame = combined_advstats, file_name = "advstats_season_def", nflverse_type = "advanced defense season stats via PFR", release_tag = "pfr_advstats" diff --git a/auto/passing.R b/auto/passing.R index 623ff198..21920a8f 100644 --- a/auto/passing.R +++ b/auto/passing.R @@ -134,11 +134,40 @@ get_passing <- function(s) { out } -# data seem spotty before 2019 -df_advstats <- purrr::map_df(2018:nflreadr:::most_recent_season(), purrr::possibly(get_passing,tibble::tibble())) +if(Sys.getenv("NFLVERSE_REBUILD", "false") == "true"){ + seasons_to_update <- 2018:nflreadr::most_recent_season() +} else { + seasons_to_update <- nflreadr::most_recent_season() +} + +purrr::walk( + seasons_to_update, + purrr::possibly(function(season){ + nflversedata::nflverse_save( + data_frame = get_passing(season), + file_name = glue::glue("advstats_season_pass_{season}"), + nflverse_type = "advanced passing season stats via PFR", + release_tag = "pfr_advstats", + file_types = "rds" + ) + }, quiet = FALSE + ) +) + +## NOW COMBINE ALL SEASONS FOR THE FILE nflreadr IS LOADING + +combined_advstats <- purrr::map( + 2018:nflreadr::most_recent_season(), + purrr::possibly(function(season){ + 
load_from <- glue::glue("https://github.com/nflverse/nflverse-data/releases/download/pfr_advstats/advstats_season_pass_{season}.rds") + nflreadr::rds_from_url(load_from) + }, tibble::tibble(), quiet = FALSE), + .progress = TRUE +) |> + purrr::list_rbind() nflversedata::nflverse_save( - data_frame = df_advstats, + data_frame = combined_advstats, file_name = "advstats_season_pass", nflverse_type = "advanced passing season stats via PFR", release_tag = "pfr_advstats" diff --git a/auto/receiving.R b/auto/receiving.R index f320ba04..e9a85350 100644 --- a/auto/receiving.R +++ b/auto/receiving.R @@ -1,61 +1,38 @@ -library(rvest) - -get_rec_season <- function(s) { - - cli::cli_process_start("Load REC {.val {s}}") - - raw_url <- glue::glue("https://widgets.sports-reference.com/wg.fcgi?css=1&site", - "=pfr&url=%2Fyears%2F{s}%2Freceiving_advanced.htm&div=div_advanced_receiving") - - raw_html <- read_html(raw_url) - tbl_html <- html_element(raw_html, xpath = '//*[@id="advanced_receiving"]') - - # The "data-append-csv" attribut of the dt tags inherits the pfr player ids - ids <- tbl_html |> - html_elements("td") |> - html_attr("data-append-csv") |> - na.omit() - - df <- html_table(tbl_html) +if(Sys.getenv("NFLVERSE_REBUILD", "false") == "true"){ + seasons_to_update <- 2018:nflreadr::most_recent_season() +} else { + seasons_to_update <- nflreadr::most_recent_season() +} - suppressWarnings({ - out <- df |> - janitor::clean_names() |> - dplyr::filter(rk != "Rk") |> - dplyr::mutate( - pfr_id = ids, - tm = nflreadr::clean_team_abbrs(tm), - season = s, - loaded = lubridate::today() - ) |> - dplyr::na_if("") |> - dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |> - dplyr::mutate( - dplyr::across( - .cols = tidyselect::contains("percent"), - .fns = function(x) as.numeric(sub("%","",x)) / 100 - ), - dplyr::across( - .cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")), - .fns = as.numeric - ), - player = stringr::str_remove_all(player, "\\+|\\*"), - 
pos = toupper(pos) - ) - }) +purrr::walk( + seasons_to_update, + purrr::possibly(function(season){ + nflversedata::nflverse_save( + data_frame = pfr_advanced_receiving_season(season), + file_name = glue::glue("advstats_season_rec_{season}"), + nflverse_type = "advanced receiving season stats via PFR", + release_tag = "pfr_advstats", + file_types = "rds" + ) + }, quiet = FALSE + ) +) - cli::cli_process_done() +## NOW COMBINE ALL SEASONS FOR THE FILE nflreadr IS LOADING - out -} - -# data seem spotty before 2019 -df_advstats <- purrr::map_df(2018:nflreadr:::most_recent_season(), get_rec_season) +combined_advstats <- purrr::map( + 2018:nflreadr::most_recent_season(), + purrr::possibly(function(season){ + load_from <- glue::glue("https://github.com/nflverse/nflverse-data/releases/download/pfr_advstats/advstats_season_rec_{season}.rds") + nflreadr::rds_from_url(load_from) + }, tibble::tibble(), quiet = FALSE), + .progress = TRUE +) |> + purrr::list_rbind() nflversedata::nflverse_save( - data_frame = df_advstats, + data_frame = combined_advstats, file_name = "advstats_season_rec", nflverse_type = "advanced receiving season stats via PFR", release_tag = "pfr_advstats" ) - diff --git a/auto/rushing.R b/auto/rushing.R index 219ea142..34c18ab2 100644 --- a/auto/rushing.R +++ b/auto/rushing.R @@ -1,60 +1,37 @@ -library(rvest) - -get_rush_season <- function(s) { - - cli::cli_process_start("Load RUSH {.val {s}}") - - raw_url <- glue::glue("https://widgets.sports-reference.com/wg.fcgi?css=1&site", - "=pfr&url=%2Fyears%2F{s}%2Frushing_advanced.htm&div=div_advanced_rushing") - - raw_html <- read_html(raw_url) - tbl_html <- html_element(raw_html, xpath = '//*[@id="advanced_rushing"]') - - # The "data-append-csv" attribut of the dt tags inherits the pfr player ids - ids <- tbl_html |> - html_elements("td") |> - html_attr("data-append-csv") |> - na.omit() - - df <- html_table(tbl_html) - names(df) <- as.character(df[1, ]) - - suppressWarnings({ - out <- df |> - janitor::clean_names() 
|> - dplyr::filter(rk != "Rk") |> - dplyr::mutate( - pfr_id = ids, - tm = nflreadr::clean_team_abbrs(tm), - season = s, - loaded = lubridate::today() - ) |> - dplyr::na_if("") |> - dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |> - dplyr::mutate( - dplyr::across( - .cols = tidyselect::contains("percent"), - .fns = function(x) as.numeric(sub("%","",x)) / 100 - ), - dplyr::across( - .cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")), - .fns = as.numeric - ), - player = stringr::str_remove_all(player, "\\+|\\*"), - pos = toupper(pos) - ) - }) +if(Sys.getenv("NFLVERSE_REBUILD", "false") == "true"){ + seasons_to_update <- 2018:nflreadr::most_recent_season() +} else { + seasons_to_update <- nflreadr::most_recent_season() +} - cli::cli_process_done() +purrr::walk( + seasons_to_update, + purrr::possibly(function(season){ + nflversedata::nflverse_save( + data_frame = pfr_advanced_rushing_season(season), + file_name = glue::glue("advstats_season_rush_{season}"), + nflverse_type = "advanced rushing season stats via PFR", + release_tag = "pfr_advstats", + file_types = "rds" + ) + }, quiet = FALSE + ) +) - out -} +## NOW COMBINE ALL SEASONS FOR THE FILE nflreadr IS LOADING -# data seem spotty before 2019 -df_advstats <- purrr::map_df(2018:nflreadr:::most_recent_season(), get_rush_season) +combined_advstats <- purrr::map( + 2018:nflreadr::most_recent_season(), + purrr::possibly(function(season){ + load_from <- glue::glue("https://github.com/nflverse/nflverse-data/releases/download/pfr_advstats/advstats_season_rush_{season}.rds") + nflreadr::rds_from_url(load_from) + }, tibble::tibble(), quiet = FALSE), + .progress = TRUE +) |> + purrr::list_rbind() nflversedata::nflverse_save( - data_frame = df_advstats, + data_frame = combined_advstats, file_name = "advstats_season_rush", nflverse_type = "advanced rushing season stats via PFR", release_tag = "pfr_advstats" diff --git a/auto/update_adv_season_stats.R b/auto/update_adv_season_stats.R index 
ca5ff0d9..e4700254 100644 --- a/auto/update_adv_season_stats.R +++ b/auto/update_adv_season_stats.R @@ -1,6 +1,5 @@ +pkgload::load_all() source("auto/rushing.R") source("auto/receiving.R") source("auto/defense.R") source("auto/passing.R") - -# list.files("build", full.names = TRUE) |> nflversedata::nflverse_upload("pfr_advstats") diff --git a/man/pfr_advanced_stat_season.Rd b/man/pfr_advanced_stat_season.Rd new file mode 100644 index 00000000..af07862d --- /dev/null +++ b/man/pfr_advanced_stat_season.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pfr_advanced_stats.R +\name{pfr_advanced_stat_season} +\alias{pfr_advanced_stat_season} +\alias{pfr_advanced_receiving_season} +\alias{pfr_advanced_rushing_season} +\alias{pfr_advanced_defense_season} +\title{Scrape PFR Advanced Stats on Season Level} +\usage{ +pfr_advanced_stat_season(s, type = c("receiving", "rushing", "defense")) + +pfr_advanced_receiving_season(s) + +pfr_advanced_rushing_season(s) + +pfr_advanced_defense_season(s) +} +\arguments{ +\item{s}{Season to scrape} + +\item{type}{Stat type} +} +\value{ +A tibble +} +\description{ +Scrape PFR Advanced Stats on Season Level +} diff --git a/man/pfr_game_urls.Rd b/man/pfr_game_urls.Rd new file mode 100644 index 00000000..8bf0f5ef --- /dev/null +++ b/man/pfr_game_urls.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/game_urls.R +\name{pfr_game_urls} +\alias{pfr_game_urls} +\title{Game IDs and URLs} +\usage{ +pfr_game_urls(season = 2021) +} +\description{ +Game IDs and URLs +}