-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #34 from nflverse/advanced-season-stats
Rewrite advanced season stats scraper
- Loading branch information
Showing
11 changed files
with
255 additions
and
171 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,7 +24,7 @@ jobs: | |
R_KEEP_PKG_SOURCE: yes | ||
|
||
steps: | ||
- uses: actions/checkout@v2 | ||
- uses: actions/checkout@v3 | ||
|
||
- uses: r-lib/actions/setup-r@v2 | ||
with: | ||
|
@@ -45,12 +45,3 @@ jobs: | |
|
||
- name: Run update script | ||
run: Rscript -e 'source("auto/update_adv_season_stats.R")' | ||
|
||
# - name: Commit updated data | ||
# run: | | ||
# git config --local user.email "[email protected]" | ||
# git config --local user.name "GitHub Actions" | ||
# git pull | ||
# git add data/adv_stats | ||
# git commit -m "Automated advanced stats scrape `date`" || echo "No changes to commit" | ||
# git push || echo "No changes to commit" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,4 +37,4 @@ Remotes: | |
Encoding: UTF-8 | ||
LazyData: true | ||
Roxygen: list(markdown = TRUE) | ||
RoxygenNote: 7.1.1 | ||
RoxygenNote: 7.2.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,9 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export("%>%") | ||
export(pfr_advanced_defense_season) | ||
export(pfr_advanced_receiving_season) | ||
export(pfr_advanced_rushing_season) | ||
export(pfr_advanced_stat_season) | ||
export(pfr_game_adv_stats) | ||
importFrom(magrittr,"%>%") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#' Scrape PFR Advanced Stats on Season Level
#'
#' Downloads the season-level advanced stats page of one stat type from
#' Pro Football Reference, parses the stats table that the page keeps inside
#' an HTML comment, and returns it as a cleaned tibble with the PFR player ids
#' attached.
#'
#' @param s Season to scrape. It is interpolated into the page URL and stored
#'   in the `season` column.
#' @param type Stat type. One of `"receiving"`, `"rushing"` or `"defense"`.
#'   The first value is used when the argument is not supplied.
#'
#' @return A tibble whose first columns are `season`, `player` and `pfr_id`.
#'   Columns whose name contains "percent" are converted to fractions, and
#'   every column except `player`, `pfr_id`, `tm`, `pos` and `loaded` is
#'   converted to numeric. `loaded` holds the date of the scrape.
#' @export
pfr_advanced_stat_season <- function(s, type = c("receiving", "rushing", "defense")) {
  type <- rlang::arg_match(type)

  cli::cli_progress_step("Load advanced {.val {type}} {.val {s}}")

  # Load html and extract relevant part from comments -----------------------

  raw_url <- glue::glue("https://www.pro-football-reference.com/years/{s}/{type}_advanced.htm")
  raw_html <- rvest::read_html(raw_url)

  # The table markup sits inside an HTML comment of the wrapper div, so the
  # comment text has to be parsed as a document of its own.
  comment_xpath <- glue::glue("//div[@id='all_advanced_{type}']/comment()")
  tbl_comment <- xml2::xml_find_all(raw_html, xpath = comment_xpath)

  # Fail with a readable message instead of an obscure parser error when the
  # page does not contain the expected comment node.
  if (length(tbl_comment) == 0) {
    cli::cli_abort("No advanced {.val {type}} table found at {.url {raw_url}}")
  }

  tbl_html <- tbl_comment |>
    rvest::html_text() |>
    xml2::read_html()

  # Extract Player Information ----------------------------------------------

  player_elements <- xml2::xml_find_all(tbl_html, "//td[@data-append-csv]")

  players <- tibble::tibble(
    pfr_id = xml2::xml_attr(player_elements, "data-append-csv"),
    player = xml2::xml_text(player_elements)
  )

  # Extract actual Data -----------------------------------------------------

  # .check_pfr_stats_names() returns cleaned (snake_case) names in both of its
  # branches; the filter drops header rows that are repeated inside the table.
  stats <- rvest::html_table(tbl_html) |>
    purrr::pluck(1) |>
    .check_pfr_stats_names() |>
    dplyr::filter(rk != "Rk")

  # A display name is not a unique key: if a name occurs more than once in
  # `players`, joining by name multiplies the matching stat rows. When the id
  # cells line up one-to-one with the data rows (same names, same order) the
  # ids are attached by position, which cannot duplicate rows. Otherwise fall
  # back to the join, de-duplicating identical (id, name) pairs first.
  if (identical(stats[["player"]], players$player)) {
    stats <- dplyr::mutate(stats, pfr_id = players$pfr_id)
  } else {
    stats <- dplyr::left_join(stats, dplyr::distinct(players), by = "player")
  }

  out <- stats |>
    dplyr::mutate(
      tm = nflreadr::clean_team_abbrs(tm),
      season = s,
      loaded = lubridate::today()
    ) |>
    dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |>
    dplyr::mutate(
      # Empty strings become NA before any numeric conversion.
      dplyr::across(
        .cols = tidyselect::where(is.character),
        .fns = ~ dplyr::na_if(., "")
      ),
      # "12.3%" -> 0.123
      dplyr::across(
        .cols = tidyselect::contains("percent"),
        .fns = function(x) as.numeric(sub("%", "", x)) / 100
      ),
      dplyr::across(
        .cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")),
        .fns = as.numeric
      ),
      # Strip the "+" and "*" markers that follow some player names.
      player = stringr::str_remove_all(player, "\\+|\\*"),
      player = nflreadr::clean_player_names(player),
      pos = toupper(pos)
    )

  cli::cli_progress_done()

  out
}
|
||
# Normalise the column names of a scraped stats table.
#
# If the table already has an "Rk" column its names are used as they are.
# Otherwise the first data row is promoted to be the header. In both cases
# the names are cleaned with janitor::clean_names() before returning.
.check_pfr_stats_names <- function(df) {
  header_in_names <- "Rk" %in% names(df)
  named <- if (header_in_names) df else janitor::row_to_names(df, 1)
  janitor::clean_names(named)
}
|
||
#' @export
#' @rdname pfr_advanced_stat_season
pfr_advanced_receiving_season <- function(s) {
  pfr_advanced_stat_season(s = s, type = "receiving")
}
|
||
#' @export
#' @rdname pfr_advanced_stat_season
pfr_advanced_rushing_season <- function(s) {
  pfr_advanced_stat_season(s = s, type = "rushing")
}
|
||
#' @export
#' @rdname pfr_advanced_stat_season
pfr_advanced_defense_season <- function(s) {
  pfr_advanced_stat_season(s = s, type = "defense")
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,61 +1,38 @@ | ||
library(rvest) | ||
|
||
get_rec_season <- function(s) { | ||
|
||
cli::cli_process_start("Load REC {.val {s}}") | ||
|
||
raw_url <- glue::glue("https://widgets.sports-reference.com/wg.fcgi?css=1&site", | ||
"=pfr&url=%2Fyears%2F{s}%2Freceiving_advanced.htm&div=div_advanced_receiving") | ||
|
||
raw_html <- read_html(raw_url) | ||
tbl_html <- html_element(raw_html, xpath = '//*[@id="advanced_receiving"]') | ||
|
||
# The "data-append-csv" attribute of the td tags holds the pfr player ids | ||
ids <- tbl_html |> | ||
html_elements("td") |> | ||
html_attr("data-append-csv") |> | ||
na.omit() | ||
|
||
df <- html_table(tbl_html) | ||
if(Sys.getenv("NFLVERSE_REBUILD", "false") == "true"){ | ||
seasons_to_update <- 2018:nflreadr::most_recent_season() | ||
} else { | ||
seasons_to_update <- nflreadr::most_recent_season() | ||
} | ||
|
||
suppressWarnings({ | ||
out <- df |> | ||
janitor::clean_names() |> | ||
dplyr::filter(rk != "Rk") |> | ||
dplyr::mutate( | ||
pfr_id = ids, | ||
tm = nflreadr::clean_team_abbrs(tm), | ||
season = s, | ||
loaded = lubridate::today() | ||
) |> | ||
dplyr::na_if("") |> | ||
dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |> | ||
dplyr::mutate( | ||
dplyr::across( | ||
.cols = tidyselect::contains("percent"), | ||
.fns = function(x) as.numeric(sub("%","",x)) / 100 | ||
), | ||
dplyr::across( | ||
.cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")), | ||
.fns = as.numeric | ||
), | ||
player = stringr::str_remove_all(player, "\\+|\\*"), | ||
pos = toupper(pos) | ||
) | ||
}) | ||
purrr::walk( | ||
seasons_to_update, | ||
purrr::possibly(function(season){ | ||
nflversedata::nflverse_save( | ||
data_frame = pfr_advanced_receiving_season(season), | ||
file_name = glue::glue("advstats_season_rec_{season}"), | ||
nflverse_type = "advanced receiving season stats via PFR", | ||
release_tag = "pfr_advstats", | ||
file_types = "rds" | ||
) | ||
}, quiet = FALSE | ||
) | ||
) | ||
|
||
cli::cli_process_done() | ||
## NOW COMBINE ALL SEASONS FOR THE FILE nflreadr IS LOADING | ||
|
||
out | ||
} | ||
|
||
# data seem spotty before 2019 | ||
df_advstats <- purrr::map_df(2018:nflreadr:::most_recent_season(), get_rec_season) | ||
combined_advstats <- purrr::map( | ||
2018:nflreadr::most_recent_season(), | ||
purrr::possibly(function(season){ | ||
load_from <- glue::glue("https://github.com/nflverse/nflverse-data/releases/download/pfr_advstats/advstats_season_rec_{season}.rds") | ||
nflreadr::rds_from_url(load_from) | ||
}, tibble::tibble(), quiet = FALSE), | ||
.progress = TRUE | ||
) |> | ||
purrr::list_rbind() | ||
|
||
nflversedata::nflverse_save( | ||
data_frame = df_advstats, | ||
data_frame = combined_advstats, | ||
file_name = "advstats_season_rec", | ||
nflverse_type = "advanced receiving season stats via PFR", | ||
release_tag = "pfr_advstats" | ||
) | ||
|
Oops, something went wrong.