Skip to content

Commit

Permalink
Merge pull request #34 from nflverse/advanced-season-stats
Browse files Browse the repository at this point in the history
Rewrite advanced season stats scraper
  • Loading branch information
mrcaseb authored Aug 22, 2023
2 parents eaf39bb + d54bcae commit b567329
Show file tree
Hide file tree
Showing 11 changed files with 255 additions and 171 deletions.
11 changes: 1 addition & 10 deletions .github/workflows/update_advanced_stats.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-r@v2
with:
Expand All @@ -45,12 +45,3 @@ jobs:

- name: Run update script
run: Rscript -e 'source("auto/update_adv_season_stats.R")'

# - name: Commit updated data
# run: |
# git config --local user.email "[email protected]"
# git config --local user.name "GitHub Actions"
# git pull
# git add data/adv_stats
# git commit -m "Automated advanced stats scrape `date`" || echo "No changes to commit"
# git push || echo "No changes to commit"
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ Remotes:
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.2.3
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(pfr_advanced_defense_season)
export(pfr_advanced_receiving_season)
export(pfr_advanced_rushing_season)
export(pfr_advanced_stat_season)
export(pfr_game_adv_stats)
importFrom(magrittr,"%>%")
90 changes: 90 additions & 0 deletions R/pfr_advanced_stats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#' Scrape PFR Advanced Stats on Season Level
#'
#' Reads the advanced stats page of one season from
#' pro-football-reference.com, pulls the stats table out of the HTML comment
#' it is wrapped in, attaches the PFR player ids found in the
#' `data-append-csv` attribute of the player cells and converts the stat
#' columns to numeric.
#'
#' @param s Season to scrape
#' @param type Stat type. One of `"receiving"`, `"rushing"` or `"defense"`.
#'
#' @return A tibble whose first columns are `season`, `player` and `pfr_id`,
#'   followed by the remaining columns of the scraped table (without `rk`)
#'   and the date the data was `loaded`.
#' @export
pfr_advanced_stat_season <- function(s, type = c("receiving", "rushing", "defense")) {

  type <- rlang::arg_match(type)

  cli::cli_progress_step("Load advanced {.val {type}} {.val {s}}")

  # Load html and extract relevant part from comments -----------------------

  raw_url <- glue::glue("https://www.pro-football-reference.com/years/{s}/{type}_advanced.htm")
  raw_html <- rvest::read_html(raw_url)
  tbl_comment <- xml2::xml_find_all(
    raw_html,
    xpath = glue::glue("//div[@id='all_advanced_{type}']/comment()")
  )

  # Fail with a clear message instead of letting read_html() choke on
  # empty input when the page layout changed or the season has no table.
  if (length(tbl_comment) == 0) {
    cli::cli_abort(c(
      "No advanced {.val {type}} table found for season {.val {s}}.",
      "i" = "Requested {.url {raw_url}}"
    ))
  }

  tbl_html <- tbl_comment |>
    rvest::html_text() |>
    xml2::read_html()


  # Extract Player Information ----------------------------------------------

  player_elements <- tbl_html |>
    xml2::xml_find_all("//td[@data-append-csv]")

  # html_table() trims cell contents by default, so trim here as well to keep
  # the names comparable.
  players <- tibble::tibble(
    pfr_id = xml2::xml_attr(player_elements, "data-append-csv"),
    player = xml2::xml_text(player_elements, trim = TRUE)
  )


  # Extract actual Data -----------------------------------------------------

  stats <- rvest::html_table(tbl_html) |>
    purrr::pluck(1) |>
    .check_pfr_stats_names() |>
    janitor::clean_names() |>
    dplyr::filter(rk != "Rk")

  if (identical(as.character(stats$player), players$player)) {
    # Player cells line up row by row with the table, so ids can be attached
    # by position. This keeps ids correct for players sharing a name and for
    # players listed more than once.
    stats <- dplyr::mutate(stats, pfr_id = players$pfr_id)
  } else {
    # Fall back to a name based lookup. De-duplicate the lookup first so a
    # player listed on several rows does not multiply the joined rows.
    # NOTE(review): different players sharing a name still get all matching
    # ids in this branch.
    stats <- dplyr::left_join(stats, dplyr::distinct(players), by = "player")
  }

  out <- stats |>
    dplyr::mutate(
      tm = nflreadr::clean_team_abbrs(tm),
      season = s,
      loaded = lubridate::today()
    ) |>
    dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |>
    dplyr::mutate(
      # empty strings mean "no value"
      dplyr::across(
        .cols = tidyselect::where(is.character),
        .fns = ~ dplyr::na_if(., "")
      ),
      # "12.3%" -> 0.123
      dplyr::across(
        .cols = tidyselect::contains("percent"),
        .fns = function(x) as.numeric(sub("%","",x)) / 100
      ),
      # everything that is not an identifier column is a number
      dplyr::across(
        .cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")),
        .fns = as.numeric
      ),
      # drop the "+" and "*" markers appended to player names
      player = stringr::str_remove_all(player, "\\+|\\*"),
      player = nflreadr::clean_player_names(player),
      pos = toupper(pos)
    )

  cli::cli_progress_done()

  out
}

# Make sure a scraped PFR table carries its real header as column names.
# When no column is called "Rk", the first row is turned into the column
# names via janitor::row_to_names(). The names are cleaned in both cases.
.check_pfr_stats_names <- function(df) {
  header_present <- "Rk" %in% names(df)
  if (!header_present) {
    df <- janitor::row_to_names(df, 1)
  }
  janitor::clean_names(df)
}

#' @export
#' @rdname pfr_advanced_stat_season
pfr_advanced_receiving_season <- function(s) {
  # Shortcut for type = "receiving"
  pfr_advanced_stat_season(s = s, type = "receiving")
}

#' @export
#' @rdname pfr_advanced_stat_season
pfr_advanced_rushing_season <- function(s) {
  # Shortcut for type = "rushing"
  pfr_advanced_stat_season(s = s, type = "rushing")
}

#' @export
#' @rdname pfr_advanced_stat_season
pfr_advanced_defense_season <- function(s) {
  # Shortcut for type = "defense"
  pfr_advanced_stat_season(s = s, type = "defense")
}
80 changes: 29 additions & 51 deletions auto/defense.R
Original file line number Diff line number Diff line change
@@ -1,59 +1,37 @@
library(rvest)

get_def_season <- function(s) {

cli::cli_process_start("Load DEF {.val {s}}")

raw_url <- glue::glue("https://widgets.sports-reference.com/wg.fcgi?css=1&site",
"=pfr&url=%2Fyears%2F{s}%2Fdefense_advanced.htm&div=div_advanced_defense")

raw_html <- read_html(raw_url)
tbl_html <- html_element(raw_html, xpath = '//*[@id="advanced_defense"]')

# The "data-append-csv" attribute of the td tags holds the pfr player ids
ids <- tbl_html |>
html_elements("td") |>
html_attr("data-append-csv") |>
na.omit()

df <- html_table(tbl_html)
names(df) <- as.character(df[1, ])

suppressWarnings({
out <- df |>
janitor::clean_names() |>
dplyr::filter(rk != "Rk") |>
dplyr::mutate(
pfr_id = ids,
tm = nflreadr::clean_team_abbrs(tm),
season = s,
loaded = lubridate::today()
) |>
dplyr::na_if("") |>
dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |>
dplyr::mutate(
dplyr::across(
.cols = tidyselect::contains("percent"),
.fns = function(x) as.numeric(sub("%","",x)) / 100
),
dplyr::across(
.cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")),
.fns = as.numeric
),
player = stringr::str_remove_all(player, "\\+|\\*"),
pos = toupper(pos)
)
})
if(Sys.getenv("NFLVERSE_REBUILD", "false") == "true"){
seasons_to_update <- 2018:nflreadr::most_recent_season()
} else {
seasons_to_update <- nflreadr::most_recent_season()
}

cli::cli_process_done()
purrr::walk(
seasons_to_update,
purrr::possibly(function(season){
nflversedata::nflverse_save(
data_frame = pfr_advanced_defense_season(season),
file_name = glue::glue("advstats_season_def_{season}"),
nflverse_type = "advanced defense season stats via PFR",
release_tag = "pfr_advstats",
file_types = "rds"
)
}, quiet = FALSE
)
)

out
}
## NOW COMBINE ALL SEASONS FOR THE FILE nflreadr IS LOADING

df_advstats <- purrr::map_df(2018:nflreadr:::most_recent_season(), get_def_season)
combined_advstats <- purrr::map(
2018:nflreadr::most_recent_season(),
purrr::possibly(function(season){
load_from <- glue::glue("https://github.com/nflverse/nflverse-data/releases/download/pfr_advstats/advstats_season_def_{season}.rds")
nflreadr::rds_from_url(load_from)
}, tibble::tibble(), quiet = FALSE),
.progress = TRUE
) |>
purrr::list_rbind()

nflversedata::nflverse_save(
data_frame = df_advstats,
data_frame = combined_advstats,
file_name = "advstats_season_def",
nflverse_type = "advanced defense season stats via PFR",
release_tag = "pfr_advstats"
Expand Down
35 changes: 32 additions & 3 deletions auto/passing.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,40 @@ get_passing <- function(s) {
out
}

# data seem spotty before 2019
df_advstats <- purrr::map_df(2018:nflreadr:::most_recent_season(), purrr::possibly(get_passing,tibble::tibble()))
if(Sys.getenv("NFLVERSE_REBUILD", "false") == "true"){
seasons_to_update <- 2018:nflreadr::most_recent_season()
} else {
seasons_to_update <- nflreadr::most_recent_season()
}

purrr::walk(
seasons_to_update,
purrr::possibly(function(season){
nflversedata::nflverse_save(
data_frame = get_passing(season),
file_name = glue::glue("advstats_season_pass_{season}"),
nflverse_type = "advanced passing season stats via PFR",
release_tag = "pfr_advstats",
file_types = "rds"
)
}, quiet = FALSE
)
)

## NOW COMBINE ALL SEASONS FOR THE FILE nflreadr IS LOADING

combined_advstats <- purrr::map(
2018:nflreadr::most_recent_season(),
purrr::possibly(function(season){
load_from <- glue::glue("https://github.com/nflverse/nflverse-data/releases/download/pfr_advstats/advstats_season_pass_{season}.rds")
nflreadr::rds_from_url(load_from)
}, tibble::tibble(), quiet = FALSE),
.progress = TRUE
) |>
purrr::list_rbind()

nflversedata::nflverse_save(
data_frame = df_advstats,
data_frame = combined_advstats,
file_name = "advstats_season_pass",
nflverse_type = "advanced passing season stats via PFR",
release_tag = "pfr_advstats"
Expand Down
81 changes: 29 additions & 52 deletions auto/receiving.R
Original file line number Diff line number Diff line change
@@ -1,61 +1,38 @@
library(rvest)

get_rec_season <- function(s) {

cli::cli_process_start("Load REC {.val {s}}")

raw_url <- glue::glue("https://widgets.sports-reference.com/wg.fcgi?css=1&site",
"=pfr&url=%2Fyears%2F{s}%2Freceiving_advanced.htm&div=div_advanced_receiving")

raw_html <- read_html(raw_url)
tbl_html <- html_element(raw_html, xpath = '//*[@id="advanced_receiving"]')

# The "data-append-csv" attribute of the td tags holds the pfr player ids
ids <- tbl_html |>
html_elements("td") |>
html_attr("data-append-csv") |>
na.omit()

df <- html_table(tbl_html)
if(Sys.getenv("NFLVERSE_REBUILD", "false") == "true"){
seasons_to_update <- 2018:nflreadr::most_recent_season()
} else {
seasons_to_update <- nflreadr::most_recent_season()
}

suppressWarnings({
out <- df |>
janitor::clean_names() |>
dplyr::filter(rk != "Rk") |>
dplyr::mutate(
pfr_id = ids,
tm = nflreadr::clean_team_abbrs(tm),
season = s,
loaded = lubridate::today()
) |>
dplyr::na_if("") |>
dplyr::select(season, player, pfr_id, dplyr::everything(), -rk) |>
dplyr::mutate(
dplyr::across(
.cols = tidyselect::contains("percent"),
.fns = function(x) as.numeric(sub("%","",x)) / 100
),
dplyr::across(
.cols = !tidyselect::any_of(c("player", "pfr_id", "tm", "pos", "loaded")),
.fns = as.numeric
),
player = stringr::str_remove_all(player, "\\+|\\*"),
pos = toupper(pos)
)
})
purrr::walk(
seasons_to_update,
purrr::possibly(function(season){
nflversedata::nflverse_save(
data_frame = pfr_advanced_receiving_season(season),
file_name = glue::glue("advstats_season_rec_{season}"),
nflverse_type = "advanced receiving season stats via PFR",
release_tag = "pfr_advstats",
file_types = "rds"
)
}, quiet = FALSE
)
)

cli::cli_process_done()
## NOW COMBINE ALL SEASONS FOR THE FILE nflreadr IS LOADING

out
}

# data seem spotty before 2019
df_advstats <- purrr::map_df(2018:nflreadr:::most_recent_season(), get_rec_season)
combined_advstats <- purrr::map(
2018:nflreadr::most_recent_season(),
purrr::possibly(function(season){
load_from <- glue::glue("https://github.com/nflverse/nflverse-data/releases/download/pfr_advstats/advstats_season_rec_{season}.rds")
nflreadr::rds_from_url(load_from)
}, tibble::tibble(), quiet = FALSE),
.progress = TRUE
) |>
purrr::list_rbind()

nflversedata::nflverse_save(
data_frame = df_advstats,
data_frame = combined_advstats,
file_name = "advstats_season_rec",
nflverse_type = "advanced receiving season stats via PFR",
release_tag = "pfr_advstats"
)

Loading

0 comments on commit b567329

Please sign in to comment.