Skip to content

Commit

Permalink
Exclude pbp participation for any years where the data file isn't available (#107)
Browse files Browse the repository at this point in the history

* Exclude pbp participation data if the file isn't available

* Fixed incorrect error import
  • Loading branch information
alecglen authored Sep 17, 2024
1 parent 4541ab6 commit 478ec27
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 12 deletions.
26 changes: 18 additions & 8 deletions nfl_data_py/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy
import pandas
import appdirs
from urllib.error import HTTPError

# module level doc string
__doc__ = """
Expand Down Expand Up @@ -143,13 +144,20 @@ def import_pbp_data(
raw = pandas.DataFrame(data)
raw['season'] = year

if all([include_participation, year >= 2016, not cache]):

if include_participation and not cache:
path = r'https://github.com/nflverse/nflverse-data/releases/download/pbp_participation/pbp_participation_{}.parquet'.format(year)
partic = pandas.read_parquet(path)
raw = raw.merge(partic,
how='left',
left_on=['play_id','game_id'],
right_on=['play_id','nflverse_game_id'])

try:
partic = pandas.read_parquet(path)
raw = raw.merge(
partic,
how='left',
left_on=['play_id','game_id'],
right_on=['play_id','nflverse_game_id']
)
except HTTPError:
pass

pbp_data.append(raw)
print(str(year) + ' done.')
Expand All @@ -158,8 +166,10 @@ def import_pbp_data(
print(e)
print('Data not available for ' + str(year))

if pbp_data:
plays = pandas.concat(pbp_data).reset_index(drop=True)
if not pbp_data:
return pandas.DataFrame()

plays = pandas.concat(pbp_data, ignore_index=True)

# converts float64 to float32, saves ~30% memory
if downcast:
Expand Down
23 changes: 19 additions & 4 deletions nfl_data_py/tests/nfl_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@


class test_pbp(TestCase):
pbp = nfl.import_pbp_data([2020])

def test_is_df_with_data(self):
s = nfl.import_pbp_data([2020])
self.assertIsInstance(s, pd.DataFrame)
self.assertTrue(len(s) > 0)
self.assertIsInstance(self.pbp, pd.DataFrame)
self.assertTrue(len(self.pbp) > 0)

def test_is_df_with_data_thread_requests(self):
    """Importing two seasons with ``thread_requests=True`` yields a non-empty DataFrame."""
    seasons = [2020, 2021]
    plays = nfl.import_pbp_data(seasons, thread_requests=True)

    self.assertIsInstance(plays, pd.DataFrame)
    self.assertTrue(len(plays) > 0)


def test_uses_cache_when_cache_is_true(self):
cache = Path(__file__).parent/f"tmpcache-{random.randint(0, 10000)}"
self.assertRaises(
Expand All @@ -33,6 +33,21 @@ def test_uses_cache_when_cache_is_true(self):
self.assertIsInstance(data, pd.DataFrame)

shutil.rmtree(cache)

def test_includes_participation_by_default(self):
    """The class-level ``pbp`` frame (loaded with default arguments) has ``offense_players``."""
    columns = self.pbp.columns
    self.assertIn("offense_players", columns)

def test_excludes_participation_when_requested(self):
    """Passing ``include_participation=False`` returns data without ``offense_players``.

    All assertions target the frame fetched in this test. The previous
    version checked the type and length of the shared class-level
    ``self.pbp`` instead, so an empty or malformed result from this call
    would have gone unnoticed.
    """
    data = nfl.import_pbp_data([2020], include_participation=False)

    # Validate the frame under test, not the class-level fixture.
    self.assertIsInstance(data, pd.DataFrame)
    self.assertTrue(len(data) > 0)
    self.assertNotIn("offense_players", data.columns)

def test_excludes_participation_if_not_available(self):
    """A season without a participation file still loads, minus ``offense_players``.

    All assertions target the frame fetched in this test. The previous
    version checked the type and length of the shared class-level
    ``self.pbp`` (the 2020 season) instead, so a failed or empty 2024
    import would not have been detected.

    NOTE(review): this assumes no participation file is published for
    2024; the test will need a different season if that file is later
    released -- confirm against the data source.
    """
    data = nfl.import_pbp_data([2024])

    # Validate the frame under test, not the class-level fixture.
    self.assertIsInstance(data, pd.DataFrame)
    self.assertTrue(len(data) > 0)
    self.assertNotIn("offense_players", data.columns)


class test_weekly(TestCase):
Expand Down

0 comments on commit 478ec27

Please sign in to comment.