diff --git a/nfl_data_py/__init__.py b/nfl_data_py/__init__.py index 3c874e1..e66d810 100644 --- a/nfl_data_py/__init__.py +++ b/nfl_data_py/__init__.py @@ -10,6 +10,7 @@ import numpy import pandas import appdirs +from urllib.error import HTTPError # module level doc string __doc__ = """ @@ -143,13 +144,20 @@ def import_pbp_data( raw = pandas.DataFrame(data) raw['season'] = year - if all([include_participation, year >= 2016, not cache]): + + if include_participation and not cache: path = r'https://github.com/nflverse/nflverse-data/releases/download/pbp_participation/pbp_participation_{}.parquet'.format(year) - partic = pandas.read_parquet(path) - raw = raw.merge(partic, - how='left', - left_on=['play_id','game_id'], - right_on=['play_id','nflverse_game_id']) + + try: + partic = pandas.read_parquet(path) + raw = raw.merge( + partic, + how='left', + left_on=['play_id','game_id'], + right_on=['play_id','nflverse_game_id'] + ) + except HTTPError: + pass pbp_data.append(raw) print(str(year) + ' done.') @@ -158,8 +166,10 @@ def import_pbp_data( print(e) print('Data not available for ' + str(year)) - if pbp_data: - plays = pandas.concat(pbp_data).reset_index(drop=True) + if not pbp_data: + return pandas.DataFrame() + + plays = pandas.concat(pbp_data, ignore_index=True) # converts float64 to float32, saves ~30% memory if downcast: diff --git a/nfl_data_py/tests/nfl_test.py b/nfl_data_py/tests/nfl_test.py index b4a0983..779a4c2 100644 --- a/nfl_data_py/tests/nfl_test.py +++ b/nfl_data_py/tests/nfl_test.py @@ -9,17 +9,17 @@ class test_pbp(TestCase): + pbp = nfl.import_pbp_data([2020]) + def test_is_df_with_data(self): - s = nfl.import_pbp_data([2020]) - self.assertIsInstance(s, pd.DataFrame) - self.assertTrue(len(s) > 0) + self.assertIsInstance(self.pbp, pd.DataFrame) + self.assertTrue(len(self.pbp) > 0) def test_is_df_with_data_thread_requests(self): s = nfl.import_pbp_data([2020, 2021], thread_requests=True) self.assertIsInstance(s, pd.DataFrame) self.assertTrue(len(s) > 0) - def test_uses_cache_when_cache_is_true(self): cache = Path(__file__).parent/f"tmpcache-{random.randint(0, 10000)}" self.assertRaises( @@ -33,6 +33,21 @@ def test_uses_cache_when_cache_is_true(self): self.assertIsInstance(data, pd.DataFrame) shutil.rmtree(cache) + + def test_includes_participation_by_default(self): + self.assertIn("offense_players", self.pbp.columns) + + def test_excludes_participation_when_requested(self): + data = nfl.import_pbp_data([2020], include_participation=False) + self.assertIsInstance(self.pbp, pd.DataFrame) + self.assertTrue(len(self.pbp) > 0) + self.assertNotIn("offense_players", data.columns) + + def test_excludes_participation_if_not_available(self): + data = nfl.import_pbp_data([2024]) + self.assertIsInstance(self.pbp, pd.DataFrame) + self.assertTrue(len(self.pbp) > 0) + self.assertNotIn("offense_players", data.columns) class test_weekly(TestCase):