From b0a6a4c13bc3a0ce98465292d54f1ca33058199d Mon Sep 17 00:00:00 2001 From: bcapuano Date: Tue, 21 Nov 2023 21:33:47 -0500 Subject: [PATCH 1/9] Fixed RecursionError discovered by OSSFuzz by converting _split_by_known_words into an iterative function --- dateparser/languages/dictionary.py | 46 ++++++++++++++++++------------ 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/dateparser/languages/dictionary.py b/dateparser/languages/dictionary.py index 4a4c1d427..eed074ce8 100644 --- a/dateparser/languages/dictionary.py +++ b/dateparser/languages/dictionary.py @@ -181,26 +181,34 @@ def _add_to_cache(self, value, cache): ): cache.pop(list(cache.keys())[0]) - def _split_by_known_words(self, string, keep_formatting): - if not string: - return string - + def _split_by_known_words(self, string: str, keep_formatting: bool): regex = self._get_split_regex_cache() - match = regex.match(string) - if not match: - return ( - self._split_by_numerals(string, keep_formatting) - if self._should_capture(string, keep_formatting) - else [] - ) - - unparsed, known, unknown = match.groups() - splitted = [known] if self._should_capture(known, keep_formatting) else [] - if unparsed and self._should_capture(unparsed, keep_formatting): - splitted = self._split_by_numerals(unparsed, keep_formatting) + splitted - if unknown: - splitted.extend(self._split_by_known_words(unknown, keep_formatting)) - + splitted = [] + unknown = string + + while unknown: + match = regex.match(string) + + if not match: + curr_split = ( + self._split_by_numerals(string, keep_formatting) + if self._should_capture(string, keep_formatting) + else [] + ) + unknown = "" + else: + unparsed, known, unknown = match.groups() + curr_split = ( + [known] if self._should_capture(known, keep_formatting) else [] + ) + if unparsed and self._should_capture(unparsed, keep_formatting): + curr_split = ( + self._split_by_numerals(unparsed, keep_formatting) + curr_split + ) + if unknown: + string = unknown if string != unknown else "" + + splitted.extend(curr_split) return splitted def _split_by_numerals(self, string, keep_formatting): From 9f6ac5b2cbf5a13031e1d37d5896062b7580dd2c Mon Sep 17 00:00:00 2001 From: bcapuano Date: Sat, 16 Dec 2023 16:34:17 -0500 Subject: [PATCH 2/9] Caught exception potentially raised by pytz utcoffset in _correct_for_time_frame --- dateparser/parser.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/dateparser/parser.py b/dateparser/parser.py index 297e73f3b..d62712c45 100644 --- a/dateparser/parser.py +++ b/dateparser/parser.py @@ -144,10 +144,10 @@ def __init__(self, *args, **kwargs): "%m%d%y": ( self._preferred_formats + sorted( - self._all, - key=lambda x: x.lower().startswith("%m%d%y"), - reverse=True, - ) + self._all, + key=lambda x: x.lower().startswith("%m%d%y"), + reverse=True, + ) ), "%m%y%d": sorted( self._all, key=lambda x: x.lower().startswith("%m%y%d"), reverse=True @@ -566,10 +566,16 @@ def _correct_for_time_frame(self, dateobj, tz): except pytz.UnknownTimeZoneError: tz = None + dateobj_time = None if tz: - dateobj_time = (dateobj - tz.utcoffset(dateobj)).time() - else: + try: + dateobj_time = (dateobj - tz.utcoffset(dateobj)).time() + except pytz.InvalidTimeError: + pass + + if not dateobj_time: dateobj_time = dateobj.time() + if "past" in self.settings.PREFER_DATES_FROM: if self.now.time() < dateobj_time: dateobj = dateobj + timedelta(days=-1) From 016ed043b9a19d2baee3c3059485b6808d08ed33 Mon Sep 17 00:00:00 2001 From: bcapuano Date: Sat, 16 Dec 2023 16:44:44 -0500 Subject: [PATCH 3/9] Added CIFuzz --- .github/workflows/cifuzz.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/cifuzz.yml diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml new file mode 100644 index 000000000..979b81116 --- /dev/null +++ b/.github/workflows/cifuzz.yml @@ -0,0 +1,35 @@ +name: CIFuzz +on: [pull_request] +permissions: {} +jobs: + Fuzzing: + runs-on: ubuntu-latest + permissions: + security-events: write + steps: + - name: Build Fuzzers + id: build + uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master + with: + oss-fuzz-project-name: 'dateparser' + language: python + - name: Run Fuzzers + uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master + with: + oss-fuzz-project-name: 'dateparser' + language: python + fuzz-seconds: 600 + output-sarif: true + - name: Upload Crash + uses: actions/upload-artifact@v3 + if: failure() && steps.build.outcome == 'success' + with: + name: artifacts + path: ./out/artifacts + - name: Upload Sarif + if: always() && steps.build.outcome == 'success' + uses: github/codeql-action/upload-sarif@v2 + with: + # Path to SARIF file relative to the root of the repository + sarif_file: cifuzz-sarif/results.sarif + checkout_path: cifuzz-sarif From c4f66378848846864c61c2e6317385ab9e5b7d5a Mon Sep 17 00:00:00 2001 From: bcapuano Date: Sun, 17 Dec 2023 14:22:02 -0500 Subject: [PATCH 4/9] Ran pre-commit --- dateparser/parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dateparser/parser.py b/dateparser/parser.py index d62712c45..af48dcf53 100644 --- a/dateparser/parser.py +++ b/dateparser/parser.py @@ -144,10 +144,10 @@ def __init__(self, *args, **kwargs): "%m%d%y": ( self._preferred_formats + sorted( - self._all, - key=lambda x: x.lower().startswith("%m%d%y"), - reverse=True, - ) + self._all, + key=lambda x: x.lower().startswith("%m%d%y"), + reverse=True, + ) ), "%m%y%d": sorted( self._all, key=lambda x: x.lower().startswith("%m%y%d"), reverse=True From 1cf5e35a6aceb301ca87b7c800598182d0826405 Mon Sep 17 00:00:00 2001 From: bcapuano Date: Tue, 19 Dec 2023 21:34:12 -0500 Subject: [PATCH 5/9] Added a regression test for OSSFuzz bug --- tests/test_clean_api.py | 58 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/test_clean_api.py b/tests/test_clean_api.py index fc0f4f15c..425386b48 100644 --- a/tests/test_clean_api.py +++ b/tests/test_clean_api.py @@ -111,6 +111,55 @@ def test_dates_which_match_locales_are_parsed( self.when_date_is_parsed(date_string, locales=locales) self.then_parsed_date_is(expected_date) + @parameterized.expand( + [ + param( + date_string='0:4', + locales=["fr-PF"], + languages=["en"], + region='', + date_formats=['%a', '%a', '%a', '%a'], + expected_date=datetime(1969, 12, 31, 14, 4) + ) + ] + ) + def test_dates_parse_utc_offset_does_not_throw( + self, date_string, locales, languages, region, date_formats, expected_date + ): + """ + Bug discovered by OSSFuzz that caused an exception in pytz to halt parsing + Regression test to ensure that this is not reintroduced + """ + self.when_date_is_parsed_with_args_and_settings( + date_string, + languages=languages, + locales=locales, + region=region, + date_formats=date_formats, + settings={ + 'CACHE_SIZE_LIMIT': 1000, + 'DATE_ORDER': 'YDM', + 'DEFAULT_LANGUAGES': ['mzn', 'as', 'af', 'fur', 'sr-Cyrl', 'kw', 'ne', 'en', 'vi', 'teo', 'sr', 'cgg'], + 'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': 0.18823535008398845, + 'NORMALIZE': True, + 'PARSERS': ['custom-formats', 'absolute-time'], + 'PREFER_DATES_FROM': 'past', + 'PREFER_DAY_OF_MONTH': 'first', + 'PREFER_LOCALE_DATE_ORDER': True, + 'PREFER_MONTH_OF_YEAR': 'current', + 'RELATIVE_BASE': datetime(year=1970, month=1, day=1, hour=0, minute=0, second=0), + 'REQUIRE_PARTS': [], + 'RETURN_AS_TIMEZONE_AWARE': False, + 'RETURN_TIME_AS_PERIOD': False, + 'SKIP_TOKENS': [], + 'STRICT_PARSING': False, + 'TIMEZONE': 'America/Hermosillo', + 'TO_TIMEZONE': 'Asia/Almaty' + } + ) + self.then_parsed_date_and_time_is(expected_date) + print(self.result) + @parameterized.expand( [ param(date_string="January 24, 2014", locales=["pt-AO"]), @@ -133,6 +182,15 @@ def when_date_is_parsed(self, date_string, languages=None, locales=None): def when_date_is_parsed_with_settings(self, date_string, settings=None): self.result = dateparser.parse(date_string, settings=settings) + def when_date_is_parsed_with_args_and_settings(self, date_string, languages=None, locales=None, region=None, date_formats=None, settings=None): + self.result = dateparser.parse( + date_string, + languages=languages, + locales=locales, + region=region, + date_formats=date_formats, + settings=settings) + def then_parsed_date_is(self, expected_date): self.assertEqual( self.result, datetime.combine(expected_date, datetime.min.time()) From f54dc88d1c467d3b4545c4b7cf7425e9dee1ec4d Mon Sep 17 00:00:00 2001 From: bcapuano Date: Tue, 19 Dec 2023 21:35:13 -0500 Subject: [PATCH 6/9] Removed print --- tests/test_clean_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_clean_api.py b/tests/test_clean_api.py index 425386b48..4e779ab13 100644 --- a/tests/test_clean_api.py +++ b/tests/test_clean_api.py @@ -158,7 +158,6 @@ def test_dates_parse_utc_offset_does_not_throw( } ) self.then_parsed_date_and_time_is(expected_date) - print(self.result) @parameterized.expand( [ From ab129af6063cb57879629c4f803d8e1097e8227b Mon Sep 17 00:00:00 2001 From: bcapuano Date: Tue, 19 Dec 2023 21:37:11 -0500 Subject: [PATCH 7/9] Ran precommit --- tests/test_clean_api.py | 74 +++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/tests/test_clean_api.py b/tests/test_clean_api.py index 4e779ab13..c487ec07a 100644 --- a/tests/test_clean_api.py +++ b/tests/test_clean_api.py @@ -114,12 +114,12 @@ def test_dates_which_match_locales_are_parsed( @parameterized.expand( [ param( - date_string='0:4', + date_string="0:4", locales=["fr-PF"], languages=["en"], - region='', - date_formats=['%a', '%a', '%a', '%a'], - expected_date=datetime(1969, 12, 31, 14, 4) + region="", + date_formats=["%a", "%a", "%a", "%a"], + expected_date=datetime(1969, 12, 31, 14, 4), ) ] ) @@ -137,25 +137,40 @@ def test_dates_parse_utc_offset_does_not_throw( region=region, date_formats=date_formats, settings={ - 'CACHE_SIZE_LIMIT': 1000, - 'DATE_ORDER': 'YDM', - 'DEFAULT_LANGUAGES': ['mzn', 'as', 'af', 'fur', 'sr-Cyrl', 'kw', 'ne', 'en', 'vi', 'teo', 'sr', 'cgg'], - 'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': 0.18823535008398845, - 'NORMALIZE': True, - 'PARSERS': ['custom-formats', 'absolute-time'], - 'PREFER_DATES_FROM': 'past', - 'PREFER_DAY_OF_MONTH': 'first', - 'PREFER_LOCALE_DATE_ORDER': True, - 'PREFER_MONTH_OF_YEAR': 'current', - 'RELATIVE_BASE': datetime(year=1970, month=1, day=1, hour=0, minute=0, second=0), - 'REQUIRE_PARTS': [], - 'RETURN_AS_TIMEZONE_AWARE': False, - 'RETURN_TIME_AS_PERIOD': False, - 'SKIP_TOKENS': [], - 'STRICT_PARSING': False, - 'TIMEZONE': 'America/Hermosillo', - 'TO_TIMEZONE': 'Asia/Almaty' - } + "CACHE_SIZE_LIMIT": 1000, + "DATE_ORDER": "YDM", + "DEFAULT_LANGUAGES": [ + "mzn", + "as", + "af", + "fur", + "sr-Cyrl", + "kw", + "ne", + "en", + "vi", + "teo", + "sr", + "cgg", + ], + "LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD": 0.18823535008398845, + "NORMALIZE": True, + "PARSERS": ["custom-formats", "absolute-time"], + "PREFER_DATES_FROM": "past", + "PREFER_DAY_OF_MONTH": "first", + "PREFER_LOCALE_DATE_ORDER": True, + "PREFER_MONTH_OF_YEAR": "current", + "RELATIVE_BASE": datetime( + year=1970, month=1, day=1, hour=0, minute=0, second=0 + ), + "REQUIRE_PARTS": [], + "RETURN_AS_TIMEZONE_AWARE": False, + "RETURN_TIME_AS_PERIOD": False, + "SKIP_TOKENS": [], + "STRICT_PARSING": False, + "TIMEZONE": "America/Hermosillo", + "TO_TIMEZONE": "Asia/Almaty", + }, ) self.then_parsed_date_and_time_is(expected_date) @@ -181,14 +196,23 @@ def when_date_is_parsed(self, date_string, languages=None, locales=None): def when_date_is_parsed_with_settings(self, date_string, settings=None): self.result = dateparser.parse(date_string, settings=settings) - def when_date_is_parsed_with_args_and_settings(self, date_string, languages=None, locales=None, region=None, date_formats=None, settings=None): + def when_date_is_parsed_with_args_and_settings( + self, + date_string, + languages=None, + locales=None, + region=None, + date_formats=None, + settings=None, + ): self.result = dateparser.parse( date_string, languages=languages, locales=locales, region=region, date_formats=date_formats, - settings=settings) + settings=settings, + ) def then_parsed_date_is(self, expected_date): self.assertEqual( From 325ae9d7ebb79e65354d760308bfec4485c285fb Mon Sep 17 00:00:00 2001 From: bcapuano Date: Tue, 19 Dec 2023 21:45:56 -0500 Subject: [PATCH 8/9] Caught NonExistentTimeError --- dateparser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dateparser/parser.py b/dateparser/parser.py index 665175f7a..89c30b838 100644 --- a/dateparser/parser.py +++ b/dateparser/parser.py @@ -564,7 +564,7 @@ def _correct_for_time_frame(self, dateobj, tz): try: tz = tz or get_timezone_from_tz_string(self.settings.TIMEZONE) tz_offset = tz.utcoffset(dateobj) - except pytz.UnknownTimeZoneError: + except (pytz.UnknownTimeZoneError, pytz.NonExistentTimeError): tz_offset = timedelta(hours=0) dateobj_time = None From 6223f83359cbca8f0022f1f4b576d92117d556b8 Mon Sep 17 00:00:00 2001 From: McKenna Date: Wed, 20 Dec 2023 20:29:34 -0500 Subject: [PATCH 9/9] Fixed bug with current state of master --- dateparser/parser.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/dateparser/parser.py b/dateparser/parser.py index 89c30b838..16751188b 100644 --- a/dateparser/parser.py +++ b/dateparser/parser.py @@ -567,16 +567,6 @@ def _correct_for_time_frame(self, dateobj, tz): except (pytz.UnknownTimeZoneError, pytz.NonExistentTimeError): tz_offset = timedelta(hours=0) - dateobj_time = None - if tz: - try: - dateobj_time = (dateobj - tz.utcoffset(dateobj)).time() - except pytz.InvalidTimeError: - pass - - if not dateobj_time: - dateobj_time = dateobj.time() - if "past" in self.settings.PREFER_DATES_FROM: if self.now < dateobj - tz_offset: dateobj = dateobj + timedelta(days=-1)