From 748e48aec2aa87a2ad95fe4e0ec3216fcfaae1ab Mon Sep 17 00:00:00 2001 From: Benjamin Buzbee Date: Mon, 8 Apr 2024 11:27:15 -0400 Subject: [PATCH] Fix date_parser with prefer_month_of_year wrong results (#1224) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix date_parser with prefer_month_of_year wrong results Fix two problems 1. Parser would use the current month even if prefer_month_of_year was not "current", whenever relative_base was not None 2. Parser would use the current month to derive 'what is the last day of this month' - for example, with prefer_month=last and prefer_day=last, but current_month=april, it would return December 30th, because it would use April to find that the last day was the 30th, when it should use the preferred month (December, which has 31 days). Additionally, add a test to test_date_parser that uses prefer_month * Run pre-commit * Update test_dates_parse_utc_offset_does_not_throw to expect January It is parsing "0:4", french, with settings ``` "PREFER_DATES_FROM": "past", "PREFER_DAY_OF_MONTH": "first", "PREFER_LOCALE_DATE_ORDER": True, "PREFER_MONTH_OF_YEAR": "current", "RELATIVE_BASE": datetime( year=1970, month=1, day=1, hour=0, minute=0, second=0 ), ``` It used to expect to get `expected_date=datetime(1969, 12, 31, 14, 4)` but after my change it gets `datetime(1969, 1, 31, 14, 4)` I would argue that with PREFER_MONTH_OF_YEAR set to "Current", and "Current" being January 1st 1970, that `datetime(1969, 1, 31, 14, 4)` is a better result. However, with this particular set of configuration values, I am not exactly 100% sure what to expect. These settings were generated by a fuzzer so perhaps they don't really make a ton of sense together anyway; rather than change the settings (and thus deviate from what the fuzzer caught) I have opted to update the test expectation to accept January. 
* Update German test_search_and_parse to accept January for parsing of 'Die' It is searching a German string for dates and asserting that when it finds the word "Die" in the string, it should be parsed as `datetime.datetime(1999, 12, 28, 0, 0)` Similarly, my change makes this `datetime.datetime(1999, 1, 28, 0, 0)` instead. I don't speak German, but as far as I can tell "Die" just means "The" so I have no idea why it is even matching it. In my opinion, this could be a bug with the search identifying a non-date word, and so I can't really guess as to what a sensible result would be. For the sake of simplicity, I also just updated this test to accept January, --------- Co-authored-by: Adrián Chaves --- dateparser/parser.py | 13 ++++++--- tests/test_clean_api.py | 2 +- tests/test_date_parser.py | 60 +++++++++++++++++++++++++++++++++++++++ tests/test_search.py | 12 ++++++-- 4 files changed, 79 insertions(+), 8 deletions(-) diff --git a/dateparser/parser.py b/dateparser/parser.py index 16751188b..40aa8ff7b 100644 --- a/dateparser/parser.py +++ b/dateparser/parser.py @@ -598,10 +598,13 @@ def _correct_for_month(self, dateobj): relative_base_month = ( relative_base.month if hasattr(relative_base, "month") else relative_base ) - if getattr(self, "_token_month", None) or relative_base_month: + + if getattr(self, "_token_month", None): return dateobj - dateobj = set_correct_month_from_settings(dateobj, self.settings) + dateobj = set_correct_month_from_settings( + dateobj, self.settings, relative_base_month + ) return dateobj @classmethod @@ -613,11 +616,13 @@ def parse(cls, datestring, settings, tz=None): # correction for past, future if applicable dateobj = po._correct_for_time_frame(dateobj, tz) + # correction for preference of month: beginning, current, end + # must happen before day so that day is derived from the correct month + dateobj = po._correct_for_month(dateobj) + # correction for preference of day: beginning, current, end dateobj = po._correct_for_day(dateobj) - 
# correction for preference of month: beginning, current, end - dateobj = po._correct_for_month(dateobj) period = po._get_period() return dateobj, period diff --git a/tests/test_clean_api.py b/tests/test_clean_api.py index c487ec07a..62e3b218a 100644 --- a/tests/test_clean_api.py +++ b/tests/test_clean_api.py @@ -119,7 +119,7 @@ def test_dates_which_match_locales_are_parsed( languages=["en"], region="", date_formats=["%a", "%a", "%a", "%a"], - expected_date=datetime(1969, 12, 31, 14, 4), + expected_date=datetime(1969, 1, 31, 14, 4), ) ] ) diff --git a/tests/test_date_parser.py b/tests/test_date_parser.py index 97ee361cd..2d2795045 100644 --- a/tests/test_date_parser.py +++ b/tests/test_date_parser.py @@ -1265,6 +1265,66 @@ def test_prefer_dates_from_with_timezone( self.then_date_was_parsed_by_date_parser() self.then_date_obj_exactly_is(expected) + @parameterized.expand( + [ + param( + "2015", + prefer_day="current", + prefer_month="current", + today=datetime(2010, 2, 10), + expected=datetime(2015, 2, 10), + ), + param( + "2015", + prefer_day="last", + prefer_month="current", + today=datetime(2010, 2, 10), + expected=datetime(2015, 2, 28), + ), + param( + "2015", + prefer_day="first", + prefer_month="current", + today=datetime(2010, 2, 10), + expected=datetime(2015, 2, 1), + ), + param( + "2015", + prefer_day="current", + prefer_month="last", + today=datetime(2010, 2, 10), + expected=datetime(2015, 12, 10), + ), + param( + "2015", + prefer_day="last", + prefer_month="last", + today=datetime(2010, 2, 10), + expected=datetime(2015, 12, 31), + ), + param( + "2020", # Leap year last day test + prefer_day="last", + prefer_month="current", + today=datetime(2010, 2, 10), + expected=datetime(2020, 2, 29), + ), + ] + ) + def test_dates_with_no_day_or_month( + self, date_string, prefer_day, prefer_month, today=None, expected=None + ): + self.given_parser( + settings={ + "PREFER_DAY_OF_MONTH": prefer_day, + "PREFER_MONTH_OF_YEAR": prefer_month, + "RELATIVE_BASE": today, + } + 
) + self.when_date_is_parsed(date_string) + self.then_date_was_parsed_by_date_parser() + self.then_date_obj_exactly_is(expected) + def given_local_tz_offset(self, offset): self.add_patch( patch.object( diff --git a/tests/test_search.py b/tests/test_search.py index 9d0b72e63..ba5a1ea3c 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -410,7 +410,7 @@ def test_search_date_string(self, shortname, datetime_string): "Die UdSSR blieb gemäß dem Neutralitätspakt " "vom 13. April 1941 gegenüber Japan vorerst neutral.", [ - ("Die", datetime.datetime(1999, 12, 28, 0, 0)), + ("Die", datetime.datetime(1999, 1, 28, 0, 0)), ("13. April 1941", datetime.datetime(1941, 4, 13, 0, 0)), ], settings={"RELATIVE_BASE": datetime.datetime(2000, 1, 1)}, @@ -825,7 +825,10 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non "бомбардировки срещу Япония, използувайки новозавладените острови като бази.", ), # Chinese - param("zh", "不過大多數人仍多把第二次世界大戰的爆發定為1939年9月1日德國入侵波蘭開始,2015年04月08日10点05。"), + param( + "zh", + "不過大多數人仍多把第二次世界大戰的爆發定為1939年9月1日德國入侵波蘭開始,2015年04月08日10点05。", + ), # Czech param( "cs", @@ -897,7 +900,10 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non "d'Etiopia. Il 9 maggio 1936 venne proclamato l'Impero. ", ), # Japanese - param("ja", "1933年(昭和8年)12月23日午前6時39分、宮城(現:皇居)内の産殿にて誕生。"), + param( + "ja", + "1933年(昭和8年)12月23日午前6時39分、宮城(現:皇居)内の産殿にて誕生。", + ), # Persian param("fa", "نگ جهانی دوم جنگ جدی بین سپتامبر 1939 و 2 سپتامبر 1945 بود."), # Polish