Skip to content

Commit

Permalink
Fix date_parser with prefer_month_of_year wrong results (#1224)
Browse files Browse the repository at this point in the history
* Fix date_parser with prefer_month_of_year wrong results

Fix two problems:
1. When relative_base was not None, the parser would use the current month
   even if prefer_month_of_year was not "current".

2. The parser would use the current month to derive "what is the last day
   of this month". For example, with prefer_month=last and
   prefer_day=last, but current_month=April, it would return December
   30th, because it would use April to find that the last day was the
   30th, when it should use the preferred month (December).

Additionally, add a test to test_date_parser that uses prefer_month

* Run pre-commit

* Update test_dates_parse_utc_offset_does_not_throw to expect January

It is parsing "0:4", french, with settings
```
                "PREFER_DATES_FROM": "past",
                "PREFER_DAY_OF_MONTH": "first",
                "PREFER_LOCALE_DATE_ORDER": True,
                "PREFER_MONTH_OF_YEAR": "current",
                "RELATIVE_BASE": datetime(
                    year=1970, month=1, day=1, hour=0, minute=0, second=0
                ),
```
It used to expect `expected_date=datetime(1969, 12, 31, 14, 4)`, but after my change it gets `datetime(1969, 1, 31, 14, 4)`.
I would argue that with PREFER_MONTH_OF_YEAR set to "current", and the current date (RELATIVE_BASE) being January 1st 1970, `datetime(1969, 1, 31, 14, 4)` is a better result.
However, with this particular configuration, I am not 100% sure what to expect. These settings were generated by a fuzzer, so perhaps they don't make much sense together anyway; rather than change the settings (and thus deviate from what the fuzzer caught), I have opted to update the test expectation to accept January.

* Update German test_search_and_parse to accept January for parsing of
'Die'

It is searching a German string for dates and asserting that when it finds the word "Die" in the string, it should be parsed as `datetime.datetime(1999, 12, 28, 0, 0)`
Similarly, my change makes this `datetime.datetime(1999, 1, 28, 0, 0)` instead. I don't speak German, but as far as I can tell "Die" just means "The", so I have no idea why it is even matching it. In my opinion, this could be a bug with the search identifying a non-date word, and so I can't really guess as to what a sensible result would be. For the sake of simplicity, I also just updated this test to accept January.

---------

Co-authored-by: Adrián Chaves <[email protected]>
  • Loading branch information
benbuzbee and Gallaecio authored Apr 8, 2024
1 parent f659364 commit 748e48a
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 8 deletions.
13 changes: 9 additions & 4 deletions dateparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,10 +598,13 @@ def _correct_for_month(self, dateobj):
relative_base_month = (
relative_base.month if hasattr(relative_base, "month") else relative_base
)
if getattr(self, "_token_month", None) or relative_base_month:

if getattr(self, "_token_month", None):
return dateobj

dateobj = set_correct_month_from_settings(dateobj, self.settings)
dateobj = set_correct_month_from_settings(
dateobj, self.settings, relative_base_month
)
return dateobj

@classmethod
Expand All @@ -613,11 +616,13 @@ def parse(cls, datestring, settings, tz=None):
# correction for past, future if applicable
dateobj = po._correct_for_time_frame(dateobj, tz)

# correction for preference of month: beginning, current, end
# must happen before day so that day is derived from the correct month
dateobj = po._correct_for_month(dateobj)

# correction for preference of day: beginning, current, end
dateobj = po._correct_for_day(dateobj)

# correction for preference of month: beginning, current, end
dateobj = po._correct_for_month(dateobj)
period = po._get_period()

return dateobj, period
Expand Down
2 changes: 1 addition & 1 deletion tests/test_clean_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def test_dates_which_match_locales_are_parsed(
languages=["en"],
region="",
date_formats=["%a", "%a", "%a", "%a"],
expected_date=datetime(1969, 12, 31, 14, 4),
expected_date=datetime(1969, 1, 31, 14, 4),
)
]
)
Expand Down
60 changes: 60 additions & 0 deletions tests/test_date_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1265,6 +1265,66 @@ def test_prefer_dates_from_with_timezone(
self.then_date_was_parsed_by_date_parser()
self.then_date_obj_exactly_is(expected)

# Cases for year-only date strings. Each case supplies the input string, the
# PREFER_DAY_OF_MONTH and PREFER_MONTH_OF_YEAR values, the RELATIVE_BASE
# ("today"), and the exact datetime the parser is expected to return.
@parameterized.expand(
[
# Day and month both "current": expected month/day equal those of `today`.
param(
"2015",
prefer_day="current",
prefer_month="current",
today=datetime(2010, 2, 10),
expected=datetime(2015, 2, 10),
),
# "last" day with "current" month: expected day is the last day of February 2015.
param(
"2015",
prefer_day="last",
prefer_month="current",
today=datetime(2010, 2, 10),
expected=datetime(2015, 2, 28),
),
# "first" day with "current" month: expected day is 1.
param(
"2015",
prefer_day="first",
prefer_month="current",
today=datetime(2010, 2, 10),
expected=datetime(2015, 2, 1),
),
# "current" day with "last" month: expected month is December, day from `today`.
param(
"2015",
prefer_day="current",
prefer_month="last",
today=datetime(2010, 2, 10),
expected=datetime(2015, 12, 10),
),
# "last" day with "last" month: the last day must be December's (31),
# not that of the month in `today`.
param(
"2015",
prefer_day="last",
prefer_month="last",
today=datetime(2010, 2, 10),
expected=datetime(2015, 12, 31),
),
param(
"2020", # Leap year last day test
prefer_day="last",
prefer_month="current",
today=datetime(2010, 2, 10),
expected=datetime(2020, 2, 29),
),
]
)
# Parses `date_string` with the given day/month preferences and `today` as
# RELATIVE_BASE, then checks that the date parser produced exactly `expected`.
def test_dates_with_no_day_or_month(
self, date_string, prefer_day, prefer_month, today=None, expected=None
):
# Build the parser with only the three settings under test.
self.given_parser(
settings={
"PREFER_DAY_OF_MONTH": prefer_day,
"PREFER_MONTH_OF_YEAR": prefer_month,
"RELATIVE_BASE": today,
}
)
self.when_date_is_parsed(date_string)
self.then_date_was_parsed_by_date_parser()
# Exact comparison against the expected datetime.
self.then_date_obj_exactly_is(expected)

def given_local_tz_offset(self, offset):
self.add_patch(
patch.object(
Expand Down
12 changes: 9 additions & 3 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ def test_search_date_string(self, shortname, datetime_string):
"Die UdSSR blieb gemäß dem Neutralitätspakt "
"vom 13. April 1941 gegenüber Japan vorerst neutral.",
[
("Die", datetime.datetime(1999, 12, 28, 0, 0)),
("Die", datetime.datetime(1999, 1, 28, 0, 0)),
("13. April 1941", datetime.datetime(1941, 4, 13, 0, 0)),
],
settings={"RELATIVE_BASE": datetime.datetime(2000, 1, 1)},
Expand Down Expand Up @@ -825,7 +825,10 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non
"бомбардировки срещу Япония, използувайки новозавладените острови като бази.",
),
# Chinese
param("zh", "不過大多數人仍多把第二次世界大戰的爆發定為1939年9月1日德國入侵波蘭開始,2015年04月08日10点05。"),
param(
"zh",
"不過大多數人仍多把第二次世界大戰的爆發定為1939年9月1日德國入侵波蘭開始,2015年04月08日10点05。",
),
# Czech
param(
"cs",
Expand Down Expand Up @@ -897,7 +900,10 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non
"d'Etiopia. Il 9 maggio 1936 venne proclamato l'Impero. ",
),
# Japanese
param("ja", "1933年(昭和8年)12月23日午前6時39分、宮城(現:皇居)内の産殿にて誕生。"),
param(
"ja",
"1933年(昭和8年)12月23日午前6時39分、宮城(現:皇居)内の産殿にて誕生。",
),
# Persian
param("fa", "نگ جهانی دوم جنگ جدی بین سپتامبر 1939 و 2 سپتامبر 1945 بود."),
# Polish
Expand Down

0 comments on commit 748e48a

Please sign in to comment.