Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing #894 and correcting tests #901

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5583f3a
Fixing Issue #894
gavishpoddar Mar 30, 2021
fff5112
Tests are corrected for recognising am
gavishpoddar Mar 30, 2021
a77cb4c
Fixing the issue #894
gavishpoddar Mar 30, 2021
3a93e8a
Removing unnecessary comments
gavishpoddar Mar 31, 2021
0ea06bc
Improving test_search.py
gavishpoddar Mar 31, 2021
4e729e8
Correcting 'am' for German Language
gavishpoddar Mar 31, 2021
5773f76
Update test_search.py
gavishpoddar Mar 31, 2021
b169301
Update search.py
gavishpoddar Mar 31, 2021
fba58dd
Fixing the static date to datetime.datetime.utcnow().day
gavishpoddar Apr 5, 2021
2aff295
Update test_search.py
gavishpoddar Apr 5, 2021
5d8e3d5
Update test_search.py
gavishpoddar Apr 5, 2021
c21e428
Update test_search.py
gavishpoddar Apr 5, 2021
34f49b9
Update dateparser/search/search.py
gavishpoddar Apr 12, 2021
50c1d30
Update search.py
gavishpoddar Apr 12, 2021
0563be4
newlines removed
gavishpoddar Apr 12, 2021
c089907
Update test_search.py
gavishpoddar Apr 12, 2021
dbd12ec
Adding comment
gavishpoddar Apr 12, 2021
4343cd4
Fixing test from line break to comma
gavishpoddar Apr 16, 2021
2ff140e
Fixing tests
gavishpoddar Apr 16, 2021
14bb443
Update test_search.py
gavishpoddar Apr 16, 2021
840e2a3
Update test_search.py
gavishpoddar Apr 16, 2021
2f61776
Test's fixed
gavishpoddar Apr 16, 2021
4810bc6
Using re.sub to replace
gavishpoddar Apr 16, 2021
95fbe1d
Fixing tests
gavishpoddar Apr 16, 2021
61bfe13
Update test_search.py
gavishpoddar Apr 16, 2021
98b6ba6
Update test_search.py
gavishpoddar Apr 16, 2021
da1c1c3
Merge branch 'scrapinghub:master' into master
gavishpoddar May 19, 2021
ff1e856
Merge branch 'scrapinghub:master' into master
gavishpoddar Jun 1, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions dateparser/search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,12 @@ def split_if_not_parsed(self, item, original):
possible_splits.extend(self.split_by(item, original, splitter))
return possible_splits

def parse_item(self, parser, item, translated_item, parsed, need_relative_base):
def parse_item(self, parser, item, translated_item, parsed, need_relative_base, language):
gavishpoddar marked this conversation as resolved.
Show resolved Hide resolved
relative_base = None
item = item.replace('ngày', '')
item = item.replace('am', '')

if language == "de":
gavishpoddar marked this conversation as resolved.
Show resolved Hide resolved
item = item.replace('am', '')
gavishpoddar marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Collaborator

@noviluni noviluni Apr 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if language == "de":
item = item.replace('am', '')
if language == "de":
# Replace "am" because "am 8" means "on the 8th" but "8 am" actually
# means "8 am"
item = item.replace('am', '')

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmmm... Maybe we can use regex (re.sub) to remove it only when preceding a number? 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @noviluni, I have made some can you please check and recommend


parsed_item = parser.get_date_data(item)
is_relative = date_is_relative(translated_item)

Expand All @@ -101,7 +103,7 @@ def parse_item(self, parser, item, translated_item, parsed, need_relative_base):
parsed_item = parser.get_date_data(item)
return parsed_item, is_relative

def parse_found_objects(self, parser, to_parse, original, translated, settings):
def parse_found_objects(self, parser, to_parse, original, translated, settings, language):
gavishpoddar marked this conversation as resolved.
Show resolved Hide resolved
parsed = []
substrings = []
need_relative_base = True
Expand All @@ -111,7 +113,7 @@ def parse_found_objects(self, parser, to_parse, original, translated, settings):
if len(item) <= 2:
continue

parsed_item, is_relative = self.parse_item(parser, item, translated[i], parsed, need_relative_base)
parsed_item, is_relative = self.parse_item(parser, item, translated[i], parsed, need_relative_base, language)
if parsed_item['date_obj']:
parsed.append((parsed_item, is_relative))
substrings.append(original[i].strip(" .,:()[]-'"))
Expand All @@ -131,7 +133,7 @@ def parse_found_objects(self, parser, to_parse, original, translated, settings):
if len(jtem) <= 2:
continue
parsed_jtem, is_relative_jtem = self.parse_item(
parser, jtem, split_translated[j], current_parsed, need_relative_base)
parser, jtem, split_translated[j], current_parsed, need_relative_base, language)
current_parsed.append((parsed_jtem, is_relative_jtem))
current_substrings.append(split_original[j].strip(' .,:()[]-'))
possible_parsed.append(current_parsed)
Expand All @@ -153,9 +155,12 @@ def search_parse(self, shortname, text, settings):
languages = [shortname]
to_parse = original




gavishpoddar marked this conversation as resolved.
Show resolved Hide resolved
gavishpoddar marked this conversation as resolved.
Show resolved Hide resolved
parser = DateDataParser(languages=languages, settings=settings)
parsed, substrings = self.parse_found_objects(parser=parser, to_parse=to_parse,
original=original, translated=translated, settings=settings)
original=original, translated=translated, settings=settings, language=shortname)
parser._settings = Settings()
return list(zip(substrings, [i[0]['date_obj'] for i in parsed]))

Expand Down
13 changes: 7 additions & 6 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,15 +461,16 @@ def test_search_and_parse(self, shortname, string, expected, settings=None):
January UTC
June 5 am utc
June 23th 5 pm EST
May 31, 8am UTC""",
May 31
8am UTC""",
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
[('May 2020', datetime.datetime(2020, 5, datetime.datetime.utcnow().day, 0, 0)),
('June 2020', datetime.datetime(2020, 6, datetime.datetime.utcnow().day, 0, 0)),
('2023', datetime.datetime(2023, 6, datetime.datetime.utcnow().day, 0, 0)),
('January UTC', datetime.datetime(2023, 1, datetime.datetime.utcnow().day, 0, 0, tzinfo=pytz.utc)),
('June 5 am utc', datetime.datetime(2023, 6, 5, 0, 0, tzinfo=pytz.utc)),
('June 2020', datetime.datetime(2020, 6, 30, 0, 0)),
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
('2023', datetime.datetime(2023, 6, 30, 0, 0)),
('January UTC', datetime.datetime(2023, 1, 30, 0, 0, tzinfo=pytz.utc)),
('June 5 am utc', datetime.datetime(2023, 6, 30, 5, 0, tzinfo=pytz.utc)),
('June 23th 5 pm EST', datetime.datetime(2023, 6, 23, 17, 0, tzinfo=pytz.timezone("EST"))),
('May 31', datetime.datetime(2023, 5, 31, 0, 0)),
('8am UTC', datetime.datetime(2023, 8, 31, 0, 0, tzinfo=pytz.utc))]),
('8am UTC', datetime.datetime(2023, 5, 31, 8, 0, tzinfo=pytz.utc))]),

# Russian
param('ru', '19 марта 2001 был хороший день. 20 марта тоже был хороший день. 21 марта был отличный день.',
Expand Down