From 769e4c027fdf96e188f3a16985f6d7befca92fc7 Mon Sep 17 00:00:00 2001 From: Sjoerd Langkemper Date: Wed, 11 Jan 2023 20:31:48 +0100 Subject: [PATCH] Prevent ReDoS in Spanish sentence splitting regex (#1084) --- dateparser/languages/locale.py | 6 ++---- tests/test_search.py | 5 +++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dateparser/languages/locale.py b/dateparser/languages/locale.py index a64c7e79b..e3b43b6be 100644 --- a/dateparser/languages/locale.py +++ b/dateparser/languages/locale.py @@ -263,7 +263,7 @@ def _sentence_split(self, string, settings): splitters_dict = {1: r'[\.!?;…\r\n]+(?:\s|$)*', # most European, Tagalog, Hebrew, Georgian, # Indonesian, Vietnamese - 2: r'(?:[¡¿]+|[\.!?;…\r\n]+(?:\s|$))+', # Spanish + 2: r'[\.!?;…\r\n]+(\s*[¡¿]*|$)|[¡¿]+', # Spanish 3: r'[|!?;\r\n]+(?:\s|$)+', # Hindi and Bangla 4: r'[。…‥\.!??!;\r\n]+(?:\s|$)+', # Japanese and Chinese 5: r'[\r\n]+', # Thai @@ -275,9 +275,7 @@ def _sentence_split(self, string, settings): split_reg = abbreviation_string + splitters_dict[self.info['sentence_splitter_group']] sentences = re.split(split_reg, string) - for i in sentences: - if not i: - sentences.remove(i) + sentences = filter(None, sentences) return sentences def _simplify_split_align(self, original, settings): diff --git a/tests/test_search.py b/tests/test_search.py index 05ab054b5..b292fab78 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -416,6 +416,10 @@ def test_search_date_string(self, shortname, datetime_string): ('de 1941', datetime.datetime(1941, 1, 1, 0, 0))], settings={'RELATIVE_BASE': datetime.datetime(2000, 1, 1)}), + param('es', '¡¡Ay!! En Madrid, a 17 de marzo de 1615. ¿Vos bueno?', + [('a 17 de marzo de 1615', datetime.datetime(1615, 3, 17, 0, 0))], + settings={'RELATIVE_BASE': datetime.datetime(2000, 1, 1)}), + # Swedish param('sv', 'Efter kommunisternas seger 1922 drog de allierade och Japan bort sina trupper.', [('1922', datetime.datetime(1922, 1, 1, 0, 0))], @@ -657,6 +661,7 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non # Spanish param('es', '11 junio 2010'), + param('es', '¡¡Ay!! En Madrid, a 17 de marzo de 1615. ¿Vos bueno?'), # Swedish param('sv', ' den 15 augusti 1945 då Kejsardömet'),