From b2ac745b63ba90180abb143cac705cc73c52ae7e Mon Sep 17 00:00:00 2001 From: atrifonov Date: Thu, 12 Nov 2020 12:26:42 +0200 Subject: [PATCH 1/7] make `now` attribute of `FreshnessDateDataParser` a local variable --- dateparser/freshness_date_parser.py | 37 +++++++++++++---------------- tests/test_freshness_date_parser.py | 1 - 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/dateparser/freshness_date_parser.py b/dateparser/freshness_date_parser.py index 969eb8a59..cd8609c99 100644 --- a/dateparser/freshness_date_parser.py +++ b/dateparser/freshness_date_parser.py @@ -16,8 +16,6 @@ class FreshnessDateDataParser: """ Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """ - def __init__(self): - self.now = None def _are_all_words_units(self, date_string): skip = [_UNITS, @@ -59,42 +57,42 @@ def apply_time(dateobj, timeobj): ) if settings.RELATIVE_BASE: - self.now = settings.RELATIVE_BASE + now = settings.RELATIVE_BASE if 'local' not in _settings_tz: - self.now = localize_timezone(self.now, settings.TIMEZONE) + now = localize_timezone(now, settings.TIMEZONE) if ptz: - if self.now.tzinfo: - self.now = self.now.astimezone(ptz) + if now.tzinfo: + now = now.astimezone(ptz) else: if hasattr(ptz, 'localize'): - self.now = ptz.localize(self.now) + now = ptz.localize(now) else: - self.now = self.now.replace(tzinfo=ptz) + now = now.replace(tzinfo=ptz) - if not self.now.tzinfo: + if not now.tzinfo: if hasattr(self.get_local_tz(), 'localize'): - self.now = self.get_local_tz().localize(self.now) + now = self.get_local_tz().localize(now) else: - self.now = self.now.replace(tzinfo=self.get_local_tz()) + now = now.replace(tzinfo=self.get_local_tz()) elif ptz: _now = datetime.now(ptz) if 'local' in _settings_tz: - self.now = _now + now = _now else: - self.now = apply_timezone(_now, settings.TIMEZONE) + now = apply_timezone(_now, settings.TIMEZONE) else: if 'local' not in _settings_tz: utc_dt = datetime.utcnow() - self.now = 
apply_timezone(utc_dt, settings.TIMEZONE) + now = apply_timezone(utc_dt, settings.TIMEZONE) else: - self.now = datetime.now(self.get_local_tz()) + now = datetime.now(self.get_local_tz()) - date, period = self._parse_date(date_string, settings.PREFER_DATES_FROM) + date, period = self._parse_date(date_string, now, settings.PREFER_DATES_FROM) if date: old_date = date @@ -112,10 +110,9 @@ def apply_time(dateobj, timeobj): ): date = date.replace(tzinfo=None) - self.now = None return date, period - def _parse_date(self, date_string, prefer_dates_from): + def _parse_date(self, date_string, now, prefer_dates_from): if not self._are_all_words_units(date_string): return None, None @@ -135,9 +132,9 @@ def _parse_date(self, date_string, prefer_dates_from): or re.search(r'\bfuture\b', prefer_dates_from) and not re.search(r'\bago\b', date_string) ): - date = self.now + td + date = now + td else: - date = self.now - td + date = now - td return date, period def get_kwargs(self, date_string): diff --git a/tests/test_freshness_date_parser.py b/tests/test_freshness_date_parser.py index d3d0877c5..6260c8320 100644 --- a/tests/test_freshness_date_parser.py +++ b/tests/test_freshness_date_parser.py @@ -1666,7 +1666,6 @@ def wrapped(*args, **kwargs): collecting_get_date_data(freshness_date_parser.get_date_data))) self.freshness_parser = Mock(wraps=freshness_date_parser) - self.add_patch(patch.object(self.freshness_parser, 'now', self.now)) dt_mock = Mock(wraps=dateparser.freshness_date_parser.datetime) dt_mock.utcnow = Mock(return_value=self.now) From 6b6dd2df4f948fdb082c903ab87c7afaed3ccd36 Mon Sep 17 00:00:00 2001 From: atrifonov Date: Thu, 12 Nov 2020 12:28:31 +0200 Subject: [PATCH 2/7] use other kwargs when constructing settings to avoid multiple locales/languages concurrently change `settings.DATE_ORDER` --- dateparser/conf.py | 47 +++++++++++++++++------------ dateparser/search/text_detection.py | 2 +- tests/test_freshness_date_parser.py | 2 +- tests/test_settings.py | 2 +- 
tests/test_utils.py | 6 ++-- 5 files changed, 35 insertions(+), 24 deletions(-) diff --git a/dateparser/conf.py b/dateparser/conf.py index c14374a3f..80cdc00c5 100644 --- a/dateparser/conf.py +++ b/dateparser/conf.py @@ -31,19 +31,21 @@ class Settings: _pyfile_data = None _mod_settings = dict() - def __init__(self, settings=None): - if settings: - self._updateall(settings.items()) - else: + def __init__(self, **kwargs): + if not kwargs.get('settings'): self._updateall(self._get_settings_from_pyfile().items()) + elif len(self.__dict__) == 1: + self._updateall(kwargs['settings'].items()) + @classmethod - def get_key(cls, settings=None): - if not settings: - return 'default' + def get_key(cls, **kwargs): + if kwargs: + keys = [f'{key}-{val}' for key, val in kwargs.pop('settings').items()] + keys.extend([f'{key}-{val}' for key, val in kwargs.items() if val]) + return hashlib.md5(''.join(sorted(keys)).encode('utf-8')).hexdigest() - keys = sorted(['%s-%s' % (key, str(settings[key])) for key in settings]) - return hashlib.md5(''.join(keys).encode('utf-8')).hexdigest() + return 'default' @classmethod def _get_settings_from_pyfile(cls): @@ -57,18 +59,21 @@ def _updateall(self, iterable): setattr(self, key, value) def replace(self, mod_settings=None, **kwds): - for k, v in kwds.items(): + _settings = kwds.get('settings', {}).copy() + for k, v in _settings.items(): if v is None: raise TypeError('Invalid {{"{}": {}}}'.format(k, v)) - for x in self._get_settings_from_pyfile().keys(): - kwds.setdefault(x, getattr(self, x)) + z = self._get_settings_from_pyfile().keys() + for x in z: + _settings.setdefault(x, getattr(self, x)) - kwds['_default'] = False + _settings['_default'] = False if mod_settings: - kwds['_mod_settings'] = mod_settings + _settings['_mod_settings'] = mod_settings - return self.__class__(settings=kwds) + kwds['settings'] = _settings + return self.__class__(**kwds) settings = Settings() @@ -77,11 +82,15 @@ def replace(self, mod_settings=None, **kwds): def 
apply_settings(f): @wraps(f) def wrapper(*args, **kwargs): - mod_settings = kwargs.get('settings') - kwargs['settings'] = mod_settings or settings + mod_settings = kwargs.get('settings', {}) + if mod_settings is None: + kwargs['settings'] = mod_settings = {} - if isinstance(kwargs['settings'], dict): - kwargs['settings'] = settings.replace(mod_settings=mod_settings, **kwargs['settings']) + if kwargs: + if isinstance(mod_settings, dict): + kwargs['settings'] = settings.replace(mod_settings=mod_settings.copy(), **kwargs) + else: + kwargs['settings'] = settings if not isinstance(kwargs['settings'], Settings): raise TypeError("settings can only be either dict or instance of Settings class") diff --git a/dateparser/search/text_detection.py b/dateparser/search/text_detection.py index c9b45aa2a..4317cf5e3 100644 --- a/dateparser/search/text_detection.py +++ b/dateparser/search/text_detection.py @@ -11,7 +11,7 @@ def __init__(self, languages): self.language_chars = [] def get_unique_characters(self, settings): - settings = settings.replace(NORMALIZE=False) + settings = settings.replace(settings={'NORMALIZE': False}) for language in self.languages: chars = language.get_wordchars_for_detection(settings=settings) diff --git a/tests/test_freshness_date_parser.py b/tests/test_freshness_date_parser.py index 6260c8320..81337018b 100644 --- a/tests/test_freshness_date_parser.py +++ b/tests/test_freshness_date_parser.py @@ -1546,7 +1546,7 @@ def test_freshness_date_with_timezone_conversion(self, date_string, timezone, to self.then_time_is(time) def test_freshness_date_with_to_timezone_setting(self): - _settings = settings.replace(**{ + _settings = settings.replace(settings={ 'TIMEZONE': 'local', 'TO_TIMEZONE': 'UTC', 'RELATIVE_BASE': datetime(2014, 9, 1, 10, 30) diff --git a/tests/test_settings.py b/tests/test_settings.py index 44c355c9c..b08b23583 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -76,7 +76,7 @@ def given_configurations(self, confs): if 'TIMEZONE' 
not in confs: confs.update({'TIMEZONE': 'local'}) - self.confs = settings.replace(**confs) + self.confs = settings.replace(settings=confs) def when_date_is_parsed(self): self.result = parse(self.given_ds, settings=(self.confs or {})) diff --git a/tests/test_utils.py b/tests/test_utils.py index a344606df..697290edb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -84,7 +84,9 @@ def test_apply_timezone_function(self, date, timezone, expected): param(datetime(2015, 12, 12, 10, 12), timezone='-0500', expected=datetime(2015, 12, 12, 5, 12)), ]) def test_apply_timezone_from_settings_function(self, date, timezone, expected): - result = apply_timezone_from_settings(date, settings.replace(**{'TO_TIMEZONE': timezone, 'TIMEZONE': 'UTC'})) + result = apply_timezone_from_settings(date, + settings.replace(settings={'TO_TIMEZONE': timezone, 'TIMEZONE': 'UTC'}) + ) self.assertEqual(expected, result) @parameterized.expand([ @@ -101,7 +103,7 @@ def test_apply_timezone_from_settings_function_none_settings(self, date, expecte param(datetime(2015, 12, 12, 10, 12),), ]) def test_apply_timezone_from_settings_function_should_return_tz(self, date): - result = apply_timezone_from_settings(date, settings.replace(**{'RETURN_AS_TIMEZONE_AWARE': True})) + result = apply_timezone_from_settings(date, settings.replace(settings={'RETURN_AS_TIMEZONE_AWARE': True})) self.assertTrue(bool(result.tzinfo)) def test_registry_when_get_keys_not_implemented(self): From 66f6b7daf379a35589205ffeb0fab26a4651489f Mon Sep 17 00:00:00 2001 From: atrifonov Date: Thu, 12 Nov 2020 12:28:56 +0200 Subject: [PATCH 3/7] add concurrency test --- tests/test_concurrency.py | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tests/test_concurrency.py diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py new file mode 100644 index 000000000..d6aa15fd5 --- /dev/null +++ b/tests/test_concurrency.py @@ -0,0 +1,57 @@ +import concurrent.futures +import random 
+from datetime import datetime + +import dateparser +from tests import BaseTestCase + +RELATIVE = {'RELATIVE_BASE': datetime(2014, 9, 15, 10, 30)} + +TEST_DATA = [ + {'ds': 'Tue May 07, 2018 10:55 PM', 'expected': datetime(2018, 5, 7, 22, 55), 'loc': 'en'}, + {'ds': '2018-10-07T22:55:01', 'expected': datetime(2018, 10, 7, 22, 55, 1), 'loc': 'en'}, + {'ds': '2018-Oct-11', 'expected': datetime(2018, 10, 11, 0, 0), 'loc': 'en'}, + {'ds': '12.04.2018', 'expected': datetime(2018, 12, 4, 0, 0), 'loc': 'en'}, + {'ds': '12-10-2018 20:13', 'expected': datetime(2018, 12, 10, 20, 13), 'loc': 'en'}, + {'ds': '03.04.2019', 'expected': datetime(2019, 4, 3, 0, 0), 'loc': 'en-150'}, + {'ds': 'on Tue October 7, 2019 04:55 PM', 'expected': datetime(2019, 10, 7, 16, 55), 'loc': 'en-150'}, + {'ds': '2019Oct8', 'expected': datetime(2019, 10, 8, 0, 0), 'loc': 'en-150'}, + {'ds': '07.03.2020 - 11:13', 'expected': datetime(2020, 3, 7, 11, 13), 'loc': 'ru'}, + {'ds': '9 Авг. 2020 17:11:01', 'expected': datetime(2020, 8, 9, 17, 11, 1), 'loc': 'ru'}, + {'ds': '07.01.2020', 'expected': datetime(2020, 1, 7, 0, 0), 'loc': 'ru'}, + {'ds': 'yesterday 11:00', 'expected': datetime(2014, 9, 14, 11), 'loc': 'en', 'extra': RELATIVE}, + {'ds': '13 days ago', 'expected': datetime(2014, 9, 2, 10, 30), 'loc': 'en', 'extra': RELATIVE}, + ] * 180 + +random.shuffle(TEST_DATA) + + +class TestConcurrency(BaseTestCase): + + def test_concurrency(self): + with concurrent.futures.ThreadPoolExecutor() as executor: + + results = list(executor.map(self.concurrency_test, TEST_DATA)) + results_with_error = [(r['ds'], r['error']) for r in results if r['error']] + self.assertEqual([], results_with_error, + f'{len(results_with_error)} Threads failed with errors:\n{set(results_with_error)}') + + wrong_results = [str(r) for r in results if (r['expected'] != r['date'])] + w_r_output = '\n'.join(wrong_results) + self.assertEqual([], wrong_results, + f'{len(wrong_results)} Threads returned wrong date time:\n{w_r_output}') + + 
@staticmethod + def concurrency_test(data_for_test): + try: + date_string = data_for_test['ds'] + date = dateparser.parse(date_string, locales=[data_for_test['loc']], + settings=data_for_test.get('extra')) + if date: + data_for_test['date'] = date + data_for_test['error'] = None + except Exception as error: + data_for_test['error'] = str(error) + data_for_test['date'] = None + finally: + return data_for_test From 5296d46990299679d92667ca628f4c4304897730 Mon Sep 17 00:00:00 2001 From: atrifonov Date: Thu, 12 Nov 2020 12:41:09 +0200 Subject: [PATCH 4/7] fix tox E126 --- tests/test_concurrency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py index d6aa15fd5..b88b6e2db 100644 --- a/tests/test_concurrency.py +++ b/tests/test_concurrency.py @@ -21,7 +21,7 @@ {'ds': '07.01.2020', 'expected': datetime(2020, 1, 7, 0, 0), 'loc': 'ru'}, {'ds': 'yesterday 11:00', 'expected': datetime(2014, 9, 14, 11), 'loc': 'en', 'extra': RELATIVE}, {'ds': '13 days ago', 'expected': datetime(2014, 9, 2, 10, 30), 'loc': 'en', 'extra': RELATIVE}, - ] * 180 +] * 180 random.shuffle(TEST_DATA) From 5294b234f7dea6b1d5228cec162714b02dbdde7e Mon Sep 17 00:00:00 2001 From: atrifonov Date: Thu, 12 Nov 2020 14:26:57 +0200 Subject: [PATCH 5/7] convert f-strings to format() --- dateparser/conf.py | 4 ++-- tests/test_concurrency.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dateparser/conf.py b/dateparser/conf.py index 80cdc00c5..b72aff114 100644 --- a/dateparser/conf.py +++ b/dateparser/conf.py @@ -41,8 +41,8 @@ def __init__(self, **kwargs): @classmethod def get_key(cls, **kwargs): if kwargs: - keys = [f'{key}-{val}' for key, val in kwargs.pop('settings').items()] - keys.extend([f'{key}-{val}' for key, val in kwargs.items() if val]) + keys = sorted('{}-{}'.format(key, val) for key, val in kwargs.pop('settings').items()) + keys.extend(sorted('{}-{}'.format(key, val) for key, val in kwargs.items() if 
val)) return hashlib.md5(''.join(sorted(keys)).encode('utf-8')).hexdigest() return 'default' diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py index b88b6e2db..2cdb29b5c 100644 --- a/tests/test_concurrency.py +++ b/tests/test_concurrency.py @@ -33,13 +33,14 @@ def test_concurrency(self): results = list(executor.map(self.concurrency_test, TEST_DATA)) results_with_error = [(r['ds'], r['error']) for r in results if r['error']] + msg = '{}Threads failed with errors:\n{}' self.assertEqual([], results_with_error, - f'{len(results_with_error)} Threads failed with errors:\n{set(results_with_error)}') + msg.format(len(results_with_error), set(results_with_error))) wrong_results = [str(r) for r in results if (r['expected'] != r['date'])] - w_r_output = '\n'.join(wrong_results) + msg = '{} Threads returned wrong date time:\n{}' self.assertEqual([], wrong_results, - f'{len(wrong_results)} Threads returned wrong date time:\n{w_r_output}') + msg.format(len(wrong_results), '\n'.join(wrong_results))) @staticmethod def concurrency_test(data_for_test): From 1ba7a9ca4b91447980730b8e1557b59f11c585a7 Mon Sep 17 00:00:00 2001 From: atrifonov Date: Fri, 20 Nov 2020 18:06:21 +0200 Subject: [PATCH 6/7] rm excess variable from replace --- dateparser/conf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dateparser/conf.py b/dateparser/conf.py index b72aff114..c692a6375 100644 --- a/dateparser/conf.py +++ b/dateparser/conf.py @@ -64,9 +64,8 @@ def replace(self, mod_settings=None, **kwds): if v is None: raise TypeError('Invalid {{"{}": {}}}'.format(k, v)) - z = self._get_settings_from_pyfile().keys() - for x in z: - _settings.setdefault(x, getattr(self, x)) + for key in self._get_settings_from_pyfile().keys(): + _settings.setdefault(key, getattr(self, key)) _settings['_default'] = False if mod_settings: From 6ea11c2c1b799a66a20701623c45d39a7067419c Mon Sep 17 00:00:00 2001 From: atrifonov Date: Wed, 25 Nov 2020 09:05:13 +0200 Subject: [PATCH 7/7] avoid 
double assignment --- dateparser/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dateparser/conf.py b/dateparser/conf.py index c692a6375..c11860f03 100644 --- a/dateparser/conf.py +++ b/dateparser/conf.py @@ -83,7 +83,7 @@ def apply_settings(f): def wrapper(*args, **kwargs): mod_settings = kwargs.get('settings', {}) if mod_settings is None: - kwargs['settings'] = mod_settings = {} + kwargs['settings'], mod_settings = {}, {} if kwargs: if isinstance(mod_settings, dict):