diff --git a/dateparser/languages/locale.py b/dateparser/languages/locale.py index 0cb578552..6c37531e8 100644 --- a/dateparser/languages/locale.py +++ b/dateparser/languages/locale.py @@ -17,15 +17,12 @@ class Locale: """ Class that deals with applicability and translation from a locale. - :param shortname: A locale code, e.g. 'fr-PF', 'qu-EC', 'af-NA'. :type shortname: str - :param language_info: Language info (translation data) of the language the locale belongs to. :type language_info: dict - :return: A Locale instance """ @@ -50,15 +47,12 @@ def __init__(self, shortname, language_info): def is_applicable(self, date_string, strip_timezone=False, settings=None): """ Check if the locale is applicable to translate date string. - :param date_string: A string representing date and/or time in a recognizably valid format. :type date_string: str - :param strip_timezone: If True, timezone is stripped from date string. :type strip_timezone: bool - :return: boolean value representing if the locale is applicable for the date string or not. """ if strip_timezone: @@ -110,15 +104,12 @@ def clean_dictionary(dictionary, threshold=2): def translate(self, date_string, keep_formatting=False, settings=None): """ Translate the date string to its English equivalent. - :param date_string: A string representing date and/or time in a recognizably valid format. :type date_string: str - :param keep_formatting: If True, retain formatting of the date string after translation. :type keep_formatting: bool - :return: translated date string. 
""" date_string = self._translate_numerals(date_string) @@ -268,12 +259,20 @@ def _sentence_split(self, string, settings): 4: r'[。…‥\.!??!;\r\n]+(?:\s|$)+', # Japanese and Chinese 5: r'[\r\n]+', # Thai 6: r'[\r\n؟!\.…]+(?:\s|$)+'} # Arabic and Farsi + + sentences = [] + re_dot_date = r'(\d+\.\d+\.\d+)' + for dot_date_object in reversed(list(re.finditer(re_dot_date, string))): + start_index, end_index = dot_date_object.span() + string = string[:start_index] + string[end_index:] + sentences.append(dot_date_object.group()) + if 'sentence_splitter_group' not in self.info: split_reg = abbreviation_string + splitters_dict[1] - sentences = re.split(split_reg, string) + sentences.extend(re.split(split_reg, string)) else: split_reg = abbreviation_string + splitters_dict[self.info['sentence_splitter_group']] - sentences = re.split(split_reg, string) + sentences.extend(re.split(split_reg, string)) for i in sentences: if not i: diff --git a/dateparser/search/__init__.py b/dateparser/search/__init__.py index 67205d4b0..bdb62eeab 100644 --- a/dateparser/search/__init__.py +++ b/dateparser/search/__init__.py @@ -1,63 +1,119 @@ from dateparser.search.search import DateSearchWithDetection +from dateparser.conf import apply_settings -_search_with_detection = DateSearchWithDetection() +_search_dates = DateSearchWithDetection() +@apply_settings def search_dates(text, languages=None, settings=None, add_detected_language=False, detect_languages_function=None): """Find all substrings of the given string which represent date and/or time and parse them. - :param text: - A string in a natural language which may contain date and/or time expressions. - :type text: str - - :param languages: - A list of two letters language codes.e.g. ['en', 'es']. If languages are given, it will - not attempt to detect the language. - :type languages: list - - :param settings: - Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. 
- :type settings: dict - - :param add_detected_language: - Indicates if we want the detected language returned in the tuple. - :type add_detected_language: bool - - :param detect_languages_function: - A function for language detection that takes as input a `text` and a `confidence_threshold`, - and returns a list of detected language codes. - Note: detect_languages_function is only uses if `languages` are not provided. - :type detect_languages_function: function - - :return: Returns list of tuples containing: - substrings representing date and/or time, corresponding :mod:`datetime.datetime` - object and detected language if *add_detected_language* is True. - Returns None if no dates that can be parsed are found. - :rtype: list - :raises: ValueError - Unknown Language - - >>> from dateparser.search import search_dates - >>> search_dates('The first artificial Earth satellite was launched on 4 October 1957.') - [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0))] - - >>> search_dates('The first artificial Earth satellite was launched on 4 October 1957.', - >>> add_detected_language=True) - [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0), 'en')] - - >>> search_dates("The client arrived to the office for the first time in March 3rd, 2004 " - >>> "and got serviced, after a couple of months, on May 6th 2004, the customer " - >>> "returned indicating a defect on the part") - [('in March 3rd, 2004 and', datetime.datetime(2004, 3, 3, 0, 0)), - ('on May 6th 2004', datetime.datetime(2004, 5, 6, 0, 0))] - - """ - result = _search_with_detection.search_dates( + :param text: + A string in a natural language which may contain the date and/or time expressions. + :type text: str + + :param languages: + A list of two letters language codes.e.g. ['en', 'es']. If languages are given, it will + not attempt to detect the language. + :type languages: list + + :param settings: + Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. 
+ :type settings: dict + + :param add_detected_language: + Indicates if we want the detected language returned in the tuple. + :type add_detected_language: bool + + :return: Returns list of tuples containing: + substrings representing date and/or time, corresponding :mod:`datetime.datetime` + object and detected language if *add_detected_language* is True. + Returns None if no dates that can be parsed are found. + :rtype: list + :raises: ValueError - Unknown Language + + >>> from dateparser.search import search_dates + >>> search_dates('The first artificial Earth satellite was launched on 4 October 1957.') + [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0))] + + >>> search_dates('The first artificial Earth satellite was launched on 4 October 1957.', + >>> add_detected_language=True) + [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0), 'en')] + + >>> search_dates("The client arrived to the office for the first time in March 3rd, 2004 " + >>> "and got serviced, after a couple of months, on May 6th 2004, the customer " + >>> "returned indicating a defect on the part") + [('in March 3rd, 2004 and', datetime.datetime(2004, 3, 3, 0, 0)), + ('on May 6th 2004', datetime.datetime(2004, 5, 6, 0, 0))] + + """ + + result = _search_dates.search_dates( text=text, languages=languages, settings=settings, detect_languages_function=detect_languages_function ) - dates = result.get('Dates') + + dates = result.get("Dates") if dates: if add_detected_language: - language = result.get('Language') - dates = [date + (language, ) for date in dates] + language = result.get("Language") + dates = [date + (language,) for date in dates] return dates + + +@apply_settings +def search_first_date(text, languages=None, settings=None, add_detected_language=False, detect_languages_function=None): + """Find first substring of the given string which represent date and/or time and parse it. 
+ + :param text: + A string in a natural language which may contain the date and/or time expression. + :type text: str + + :param languages: + A list of two letters language codes.e.g. ['en', 'es']. If languages are given, it will + not attempt to detect the language. + :type languages: list + + :param settings: + Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. + :type settings: dict + + :param add_detected_language: + Indicates if we want the detected language returned in the tuple. + :type add_detected_language: bool + + :return: Returns a tuple containing: + substring representing date and/or time, corresponding :mod:`datetime.datetime` + object and detected language if *add_detected_language* is True. + Returns None if no dates that can be parsed are found. + :rtype: tuple + :raises: ValueError - Unknown Language + + >>> from dateparser.search import search_first_date + >>> search_first_date('The first artificial Earth satellite was launched on 4 October 1957.') + ('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0)) + + >>> from dateparser.search import search_first_date + >>> search_first_date('Caesar Augustus, also known as Octavian') + None + + >>> search_first_date('The first artificial Earth satellite was launched on 4 October 1957.', + >>> add_detected_language=True) + ('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0), 'en') + + >>> search_first_date("The client arrived to the office for the first time in March 3rd, 2004 " + >>> "and got serviced, after a couple of months, on May 6th 2004, the customer " + >>> "returned indicating a defect on the part") + ('in March 3rd, 2004 and', datetime.datetime(2004, 3, 3, 0, 0)) + + """ + + result = _search_dates.search_dates( + text=text, languages=languages, limit_date_search_results=1, settings=settings, detect_languages_function=detect_languages_function + ) + dates = result.get("Dates") + if dates: + if add_detected_language: + language = 
result.get("Language") + dates = [date + (language,) for date in dates] + return dates[0] diff --git a/dateparser/search/languages.py b/dateparser/search/languages.py new file mode 100644 index 000000000..2d5a42335 --- /dev/null +++ b/dateparser/search/languages.py @@ -0,0 +1,48 @@ +from collections.abc import Set + +from dateparser.search.text_detection import FullTextLanguageDetector +from dateparser.languages.loader import LocaleDataLoader +from dateparser.custom_language_detection.language_mapping import map_languages + + +class SearchLanguages: + def __init__(self): + self.loader = LocaleDataLoader() + self.available_language_map = self.loader.get_locale_map() + self.language = None + + def get_current_language(self, language_shortname): + if self.language is None or self.language.shortname != language_shortname: + self.language = self.loader.get_locale(language_shortname) + + def translate_objects(self, language_shortname, text, settings): + self.get_current_language(language_shortname) + result = self.language.translate_search(text, settings=settings) + return result + + def detect_language(self, text, languages, settings=None, detect_languages_function=None): + if detect_languages_function and not languages: + detected_languages = detect_languages_function( + text, confidence_threshold=settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD + ) + detected_languages = map_languages(detected_languages) or settings.DEFAULT_LANGUAGES + return detected_languages[0] if detected_languages else None + + if isinstance(languages, (list, tuple, Set)): + if all([language in self.available_language_map for language in languages]): + languages = [self.available_language_map[language] for language in languages] + else: + unsupported_languages = set(languages) - set(self.available_language_map.keys()) + raise ValueError("Unknown language(s): %s" % ', '.join(map(repr, unsupported_languages))) + elif languages is not None: + raise TypeError("languages argument must be a list (%r 
given)" % type(languages)) + + if languages: + self.language_detector = FullTextLanguageDetector(languages=languages) + else: + self.language_detector = FullTextLanguageDetector(list(self.available_language_map.values())) + + detected_language = self.language_detector._best_language(text) or ( + settings.DEFAULT_LANGUAGES[0] if settings.DEFAULT_LANGUAGES else None + ) + return detected_language diff --git a/dateparser/search/search.py b/dateparser/search/search.py index 3fc657810..cff23ff16 100644 --- a/dateparser/search/search.py +++ b/dateparser/search/search.py @@ -1,212 +1,185 @@ -from collections.abc import Set +import re +from string import punctuation -from dateparser.languages.loader import LocaleDataLoader from dateparser.conf import apply_settings, check_settings, Settings from dateparser.date import DateDataParser -from dateparser.search.text_detection import FullTextLanguageDetector -from dateparser.custom_language_detection.language_mapping import map_languages -import regex as re - - -RELATIVE_REG = re.compile("(ago|in|from now|tomorrow|today|yesterday)") - - -def date_is_relative(translation): - return re.search(RELATIVE_REG, translation) is not None - - -class _ExactLanguageSearch: - def __init__(self, loader): - self.loader = loader - self.language = None - - def get_current_language(self, shortname): - if self.language is None or self.language.shortname != shortname: - self.language = self.loader.get_locale(shortname) - - def search(self, shortname, text, settings): - self.get_current_language(shortname) - result = self.language.translate_search(text, settings=settings) - return result - - @staticmethod - def set_relative_base(substring, already_parsed): - if len(already_parsed) == 0: - return substring, None - - i = len(already_parsed) - 1 - while already_parsed[i][1]: - i -= 1 - if i == -1: - return substring, None - relative_base = already_parsed[i][0]['date_obj'] - return substring, relative_base - - def choose_best_split(self, 
possible_parsed_splits, possible_substrings_splits): - rating = [] - for i in range(len(possible_parsed_splits)): - num_substrings = len(possible_substrings_splits[i]) - num_substrings_without_digits = 0 - not_parsed = 0 - for j, item in enumerate(possible_parsed_splits[i]): - if item[0]['date_obj'] is None: - not_parsed += 1 - if not any(char.isdigit() for char in possible_substrings_splits[i][j]): - num_substrings_without_digits += 1 - rating.append([ - num_substrings, - 0 if not_parsed == 0 else (float(not_parsed) / float(num_substrings)), - 0 if num_substrings_without_digits == 0 else ( - float(num_substrings_without_digits) / float(num_substrings))]) - best_index, best_rating = min(enumerate(rating), key=lambda p: (p[1][1], p[1][0], p[1][2])) - return possible_parsed_splits[best_index], possible_substrings_splits[best_index] - - def split_by(self, item, original, splitter): - if item.count(splitter) <= 2: - return [[item.split(splitter), original.split(splitter)]] - - item_all_split = item.split(splitter) - original_all_split = original.split(splitter) - all_possible_splits = [[item_all_split, original_all_split]] - for i in range(2, 4): - item_partially_split = [] - original_partially_split = [] - for j in range(0, len(item_all_split), i): - item_join = splitter.join(item_all_split[j:j + i]) - original_join = splitter.join(original_all_split[j:j + i]) - item_partially_split.append(item_join) - original_partially_split.append(original_join) - all_possible_splits.append([item_partially_split, original_partially_split]) - return all_possible_splits - - def split_if_not_parsed(self, item, original): - splitters = [',', '،', '——', '—', '–', '.', ' '] - possible_splits = [] - for splitter in splitters: - if splitter in item and item.count(splitter) == original.count(splitter): - possible_splits.extend(self.split_by(item, original, splitter)) - return possible_splits - - def parse_item(self, parser, item, translated_item, parsed, need_relative_base): - relative_base 
= None - item = item.replace('ngày', '') - item = item.replace('am', '') - parsed_item = parser.get_date_data(item) - is_relative = date_is_relative(translated_item) - - if need_relative_base: - item, relative_base = self.set_relative_base(item, parsed) - - if relative_base: - parser._settings.RELATIVE_BASE = relative_base - parsed_item = parser.get_date_data(item) - return parsed_item, is_relative - - def parse_found_objects(self, parser, to_parse, original, translated, settings): - parsed = [] - substrings = [] - need_relative_base = True - if settings.RELATIVE_BASE: - need_relative_base = False - for i, item in enumerate(to_parse): - if len(item) <= 2: +from dateparser.search.languages import SearchLanguages + +_drop_words = {"on", "of", "the"} # cause annoying false positives +_bad_date_re = re.compile( + # whole dates we black-list (can still be parts of valid dates) + "^(" + + "|".join( + [ + r"\d{1,3}", # less than 4 digits + r"#\d+", # this is a sequence number + # some common false positives below + r"[-/.]+", # bare separators parsed as current date + r"\w\.?", # one letter (with optional dot) + "an", + ] + ) + + ")$" +) + +_secondary_splitters = [ + ",", + "،", + "——", + "—", + "–", + ".", +] # are used if no date object is found +_punctuations = list(set(punctuation)) + + +def _get_relative_base(already_parsed): + if already_parsed: + return already_parsed[-1][1] + return None + + +def _create_splits(text): + splited_objects = text.split() + return splited_objects + + +def _create_joined_parse(text, max_join=7, sort_ascending=False): + split_objects = _create_splits(text=text) + joint_objects = [] + for i in range(len(split_objects)): + for j in reversed(range(min(max_join, len(split_objects) - i))): + x = " ".join(split_objects[i:i + j + 1]) + if _bad_date_re.match(x): continue - - parsed_item, is_relative = self.parse_item(parser, item, translated[i], parsed, need_relative_base) - if parsed_item['date_obj']: - parsed.append((parsed_item, is_relative)) 
- substrings.append(original[i].strip(" .,:()[]-'")) + if not len(x) > 2: continue - possible_splits = self.split_if_not_parsed(item, original[i]) - if not possible_splits: - continue + joint_objects.append(x) + + if sort_ascending: + joint_objects = sorted(joint_objects, key=len) + + return joint_objects + + +def _get_accurate_return_text(text, parser, datetime_object): + text_candidates = _create_joined_parse(text=text, sort_ascending=True) + for text_candidate in text_candidates: + if parser.get_date_data(text_candidate).date_obj == datetime_object: + return text_candidate + + +def _joint_parse( + text, + parser, + translated=None, + deep_search=True, + accurate_return_text=False, + data_carry=None, + is_recursion_call=False, +): + + if translated and len(translated) <= 2: + return data_carry + + text = text.strip(" .,:()[]-'") + + reduced_text_candidate = None + secondary_split_made = False + returnable_objects = data_carry or [] + joint_based_search_dates = _create_joined_parse(text=text) + for date_object_candidate in joint_based_search_dates: + parsed_date_object = parser.get_date_data(date_object_candidate) + if parsed_date_object.date_obj: + if accurate_return_text: + date_object_candidate = _get_accurate_return_text( + text=date_object_candidate, + parser=parser, + datetime_object=parsed_date_object.date_obj, + ) - possible_parsed = [] - possible_substrings = [] - for split_translated, split_original in possible_splits: - current_parsed = [] - current_substrings = [] - if split_translated: - for j, jtem in enumerate(split_translated): - if len(jtem) <= 2: - continue - parsed_jtem, is_relative_jtem = self.parse_item( - parser, jtem, split_translated[j], current_parsed, need_relative_base) - current_parsed.append((parsed_jtem, is_relative_jtem)) - current_substrings.append(split_original[j].strip(' .,:()[]-')) - possible_parsed.append(current_parsed) - possible_substrings.append(current_substrings) - parsed_best, substrings_best = 
self.choose_best_split(possible_parsed, possible_substrings) - for k in range(len(parsed_best)): - if parsed_best[k][0]['date_obj']: - parsed.append(parsed_best[k]) - substrings.append(substrings_best[k]) - return parsed, substrings - - def search_parse(self, shortname, text, settings): - translated, original = self.search(shortname, text, settings) - bad_translate_with_search = ['vi', 'hu'] # splitting done by spaces and some dictionary items contain spaces - if shortname not in bad_translate_with_search: - languages = ['en'] - to_parse = translated + returnable_objects.append( + (date_object_candidate.strip(" .,:()[]-'"), parsed_date_object.date_obj) + ) + + if deep_search: + start_index = text.find(date_object_candidate) + end_index = start_index + len(date_object_candidate) + reduced_text_candidate = None + if start_index >= 0: + reduced_text_candidate = text[:start_index] + text[end_index:] + break else: - languages = [shortname] - to_parse = original + for splitter in _secondary_splitters: + secondary_split = re.split( + "(? 
1: + reduced_text_candidate = " ".join(secondary_split) + secondary_split_made = True + + if not reduced_text_candidate: + is_previous_punctuation = False + for index, char in enumerate(date_object_candidate): + if char in _punctuations: + if is_previous_punctuation: + double_punctuation_split = [ + text[: index - 1], + text[index - 1:], + ] + reduced_text_candidate = " ".join(double_punctuation_split) + break + is_previous_punctuation = True + else: + is_previous_punctuation = False + + if reduced_text_candidate: + reduced_text_candidate = reduced_text_candidate.strip(" .,:()[]-'") + + if (deep_search or secondary_split_made) and not ( + text == reduced_text_candidate and is_recursion_call + ): + if reduced_text_candidate and len(reduced_text_candidate) > 2: + returnable_objects = _joint_parse( + text=reduced_text_candidate, + parser=parser, + data_carry=returnable_objects, + is_recursion_call=True, + ) - parser = DateDataParser(languages=languages, settings=settings) - parsed, substrings = self.parse_found_objects(parser=parser, to_parse=to_parse, - original=original, translated=translated, settings=settings) - parser._settings = Settings() - return list(zip(substrings, [i[0]['date_obj'] for i in parsed])) + return returnable_objects class DateSearchWithDetection: """ - Class which executes language detection of string in a natural language, translation of a given string, - search of substrings which represent date and/or time and parsing of these substrings. + Class which handles language detection, translation and subsequent generic parsing of + string representing date and/or time. 
+ :return: A date search instance """ + def __init__(self): - self.loader = LocaleDataLoader() - self.available_language_map = self.loader.get_locale_map() - self.search = _ExactLanguageSearch(self.loader) + self.search_languages = SearchLanguages() @apply_settings - def detect_language(self, text, languages, settings=None, detect_languages_function=None): - if detect_languages_function and not languages: - detected_languages = detect_languages_function( - text, confidence_threshold=settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD - ) - detected_languages = map_languages(detected_languages) or settings.DEFAULT_LANGUAGES - return detected_languages[0] if detected_languages else None - - if isinstance(languages, (list, tuple, Set)): - if all([language in self.available_language_map for language in languages]): - languages = [self.available_language_map[language] for language in languages] - else: - unsupported_languages = set(languages) - set(self.available_language_map.keys()) - raise ValueError("Unknown language(s): %s" % ', '.join(map(repr, unsupported_languages))) - elif languages is not None: - raise TypeError("languages argument must be a list (%r given)" % type(languages)) - - if languages: - self.language_detector = FullTextLanguageDetector(languages=languages) - else: - self.language_detector = FullTextLanguageDetector(list(self.available_language_map.values())) + def search_parse( + self, + text, + languages, + settings, + limit_date_search_results=None, + make_joints_parse=True, + deep_search=True, + accurate_return_text=False, + ): - detected_language = self.language_detector._best_language(text) or ( - settings.DEFAULT_LANGUAGES[0] if settings.DEFAULT_LANGUAGES else None - ) - return detected_language - - @apply_settings - def search_dates(self, text, languages=None, settings=None, detect_languages_function=None): """ - Find all substrings of the given string which represent date and/or time and parse them. 
+ Search and parse strings representing date and/or time in recognizable text. + Supports parsing multiple languages and timezones. :param text: - A string in a natural language which may contain date and/or time expressions. + A string containing dates. :type text: str :param languages: @@ -215,9 +188,25 @@ def search_dates(self, text, languages=None, settings=None, detect_languages_fun :type languages: list :param settings: - Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. + Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. :type settings: dict + :param limit_date_search_results: + An int which sets the maximum number of results to be returned. + :type limit_date_search_results: int + + :param make_joints_parse: + If True, the ``_joint_parse`` function is used. Default: True + :type make_joints_parse: bool + + :param deep_search: + Indicates if we want to deep search the text for date and/or time. Default: True + :type deep_search: bool + + :param accurate_return_text: + Indicates if we want accurate text containing the date and/or time. Default: False + :type accurate_return_text: bool + :param detect_languages_function: A function for language detection that takes as input a `text` and a `confidence_threshold`, returns a list of detected language codes. 
@@ -234,10 +223,69 @@ def search_dates(self, text, languages=None, settings=None, detect_languages_fun check_settings(settings) - language_shortname = self.detect_language( + returnable_objects = [] + parser = DateDataParser(languages=[languages], settings=settings) + translated, original = self.search_languages.translate_objects( + languages, text, settings + ) + + for index, original_object in enumerate(original): + if limit_date_search_results and returnable_objects: + if len(returnable_objects) == limit_date_search_results: + break + + if not len(original_object) > 2: + continue + + lowered_word_list = original_object.lower().split() + if any(drop_word in lowered_word_list for drop_word in _drop_words): + continue + + if not settings.RELATIVE_BASE: + relative_base = _get_relative_base(already_parsed=returnable_objects) + if relative_base: + parser._settings.RELATIVE_BASE = relative_base + + if make_joints_parse: + joint_based_search_dates = _joint_parse( + text=original_object, + parser=parser, + translated=translated[index], + deep_search=deep_search, + accurate_return_text=accurate_return_text, + ) + if joint_based_search_dates: + returnable_objects.extend(joint_based_search_dates) + else: + parsed_date_object = parser.get_date_data(original_object) + if parsed_date_object.date_obj: + returnable_objects.append( + ( + original_object.strip(" .,:()[]-'"), + parsed_date_object.date_obj, + ) + ) + + parser._settings = Settings() + return returnable_objects + + @apply_settings + def search_dates( + self, text, languages=None, limit_date_search_results=None, settings=None, detect_languages_function=None + ): + + languages = self.search_languages.detect_language( text=text, languages=languages, settings=settings, detect_languages_function=detect_languages_function ) - if not language_shortname: - return {'Language': None, 'Dates': None} - return {'Language': language_shortname, 'Dates': self.search.search_parse(language_shortname, text, - settings=settings)} + + if 
not languages: + return {"Language": None, "Dates": None} + return { + "Language": languages, + "Dates": self.search_parse( + text=text, + languages=languages, + settings=settings, + limit_date_search_results=limit_date_search_results, + ), + } diff --git a/dateparser_scripts/update_supported_languages_and_locales.py b/dateparser_scripts/update_supported_languages_and_locales.py old mode 100755 new mode 100644 diff --git a/docs/conf.py b/docs/conf.py old mode 100755 new mode 100644 diff --git a/tests/test_search.py b/tests/test_search.py index 1ea7b7bff..a2b08fdeb 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -1,19 +1,20 @@ from parameterized import parameterized, param +import pytest +import pytz from tests import BaseTestCase from dateparser.timezone_parser import StaticTzInfo from dateparser.search.search import DateSearchWithDetection -from dateparser.search import search_dates +from dateparser.search import search_dates, search_first_date from dateparser.conf import Settings, apply_settings from dateparser_data.settings import default_parsers import datetime -import pytz class TestTranslateSearch(BaseTestCase): def setUp(self): super().setUp() - self.search_with_detection = DateSearchWithDetection() - self.exact_language_search = self.search_with_detection.search + self.search_dates = DateSearchWithDetection() + self.exact_language_search = self.search_dates.search_languages def run_search_dates_function_invalid_languages(self, text, languages, error_type): try: @@ -219,7 +220,7 @@ def check_error_message(self, message): param('sv', "fredag, 03 september 2014"), ]) def test_search_date_string(self, shortname, datetime_string): - result = self.exact_language_search.search(shortname, datetime_string, settings=Settings())[1][0] + result = self.exact_language_search.translate_objects(shortname, datetime_string, settings=Settings())[1][0] self.assertEqual(result, datetime_string) @parameterized.expand([ @@ -444,8 +445,8 @@ def 
test_search_date_string(self, shortname, datetime_string): settings={'RELATIVE_BASE': datetime.datetime(2000, 1, 1)}), ]) @apply_settings - def test_search_and_parse(self, shortname, string, expected, settings=None): - result = self.exact_language_search.search_parse(shortname, string, settings=settings) + def test_relative_base_setting(self, shortname, string, expected, settings=None): + result = self.search_dates.search_parse(string, shortname, settings=settings) self.assertEqual(result, expected) @parameterized.expand([ @@ -459,7 +460,8 @@ def test_search_and_parse(self, shortname, string, expected, settings=None): 2014, datetime.datetime.utcnow().month, datetime.datetime.utcnow().day, 0, 0) ), ('October', datetime.datetime(2014, 10, datetime.datetime.utcnow().day, 0, 0)), - ('Friday, 21', datetime.datetime(2014, 10, 21, 0, 0))]), + ('Friday, 21', datetime.datetime(2014, datetime.datetime.utcnow().month, 21, 0, 0))]), + param('en', """May 2020 June 2020 2023 @@ -474,7 +476,7 @@ def test_search_and_parse(self, shortname, string, expected, settings=None): ('June 5 am utc', datetime.datetime(2023, 6, 5, 0, 0, tzinfo=pytz.utc)), ('June 23th 5 pm EST', datetime.datetime(2023, 6, 23, 17, 0, tzinfo=pytz.timezone("EST"))), ('May 31', datetime.datetime(2023, 5, 31, 0, 0)), - ('8am UTC', datetime.datetime(2023, 8, 31, 0, 0, tzinfo=pytz.utc))]), + ('8am UTC', datetime.datetime(2023, 8, 31, 0, 0, tzinfo=pytz.utc))], xfail=True), # Russian param('ru', '19 марта 2001 был хороший день. 20 марта тоже был хороший день. 
21 марта был отличный день.', @@ -511,8 +513,10 @@ def test_search_and_parse(self, shortname, string, expected, settings=None): ('tháng 9', datetime.datetime(1940, 9, 1, 0, 0))]) ]) @apply_settings - def test_relative_base_setting(self, shortname, string, expected, settings=None): - result = self.exact_language_search.search_parse(shortname, string, settings=settings) + def test_relative_base(self, shortname, string, expected, settings=None, xfail=False): + if xfail: + pytest.xfail() + result = self.search_dates.search_parse(string, shortname, settings=settings) self.assertEqual(result, expected) @parameterized.expand([ @@ -531,6 +535,9 @@ def test_relative_base_setting(self, shortname, string, expected, settings=None) [('July 13th 2014', datetime.datetime(2014, 7, 13, 0, 0)), ('July 14th 2014', datetime.datetime(2014, 7, 14, 0, 0))]), param('en', 'July 13th 2014 July 14th', + [('July 13th 2014', datetime.datetime(2014, 7, 13, 0, 0)), + ('July 14th', datetime.datetime(2014, 7, 14, 0, 0))], xfail=True), + param('en', 'July 13th 2014. July 14th', [('July 13th 2014', datetime.datetime(2014, 7, 13, 0, 0)), ('July 14th', datetime.datetime(2014, 7, 14, 0, 0))]), param('en', 'July 13th, 2014 July 14th, 2014', @@ -555,15 +562,17 @@ def test_relative_base_setting(self, shortname, string, expected, settings=None) )]), # German - param('de', 'Verteidiger der Stadt kapitulierten am 2. Mai 1945. Am 8. Mai 1945 (VE-Day) trat ' + param('de', 'Verteidiger der Stadt kapitulierten am 2 Mai 1945. Am 8 Mai 1945 (VE-Day) trat ' 'bedingungslose Kapitulation der Wehrmacht in Kraft', - [('am 2. Mai 1945', datetime.datetime(1945, 5, 2, 0, 0)), - ('Am 8. 
Mai 1945', datetime.datetime(1945, 5, 8, 0, 0))]), + [('2 Mai 1945', datetime.datetime(1945, 5, 2, 0, 0)), + ('8 Mai 1945', datetime.datetime(1945, 5, 8, 0, 0))]), ]) @apply_settings - def test_splitting_of_not_parsed(self, shortname, string, expected, settings=None): - result = self.exact_language_search.search_parse(shortname, string, settings=settings) + def test_splitting_of_not_parsed(self, shortname, string, expected, settings=None, xfail=False): + if xfail: + pytest.xfail() + result = search_dates(string, [shortname], settings=settings) self.assertEqual(result, expected) @parameterized.expand([ @@ -685,7 +694,7 @@ def test_splitting_of_not_parsed(self, shortname, string, expected, settings=Non param('en', '2007'), ]) def test_detection(self, shortname, text): - result = self.search_with_detection.detect_language(text, languages=None) + result = self.exact_language_search.detect_language(text, languages=None) self.assertEqual(result, shortname) @parameterized.expand([ @@ -701,12 +710,14 @@ def test_detection(self, shortname, text): settings={'RELATIVE_BASE': datetime.datetime(2000, 1, 1)}, expected=[('Em outubro de 1936', datetime.datetime(1936, 10, 1, 0, 0))]), + # xfail - "20 марта, 21" and "марта" are parsed instead of "20 марта" and "21 марта" param(text='19 марта 2001, 20 марта, 21 марта был отличный день.', languages=['en', 'ru'], settings=None, expected=[('19 марта 2001', datetime.datetime(2001, 3, 19, 0, 0)), ('20 марта', datetime.datetime(2001, 3, 20, 0, 0)), - ('21 марта', datetime.datetime(2001, 3, 21, 0, 0))]), + ('21 марта', datetime.datetime(2001, 3, 21, 0, 0))], + xfail=True), # Dates not found param(text='', @@ -726,10 +737,14 @@ def test_detection(self, shortname, text): settings=None, expected=[('DECEMBER 21 19', datetime.datetime(2019, 12, 21, 0, 0))] ), + + # xfail - "08 11 58" is parsed as a datetime object by dateparser.parse param(text='bonjour, pouvez vous me joindre svp par telephone 08 11 58 54 41', languages=None, 
settings={'STRICT_PARSING': True}, - expected=None), + expected=None, + xfail=True), + param(text="a Americ", languages=None, settings=None, @@ -744,8 +759,44 @@ def test_detection(self, shortname, text): languages=['en'], settings=None, expected=[('9/3/2017', datetime.datetime(2017, 9, 3, 0, 0))]), + + # Test dates with periods, i.e. "." + param(text="12.12.2000", + languages=None, + settings=None, + expected=[('12.12.2000', datetime.datetime(2000, 12, 12, 0, 0))]), + param(text="1973.02.16", + languages=None, + settings=None, + expected=[('1973.02.16', datetime.datetime(1973, 2, 16, 0, 0))]), + param(text="26.09.2019", + languages=None, + settings=None, + expected=[('26.09.2019', datetime.datetime(2019, 9, 26, 0, 0))]), + param(text="test 13.07.2016 test", + languages=None, + settings=None, + expected=[('13.07.2016', datetime.datetime(2016, 7, 13, 0, 0))]), + param(text="Date:22.06.2020", + languages=["de"], + settings={'DATE_ORDER': 'DMY'}, + expected=[('22.06.2020', datetime.datetime(2020, 6, 22, 0, 0))]), + param(text="Date :22.06.2020", + languages=["de"], + settings={'DATE_ORDER': 'DMY'}, + expected=[('22.06.2020', datetime.datetime(2020, 6, 22, 0, 0))]), + param(text="Hello-Date 26.09.2019", + languages=["de", "fr"], + settings={'DATE_ORDER': 'DMY'}, + expected=[('26.09.2019', datetime.datetime(2019, 9, 26, 0, 0))]), + param(text="Year of the Four Emperors", + languages=['en'], + settings=None, + expected=None), ]) - def test_date_search_function(self, text, languages, settings, expected): + def test_date_search_function(self, text, languages, settings, expected, xfail=False): + if xfail: + pytest.xfail() result = search_dates(text, languages=languages, settings=settings) self.assertEqual(result, expected) @@ -782,3 +833,58 @@ def test_date_search_function_invalid_languages_type(self, text, languages): def test_date_search_function_invalid_language_code(self, text, languages): self.run_search_dates_function_invalid_languages(text=text, languages=languages, 
error_type=ValueError) self.check_error_message("Unknown language(s): 'unknown language code'") + + @parameterized.expand([ + param(text="15 de outubro de 1936", + shortname='pt', + expected=[ + ("15 de outubro de 1936", datetime.datetime(1936, 10, 15, 0, 0)) + ]), + ]) + def test_search_date_without_make_joints_parse( + self, text, shortname, expected, settings=None + ): + result = self.search_dates.search_parse(text, shortname, settings=settings, make_joints_parse=False) + self.assertEqual(result, expected) + + @parameterized.expand([ + param(text="January 3, 2017 - February 1st", + expected=('January 3, 2017', datetime.datetime(2017, 1, 3, 0, 0))), + ]) + def test_search_first_date( + self, text, expected + ): + result = search_first_date(text) + self.assertEqual(result, expected) + + @parameterized.expand([ + param(text="15 de outubro de 1936", + add_detected_language=True, + expected=("15 de outubro de 1936", datetime.datetime(1936, 10, 15, 0, 0), "pt")), + ]) + def test_search_first_date_returning_detected_languages_if_requested( + self, text, add_detected_language, expected + ): + result = search_first_date(text, add_detected_language=add_detected_language) + self.assertEqual(result, expected) + + @parameterized.expand([ + param('pt', 'Em outubro de 1936, Alemanha e Itália formaram o Eixo Roma-Berlim.', + [('Em outubro de 1936', datetime.datetime(1936, 10, datetime.datetime.utcnow().day, 0, 0))], xfail=True), + ]) + @apply_settings + def test_search_date_accurate_return_text(self, shortname, string, expected, settings=None, xfail=False): + if xfail: + pytest.xfail() + result = self.search_dates.search_parse(string, shortname, settings=settings, accurate_return_text=True) + self.assertEqual(result, expected) + + @parameterized.expand([ + param('2021-08-04T14:21:37+05:30', + [('2021-08-04T14:21:37', datetime.datetime(2021, 8, 4, 14, 21, 37)), + ('05:30', datetime.datetime(2021, 8, 4, 5, 30))]), + ]) + @apply_settings + def 
test_search_date_is_previous_punctuation(self, string, expected, settings=None): + result = search_dates(string) + self.assertEqual(result, expected)