diff --git a/dateparser/__init__.py b/dateparser/__init__.py index 05cb9cc02..fe259902d 100644 --- a/dateparser/__init__.py +++ b/dateparser/__init__.py @@ -7,7 +7,8 @@ @apply_settings -def parse(date_string, date_formats=None, languages=None, locales=None, region=None, settings=None): +def parse(date_string, date_formats=None, languages=None, locales=None, + region=None, settings=None, detect_languages_function=None): """Parse date and time from given date string. :param date_string: @@ -39,6 +40,12 @@ def parse(date_string, date_formats=None, languages=None, locales=None, region=N Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. :type settings: dict + :param detect_languages_function: + A function for language detection that takes as input a string (the `date_string`) and + a `confidence_threshold`, and returns a list of detected language codes. + Note: this function is only used if ``languages`` and ``locales`` are not provided. + :type detect_languages_function: function + :return: Returns :class:`datetime ` representing parsed date if successful, else returns None :rtype: :class:`datetime `. 
:raises: @@ -47,9 +54,9 @@ def parse(date_string, date_formats=None, languages=None, locales=None, region=N """ parser = _default_parser - if languages or locales or region or not settings._default: + if languages or locales or region or detect_languages_function or not settings._default: parser = DateDataParser(languages=languages, locales=locales, - region=region, settings=settings) + region=region, settings=settings, detect_languages_function=detect_languages_function) data = parser.get_date_data(date_string, date_formats) diff --git a/dateparser/conf.py b/dateparser/conf.py index c14374a3f..4bc6b281a 100644 --- a/dateparser/conf.py +++ b/dateparser/conf.py @@ -2,6 +2,7 @@ from datetime import datetime from functools import wraps +from dateparser.data.languages_info import language_order from .parser import date_order_chart from .utils import registry @@ -25,6 +26,8 @@ class Settings: * `NORMALIZE` * `RETURN_TIME_AS_PERIOD` * `PARSERS` + * `DEFAULT_LANGUAGES` + * `LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD` """ _default = True @@ -129,6 +132,28 @@ def _check_parsers(setting_name, setting_value): _check_repeated_values(setting_name, setting_value) +def _check_default_languages(setting_name, setting_value): + unsupported_languages = set(setting_value) - set(language_order) + if unsupported_languages: + raise SettingValidationError( + "Found invalid languages in the '{}' setting: {}".format( + setting_name, ', '.join(map(repr, unsupported_languages)) + ) + ) + _check_repeated_values(setting_name, setting_value) + + +def _check_between_0_and_1(setting_name, setting_value): + is_valid = 0 <= setting_value <= 1 + if not is_valid: + raise SettingValidationError( + '{} is not a valid value for {}. It can take values between 0 and ' + '1.'.format( + setting_value, setting_name, + ) + ) + + def check_settings(settings): """ Check if provided settings are valid, if not it raises `SettingValidationError`. 
@@ -193,6 +218,14 @@ def check_settings(settings): 'PREFER_LOCALE_DATE_ORDER': { 'type': bool }, + 'DEFAULT_LANGUAGES': { + 'type': list, + 'extra_check': _check_default_languages + }, + 'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': { + 'type': float, + 'extra_check': _check_between_0_and_1 + }, } modified_settings = settings._mod_settings # check only modified settings diff --git a/dateparser/custom_language_detection/__init__.py b/dateparser/custom_language_detection/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dateparser/custom_language_detection/fasttext.py b/dateparser/custom_language_detection/fasttext.py new file mode 100644 index 000000000..2adec5add --- /dev/null +++ b/dateparser/custom_language_detection/fasttext.py @@ -0,0 +1,45 @@ +import os + +import fasttext + +from dateparser_cli.fasttext_manager import fasttext_downloader +from dateparser_cli.utils import dateparser_model_home, create_data_model_home +from dateparser_cli.exceptions import FastTextModelNotFoundException + + +_supported_models = ["large.bin", "small.bin"] +_DEFAULT_MODEL = "small" + + +class _FastTextCache: + model = None + + +def _load_fasttext_model(): + if _FastTextCache.model: + return _FastTextCache.model + create_data_model_home() + downloaded_models = [ + file for file in os.listdir(dateparser_model_home) + if file in _supported_models + ] + if not downloaded_models: + fasttext_downloader(_DEFAULT_MODEL) + return _load_fasttext_model() + model_path = os.path.join(dateparser_model_home, downloaded_models[0]) + if not os.path.isfile(model_path): + raise FastTextModelNotFoundException('Fasttext model file not found') + _FastTextCache.model = fasttext.load_model(model_path) + return _FastTextCache.model + + +def detect_languages(text, confidence_threshold): + _language_parser = _load_fasttext_model() + text = text.replace('\n', ' ').replace('\r', '') + language_codes = [] + parser_data = _language_parser.predict(text) + for idx, language_probability in 
enumerate(parser_data[1]): + if language_probability > confidence_threshold: + language_code = parser_data[0][idx].replace("__label__", "") + language_codes.append(language_code) + return language_codes diff --git a/dateparser/custom_language_detection/langdetect.py b/dateparser/custom_language_detection/langdetect.py new file mode 100644 index 000000000..5ef8f6220 --- /dev/null +++ b/dateparser/custom_language_detection/langdetect.py @@ -0,0 +1,37 @@ +import langdetect + + +# The below _Factory is set to prevent setting global state of the library +# but still get consistent results. +# Refer : https://github.com/Mimino666/langdetect + +class _Factory: + data = None + + +def _init_factory(): + if _Factory.data is None: + _Factory.data = langdetect.detector_factory.DetectorFactory() + _Factory.data.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY) + _Factory.data.seed = 0 + + +def _get_language_probablities(text): + _init_factory() + detector = _Factory.data.create() + detector.append(text) + return detector.get_probabilities() + + +def detect_languages(text, confidence_threshold): + language_codes = [] + try: + parser_data = _get_language_probablities(text) + for language_candidate in parser_data: + if language_candidate.prob > confidence_threshold: + language_codes.append(language_candidate.lang) + except langdetect.lang_detect_exception.LangDetectException: + # This exception can be produced with empty strings or inputs without letters like `10-10-2021`. + # As this could be really common, we ignore them. 
+ pass + return language_codes diff --git a/dateparser/custom_language_detection/language_mapping.py b/dateparser/custom_language_detection/language_mapping.py new file mode 100644 index 000000000..a76030828 --- /dev/null +++ b/dateparser/custom_language_detection/language_mapping.py @@ -0,0 +1,18 @@ +from dateparser.data.languages_info import language_map + + +def map_languages(language_codes): + """ + Returns the candidates from the supported languages codes. + :param language_codes: + A list of language codes, e.g. ['en', 'es'] in ISO 639 Standard. + :type language_codes: list + :return: Returns list[str] representing supported languages + :rtype: list[str] + """ + return [ + language_code + for language in language_codes + if language in language_map + for language_code in language_map[language] + ] diff --git a/dateparser/data/languages_info.py b/dateparser/data/languages_info.py index 14047cf21..02ab3df43 100644 --- a/dateparser/data/languages_info.py +++ b/dateparser/data/languages_info.py @@ -206,6 +206,594 @@ "tl" ] +language_map = { + "af": [ + "af" + ], + "agq": [ + "agq" + ], + "ak": [ + "ak" + ], + "am": [ + "am" + ], + "ar": [ + "ar" + ], + "as": [ + "as" + ], + "asa": [ + "asa" + ], + "ast": [ + "ast" + ], + "az": [ + "az", + "az-Cyrl", + "az-Latn" + ], + "bas": [ + "bas" + ], + "be": [ + "be" + ], + "bem": [ + "bem" + ], + "bez": [ + "bez" + ], + "bg": [ + "bg" + ], + "bm": [ + "bm" + ], + "bn": [ + "bn" + ], + "bo": [ + "bo" + ], + "br": [ + "br" + ], + "brx": [ + "brx" + ], + "bs": [ + "bs", + "bs-Cyrl", + "bs-Latn" + ], + "ca": [ + "ca" + ], + "ce": [ + "ce" + ], + "cgg": [ + "cgg" + ], + "chr": [ + "chr" + ], + "ckb": [ + "ckb" + ], + "cs": [ + "cs" + ], + "cy": [ + "cy" + ], + "da": [ + "da" + ], + "dav": [ + "dav" + ], + "de": [ + "de" + ], + "dje": [ + "dje" + ], + "dsb": [ + "dsb" + ], + "dua": [ + "dua" + ], + "dyo": [ + "dyo" + ], + "dz": [ + "dz" + ], + "ebu": [ + "ebu" + ], + "ee": [ + "ee" + ], + "el": [ + "el" + ], + "en": [ + "en" + 
], + "eo": [ + "eo" + ], + "es": [ + "es" + ], + "et": [ + "et" + ], + "eu": [ + "eu" + ], + "ewo": [ + "ewo" + ], + "fa": [ + "fa" + ], + "ff": [ + "ff" + ], + "fi": [ + "fi" + ], + "fil": [ + "fil" + ], + "fo": [ + "fo" + ], + "fr": [ + "fr" + ], + "fur": [ + "fur" + ], + "fy": [ + "fy" + ], + "ga": [ + "ga" + ], + "gd": [ + "gd" + ], + "gl": [ + "gl" + ], + "gsw": [ + "gsw" + ], + "gu": [ + "gu" + ], + "guz": [ + "guz" + ], + "gv": [ + "gv" + ], + "ha": [ + "ha" + ], + "haw": [ + "haw" + ], + "he": [ + "he" + ], + "hi": [ + "hi" + ], + "hr": [ + "hr" + ], + "hsb": [ + "hsb" + ], + "hu": [ + "hu" + ], + "hy": [ + "hy" + ], + "id": [ + "id" + ], + "ig": [ + "ig" + ], + "ii": [ + "ii" + ], + "is": [ + "is" + ], + "it": [ + "it" + ], + "ja": [ + "ja" + ], + "jgo": [ + "jgo" + ], + "jmc": [ + "jmc" + ], + "ka": [ + "ka" + ], + "kab": [ + "kab" + ], + "kam": [ + "kam" + ], + "kde": [ + "kde" + ], + "kea": [ + "kea" + ], + "khq": [ + "khq" + ], + "ki": [ + "ki" + ], + "kk": [ + "kk" + ], + "kl": [ + "kl" + ], + "kln": [ + "kln" + ], + "km": [ + "km" + ], + "kn": [ + "kn" + ], + "ko": [ + "ko" + ], + "kok": [ + "kok" + ], + "ks": [ + "ks" + ], + "ksb": [ + "ksb" + ], + "ksf": [ + "ksf" + ], + "ksh": [ + "ksh" + ], + "kw": [ + "kw" + ], + "ky": [ + "ky" + ], + "lag": [ + "lag" + ], + "lb": [ + "lb" + ], + "lg": [ + "lg" + ], + "lkt": [ + "lkt" + ], + "ln": [ + "ln" + ], + "lo": [ + "lo" + ], + "lrc": [ + "lrc" + ], + "lt": [ + "lt" + ], + "lu": [ + "lu" + ], + "luo": [ + "luo" + ], + "luy": [ + "luy" + ], + "lv": [ + "lv" + ], + "mas": [ + "mas" + ], + "mer": [ + "mer" + ], + "mfe": [ + "mfe" + ], + "mg": [ + "mg" + ], + "mgh": [ + "mgh" + ], + "mgo": [ + "mgo" + ], + "mk": [ + "mk" + ], + "ml": [ + "ml" + ], + "mn": [ + "mn" + ], + "mr": [ + "mr" + ], + "ms": [ + "ms" + ], + "mt": [ + "mt" + ], + "mua": [ + "mua" + ], + "my": [ + "my" + ], + "mzn": [ + "mzn" + ], + "naq": [ + "naq" + ], + "nb": [ + "nb" + ], + "nd": [ + "nd" + ], + "ne": [ + "ne" + ], + "nl": [ + "nl" + 
], + "nmg": [ + "nmg" + ], + "nn": [ + "nn" + ], + "nnh": [ + "nnh" + ], + "nus": [ + "nus" + ], + "nyn": [ + "nyn" + ], + "om": [ + "om" + ], + "or": [ + "or" + ], + "os": [ + "os" + ], + "pa": [ + "pa", + "pa-Arab", + "pa-Guru" + ], + "pl": [ + "pl" + ], + "ps": [ + "ps" + ], + "pt": [ + "pt" + ], + "qu": [ + "qu" + ], + "rm": [ + "rm" + ], + "rn": [ + "rn" + ], + "ro": [ + "ro" + ], + "rof": [ + "rof" + ], + "ru": [ + "ru" + ], + "rw": [ + "rw" + ], + "rwk": [ + "rwk" + ], + "sah": [ + "sah" + ], + "saq": [ + "saq" + ], + "sbp": [ + "sbp" + ], + "se": [ + "se" + ], + "seh": [ + "seh" + ], + "ses": [ + "ses" + ], + "sg": [ + "sg" + ], + "shi": [ + "shi", + "shi-Latn", + "shi-Tfng" + ], + "si": [ + "si" + ], + "sk": [ + "sk" + ], + "sl": [ + "sl" + ], + "smn": [ + "smn" + ], + "sn": [ + "sn" + ], + "so": [ + "so" + ], + "sq": [ + "sq" + ], + "sr": [ + "sr", + "sr-Cyrl", + "sr-Latn" + ], + "sv": [ + "sv" + ], + "sw": [ + "sw" + ], + "ta": [ + "ta" + ], + "te": [ + "te" + ], + "teo": [ + "teo" + ], + "th": [ + "th" + ], + "ti": [ + "ti" + ], + "tl": [ + "tl" + ], + "to": [ + "to" + ], + "tr": [ + "tr" + ], + "twq": [ + "twq" + ], + "tzm": [ + "tzm" + ], + "ug": [ + "ug" + ], + "uk": [ + "uk" + ], + "ur": [ + "ur" + ], + "uz": [ + "uz", + "uz-Arab", + "uz-Cyrl", + "uz-Latn" + ], + "vi": [ + "vi" + ], + "vun": [ + "vun" + ], + "wae": [ + "wae" + ], + "xog": [ + "xog" + ], + "yav": [ + "yav" + ], + "yi": [ + "yi" + ], + "yo": [ + "yo" + ], + "yue": [ + "yue" + ], + "zgh": [ + "zgh" + ], + "zh": [ + "zh", + "zh-Hans", + "zh-Hant" + ], + "zu": [ + "zu" + ] +} + language_locale_dict = { "en": [ "en-001", diff --git a/dateparser/date.py b/dateparser/date.py index 54efe5351..b49096e18 100644 --- a/dateparser/date.py +++ b/dateparser/date.py @@ -14,6 +14,7 @@ from dateparser.timezone_parser import pop_tz_offset_from_string from dateparser.utils import apply_timezone_from_settings, \ set_correct_day_from_settings +from dateparser.custom_language_detection.language_mapping 
import map_languages APOSTROPHE_LOOK_ALIKE_CHARS = [ '\N{RIGHT SINGLE QUOTATION MARK}', # '\u2019' @@ -321,6 +322,12 @@ class DateDataParser: Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. :type settings: dict + :param detect_languages_function: + A function for language detection that takes as input a `text` and a `confidence_threshold`, + and returns a list of detected language codes. + Note: this function is only used if ``languages`` and ``locales`` are not provided. + :type detect_languages_function: function + :return: A parser instance :raises: @@ -332,7 +339,7 @@ class DateDataParser: @apply_settings def __init__(self, languages=None, locales=None, region=None, try_previous_locales=False, - use_given_order=False, settings=None): + use_given_order=False, settings=None, detect_languages_function=None): if languages is not None and not isinstance(languages, (list, tuple, Set)): raise TypeError("languages argument must be a list (%r given)" % type(languages)) @@ -359,9 +366,10 @@ def __init__(self, languages=None, locales=None, region=None, try_previous_local self._settings = settings self.try_previous_locales = try_previous_locales self.use_given_order = use_given_order - self.languages = languages + self.languages = list(languages) if languages else None self.locales = locales self.region = region + self.detect_languages_function = detect_languages_function self.previous_locales = collections.OrderedDict() def get_date_data(self, date_string, date_formats=None): @@ -461,6 +469,13 @@ def date_strings(): if self._is_applicable_locale(locale, s): yield locale + if self.detect_languages_function and not self.languages and not self.locales: + detected_languages = self.detect_languages_function( + text=date_string, confidence_threshold=self._settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD + ) + + self.languages = map_languages(detected_languages) + for locale in self._get_locale_loader().get_locales( 
languages=self.languages, locales=self.locales, region=self.region, use_given_order=self.use_given_order): @@ -468,6 +483,13 @@ def date_strings(): if self._is_applicable_locale(locale, s): yield locale + if self._settings.DEFAULT_LANGUAGES: + for locale in self._get_locale_loader().get_locales( + languages=self._settings.DEFAULT_LANGUAGES, locales=None, + region=self.region, use_given_order=self.use_given_order + ): + yield locale + def _is_applicable_locale(self, locale, date_string): return locale.is_applicable( date_string, diff --git a/dateparser/search/__init__.py b/dateparser/search/__init__.py index fe6306606..67205d4b0 100644 --- a/dateparser/search/__init__.py +++ b/dateparser/search/__init__.py @@ -4,7 +4,7 @@ _search_with_detection = DateSearchWithDetection() -def search_dates(text, languages=None, settings=None, add_detected_language=False): +def search_dates(text, languages=None, settings=None, add_detected_language=False, detect_languages_function=None): """Find all substrings of the given string which represent date and/or time and parse them. :param text: @@ -17,17 +17,23 @@ def search_dates(text, languages=None, settings=None, add_detected_language=Fals :type languages: list :param settings: - Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. + Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. :type settings: dict :param add_detected_language: - Indicates if we want the detected language returned in the tuple. + Indicates if we want the detected language returned in the tuple. :type add_detected_language: bool + :param detect_languages_function: + A function for language detection that takes as input a `text` and a `confidence_threshold`, + and returns a list of detected language codes. + Note: detect_languages_function is only used if `languages` are not provided. 
+ :type detect_languages_function: function + :return: Returns list of tuples containing: - substrings representing date and/or time, corresponding :mod:`datetime.datetime` - object and detected language if *add_detected_language* is True. - Returns None if no dates that can be parsed are found. + substrings representing date and/or time, corresponding :mod:`datetime.datetime` + object and detected language if *add_detected_language* is True. + Returns None if no dates that can be parsed are found. :rtype: list :raises: ValueError - Unknown Language @@ -47,7 +53,7 @@ def search_dates(text, languages=None, settings=None, add_detected_language=Fals """ result = _search_with_detection.search_dates( - text=text, languages=languages, settings=settings + text=text, languages=languages, settings=settings, detect_languages_function=detect_languages_function ) dates = result.get('Dates') if dates: diff --git a/dateparser/search/search.py b/dateparser/search/search.py index aa71c7299..3fc657810 100644 --- a/dateparser/search/search.py +++ b/dateparser/search/search.py @@ -1,9 +1,10 @@ from collections.abc import Set from dateparser.languages.loader import LocaleDataLoader -from dateparser.conf import apply_settings, Settings +from dateparser.conf import apply_settings, check_settings, Settings from dateparser.date import DateDataParser from dateparser.search.text_detection import FullTextLanguageDetector +from dateparser.custom_language_detection.language_mapping import map_languages import regex as re @@ -171,15 +172,21 @@ def __init__(self): self.available_language_map = self.loader.get_locale_map() self.search = _ExactLanguageSearch(self.loader) - def detect_language(self, text, languages): - if isinstance(languages, (list, tuple, Set)): + @apply_settings + def detect_language(self, text, languages, settings=None, detect_languages_function=None): + if detect_languages_function and not languages: + detected_languages = detect_languages_function( + text, 
confidence_threshold=settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD + ) + detected_languages = map_languages(detected_languages) or settings.DEFAULT_LANGUAGES + return detected_languages[0] if detected_languages else None + if isinstance(languages, (list, tuple, Set)): if all([language in self.available_language_map for language in languages]): languages = [self.available_language_map[language] for language in languages] else: unsupported_languages = set(languages) - set(self.available_language_map.keys()) - raise ValueError( - "Unknown language(s): %s" % ', '.join(map(repr, unsupported_languages))) + raise ValueError("Unknown language(s): %s" % ', '.join(map(repr, unsupported_languages))) elif languages is not None: raise TypeError("languages argument must be a list (%r given)" % type(languages)) @@ -188,24 +195,34 @@ def detect_language(self, text, languages): else: self.language_detector = FullTextLanguageDetector(list(self.available_language_map.values())) - return self.language_detector._best_language(text) + detected_language = self.language_detector._best_language(text) or ( + settings.DEFAULT_LANGUAGES[0] if settings.DEFAULT_LANGUAGES else None + ) + return detected_language @apply_settings - def search_dates(self, text, languages=None, settings=None): + def search_dates(self, text, languages=None, settings=None, detect_languages_function=None): """ Find all substrings of the given string which represent date and/or time and parse them. :param text: A string in a natural language which may contain date and/or time expressions. :type text: str + :param languages: A list of two letters language codes.e.g. ['en', 'es']. If languages are given, it will not attempt to detect the language. :type languages: list + :param settings: Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`. 
:type settings: dict + :param detect_languages_function: + A function for language detection that takes as input a `text` and a `confidence_threshold`, + returns a list of detected language codes. + :type detect_languages_function: function + :return: a dict mapping keys to two letter language code and a list of tuples of pairs: substring representing date expressions and corresponding :mod:`datetime.datetime` object. For example: @@ -215,7 +232,11 @@ def search_dates(self, text, languages=None, settings=None): :raises: ValueError - Unknown Language """ - language_shortname = self.detect_language(text=text, languages=languages) + check_settings(settings) + + language_shortname = self.detect_language( + text=text, languages=languages, settings=settings, detect_languages_function=detect_languages_function + ) if not language_shortname: return {'Language': None, 'Dates': None} return {'Language': language_shortname, 'Dates': self.search.search_parse(language_shortname, text, diff --git a/dateparser_cli/__init__.py b/dateparser_cli/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dateparser_cli/cli.py b/dateparser_cli/cli.py new file mode 100644 index 000000000..ec96d1482 --- /dev/null +++ b/dateparser_cli/cli.py @@ -0,0 +1,34 @@ +import argparse +import logging + +from .fasttext_manager import fasttext_downloader +from .utils import clear_cache + + +def entrance(): + dateparser_argparse = argparse.ArgumentParser( + description='dateparser download manager.' + ) + dateparser_argparse.add_argument( + '--fasttext', + type=str, + help='To download a fasttext language detection models. 
Supported models are "small" and "large"' + ) + dateparser_argparse.add_argument( + '--clear', + '--clear-cache', + help='To clear all cached models', + action='store_true' + ) + + args = dateparser_argparse.parse_args() + + if args.clear: + clear_cache() + logging.info("dateparser-download: All cache deleted") + + if args.fasttext: + fasttext_downloader(args.fasttext) + + if not (args.clear or args.fasttext): + dateparser_argparse.error("dateparser-download: You need to specify the command (i.e.: --fasttext or --clear)") diff --git a/dateparser_cli/exceptions.py b/dateparser_cli/exceptions.py new file mode 100644 index 000000000..8bf38679b --- /dev/null +++ b/dateparser_cli/exceptions.py @@ -0,0 +1,2 @@ +class FastTextModelNotFoundException(Exception): + pass diff --git a/dateparser_cli/fasttext_manager.py b/dateparser_cli/fasttext_manager.py new file mode 100644 index 000000000..e7535bba5 --- /dev/null +++ b/dateparser_cli/fasttext_manager.py @@ -0,0 +1,32 @@ +from pathlib import Path +import urllib.request +import os +import logging + +from .exceptions import FastTextModelNotFoundException +from .utils import dateparser_model_home, create_data_model_home + + +def fasttext_downloader(model_name): + create_data_model_home() + models = { + "small": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz", + "large": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin" + } + if model_name not in models: + message = "dateparser-download: Couldn't find a model called \"{}\". 
Supported models are: {}".format( + model_name, ", ".join(models.keys()) + ) + raise FastTextModelNotFoundException(message) + + models_directory_path = os.path.join(dateparser_model_home, (model_name + ".bin")) + + if not Path(models_directory_path).is_file(): + model_url = models[model_name] + logging.info("dateparser-download: Downloading model \"{}\" from \"{}\"...".format(model_name, model_url)) + try: + urllib.request.urlretrieve(model_url, models_directory_path) + except urllib.error.HTTPError as e: + raise Exception("dateparser-download: Fasttext model cannot be downloaded due to HTTP error") from e + else: + logging.info("dateparser-download: The model \"{}\" is already downloaded".format(model_name)) diff --git a/dateparser_cli/utils.py b/dateparser_cli/utils.py new file mode 100644 index 000000000..057ebdcc3 --- /dev/null +++ b/dateparser_cli/utils.py @@ -0,0 +1,33 @@ +import sys +import os +from pathlib import Path + +DEFAULT_DIR_NAME = 'dateparser_models' +DEFAULT_UNIX_CACHE_DIR = '~/.cache' + +if sys.version_info < (3, 6): # python 3.5 compatibility + DEFAULT_WINDOWS_CACHE_DIR = os.path.join(str(Path.home()), "AppData", "Roaming") +else: + DEFAULT_WINDOWS_CACHE_DIR = os.path.join(Path.home(), "AppData", "Roaming") + + +if sys.platform.startswith('win'): + # For Windows: + _cache_dir = DEFAULT_WINDOWS_CACHE_DIR +else: + # UNIX & OS X: + _cache_dir = DEFAULT_UNIX_CACHE_DIR + +dateparser_model_home = os.path.expanduser( + os.path.join(_cache_dir, DEFAULT_DIR_NAME) +) + + +def create_data_model_home(): + if not os.path.isdir(dateparser_model_home): + os.mkdir(dateparser_model_home) + + +def clear_cache(*args): + for path in Path(dateparser_model_home).rglob('*.*'): + os.remove(path) diff --git a/dateparser_data/settings.py b/dateparser_data/settings.py index 8b1fdb16e..c38d10061 100644 --- a/dateparser_data/settings.py +++ b/dateparser_data/settings.py @@ -25,6 +25,10 @@ # Language detection 'SKIP_TOKENS': ['t'], 'NORMALIZE': True, + 'DEFAULT_LANGUAGES': 
[], + + # Optional language detection + 'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': 0.5, # Other settings 'RETURN_TIME_AS_PERIOD': False, diff --git a/dateparser_scripts/order_languages.py b/dateparser_scripts/order_languages.py index 42b2e0e17..ca055047d 100644 --- a/dateparser_scripts/order_languages.py +++ b/dateparser_scripts/order_languages.py @@ -92,6 +92,16 @@ def _get_language_order(language_locale_dict): return language_order +def generate_language_map(language_order): + data = {} + for lang in sorted(language_order): + if '-' not in lang: + data[lang] = [lang] + else: + data[lang.split('-')[0]].append(lang) + return data + + def main(): get_raw_data() language_locale_dict = _get_language_locale_dict() @@ -115,7 +125,12 @@ def main(): language_locale_dict_string = 'language_locale_dict = ' + json.dumps( complete_language_locale_dict, separators=(',', ': '), indent=4 ) - languages_info_string = language_order_string + '\n\n' + language_locale_dict_string + '\n' + language_map_data = generate_language_map(language_order) + language_map_data_string = 'language_map = ' + json.dumps( + language_map_data, separators=(',', ': '), indent=4 + ) + + languages_info_string = language_order_string + '\n\n' + language_map_data_string + '\n\n' + language_locale_dict_string + '\n' with open(filename, 'w') as f: f.write(languages_info_string) diff --git a/docs/custom_language_detection.rst b/docs/custom_language_detection.rst new file mode 100644 index 000000000..f05c0176b --- /dev/null +++ b/docs/custom_language_detection.rst @@ -0,0 +1,96 @@ +========================= +Custom language detection +========================= + +`dateparser` allows to customize the language detection behavior by using the ``detect_languages_function`` parameter. +It currently supports two language detection libraries out of the box: `fastText `_ +and `langdetect `_, and allows you to implement your own custom language detection. + +.. 
warning:: + + For short strings the language detection could fail, so it's highly recommended to use ``detect_languages_function`` + along with ``DEFAULT_LANGUAGES``. + +Built-in implementations +======================== + +fastText +~~~~~~~~ +Language detection with fastText. + +Import the fastText wrapper and pass it as ``detect_languages_function`` +parameter. Example:: + + >>> from dateparser.custom_language_detection.fasttext import detect_languages + >>> dateparser.parse('12/12/12', detect_languages_function=detect_languages) + +The fastText integration currently supports the large and the small models. +Find more information about `fasttext `_ models. +You can download your model of choice using ``dateparser-download``. + +Downloading small model:: + + >>> dateparser-download --fasttext small + +Downloading large model:: + + >>> dateparser-download --fasttext large + +Deleting all cached models:: + + >>> dateparser-download --clear-cache + +.. note:: + + If no model has been downloaded, the fastText wrapper downloads and uses + the small model by default. + +langdetect +~~~~~~~~~~ +Language detection with langdetect. + +Import the langdetect wrapper and pass it as ``detect_languages_function`` +parameter. Example:: + + >>> from dateparser.custom_language_detection.langdetect import detect_languages + >>> dateparser.parse('12/12/12', detect_languages_function=detect_languages) + + +.. note:: + + From some tests we did, we recommend using ``fastText`` for faster and more accurate results. + +Custom implementation +===================== + +``dateparser`` allows the integration of any library to detect languages by +wrapping that library in a function that accepts 2 parameters, ``text`` and +``confidence_threshold``, and returns a list of the detected language codes in +ISO 639 standards. 
+ + +Boilerplate wrapper for implementing custom language detection:: + + def detect_languages(text, confidence_threshold): + """ + Takes 2 parameters, `text` and `confidence_threshold`, and returns + a list of `language codes`. + + * `text` is the input string whose language needs to be detected. + + * `confidence_threshold` is a number between 0 and 1 that indicates the + minimum confidence required for language matches. + + For language detection libraries that, for each language, indicate how + confident they are that the language matches the input text, you should + filter out languages with a confidence lower than this value (adjusted, + if needed, to the confidence range of the target library). + + This value comes from the dateparser setting + `LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD`. + + The result must be a list of language codes (strings). + """ + # here you can apply your own logic + return language_codes + diff --git a/docs/index.rst b/docs/index.rst index 1e1c6ec51..0159b6adc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -49,6 +49,7 @@ Contents: installation usage settings + custom_language_detection supported_locales contributing modules diff --git a/docs/settings.rst b/docs/settings.rst index 1ae658781..4cd80d880 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -141,6 +141,26 @@ Language Detection datetime.datetime(2015, 12, 4, 0, 0) +Default Languages ++++++++++++++++++ + +``DEFAULT_LANGUAGES``: It is a ``list`` of language codes in ISO 639 that will be used as default +languages for parsing when language detection fails. e.g. ["en", "fr"]: + + >>> from dateparser import parse + >>> parse('3 de marzo de 2020', settings={'DEFAULT_LANGUAGES': ["es"]}) + +.. note:: When using this setting, these languages will be tried after trying with the detected languages with no success. It is especially useful when using the ``detect_languages_function``. 
+ +Optional language detection ++++++++++++++++++++++++++++ + +``LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD``: defaults to ``0.5``. It is a ``float`` of minimum required confidence for the custom language detection: + + >>> from dateparser import parse + >>> parse('3 de marzo de 2020', settings={'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': 0.5}, detect_languages_function=detect_languages) + + Other settings ++++++++++++++ diff --git a/setup.py b/setup.py index 524ffeb0c..c8f69889a 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,11 @@ import re from setuptools import setup, find_packages -__version__ = re.search(r"__version__.*\s*=\s*[']([^']+)[']", open('dateparser/__init__.py').read()).group(1) +__version__ = re.search(r"__version__.*\s*=\s*[']([^']+)[']", + open('dateparser/__init__.py').read()).group(1) -introduction = re.sub(r':members:.+|..\sautomodule::.+|:class:|:func:|:ref:', '', open('docs/introduction.rst').read()) +introduction = re.sub(r':members:.+|..\sautomodule::.+|:class:|:func:|:ref:', + '', open('docs/introduction.rst').read()) history = re.sub(r':mod:|:class:|:func:', '', open('HISTORY.rst').read()) test_requirements = open('tests/requirements.txt').read().splitlines() @@ -28,9 +30,14 @@ 'regex !=2019.02.19,!=2021.8.27', 'tzlocal', ], + entry_points={ + 'console_scripts': ['dateparser-download = dateparser_cli.cli:entrance'], + }, extras_require={ 'calendars:python_version<"3.6"': ['convertdate'], 'calendars:python_version>="3.6"': ['hijri-converter', 'convertdate'], + 'fasttext': ['fasttext'], + 'langdetect': ['langdetect'], }, license="BSD", zip_safe=False, diff --git a/tests/test_language_detect.py b/tests/test_language_detect.py new file mode 100644 index 000000000..a9857cf04 --- /dev/null +++ b/tests/test_language_detect.py @@ -0,0 +1,88 @@ +from datetime import datetime +from unittest.mock import Mock +import unittest + +from parameterized import parameterized, param + +from dateparser.custom_language_detection.fasttext import detect_languages 
as fast_text_detect_languages +from dateparser.custom_language_detection.langdetect import detect_languages as lang_detect_detect_languages +from dateparser import parse +from dateparser.date import DateDataParser +from dateparser.search import search_dates + +detect_languages = Mock() +detect_languages.return_value = ["en"] + + +class CustomLangDetectParserTest(unittest.TestCase): + def check_is_returned_list(self): + self.assertEqual(type(self.result), list) + + @parameterized.expand([ + param(dt_string="14 June 2020", confidence_threshold=0.0), + param(dt_string="26 July 2021", confidence_threshold=0.0) + ]) + def test_custom_language_detect_fast_text(self, dt_string, confidence_threshold): + self.result = fast_text_detect_languages(dt_string, confidence_threshold) + self.check_is_returned_list() + + @parameterized.expand([ + param(dt_string="14 June 2020", confidence_threshold=0.0), + ]) + def test_custom_language_detect_lang_detect(self, dt_string, confidence_threshold): + self.result = lang_detect_detect_languages(dt_string, confidence_threshold) + self.check_is_returned_list() + + @parameterized.expand([ + param(dt_string="10-10-2021", confidence_threshold=0.5), + ]) + def test_lang_detect_doesnt_raise_error(self, dt_string, confidence_threshold): + result = lang_detect_detect_languages(dt_string, confidence_threshold) + assert result == [] + + # Mock test for parse, search_dates and DateDataParser + + detect_languages = Mock() + detect_languages.return_value = ["en"] + + # parse + + def when_date_is_parsed_using_parse(self, dt_string): + self.result = parse(dt_string, detect_languages_function=detect_languages) + + def then_date_obj_exactly_is(self, expected_date_obj): + self.assertEqual(expected_date_obj, self.result) + + @parameterized.expand([ + param("Tuesday Jul 22, 2014", datetime(2014, 7, 22, 0, 0, 0)), + ]) + def test_custom_language_detect_mock_parse(self, dt_string, expected_date_obj): + self.when_date_is_parsed_using_parse(dt_string) + 
self.then_date_obj_exactly_is(expected_date_obj) + + # DateDataParser + + def when_date_is_parsed_using_with_datedataparser(self, dt_string): + ddp = DateDataParser(detect_languages_function=detect_languages) + self.result = ddp.get_date_data(dt_string)["date_obj"] + + @parameterized.expand([ + param("Tuesday Jul 22, 2014", datetime(2014, 7, 22, 0, 0, 0)), + ]) + def test_custom_language_detect_mock_datedataparser(self, dt_string, expected_date_obj): + self.when_date_is_parsed_using_with_datedataparser(dt_string) + self.then_date_obj_exactly_is(expected_date_obj) + + # search_date + + def when_date_is_parsed_using_with_search_dates(self, dt_string): + self.result = search_dates(dt_string, detect_languages_function=detect_languages) + + @parameterized.expand([ + param('January 3, 2017 - February 1st', + [('January 3, 2017', datetime(2017, 1, 3, 0, 0)), + ('February 1st', datetime(2017, 2, 1, 0, 0))]), + ]) + def test_custom_language_detect_mock_search_dates(self, dt_string, expected_date_obj): + self.when_date_is_parsed_using_with_search_dates(dt_string) + self.then_date_obj_exactly_is(expected_date_obj) diff --git a/tests/test_languages.py b/tests/test_languages.py index 536d6cafc..a9d292d8f 100644 --- a/tests/test_languages.py +++ b/tests/test_languages.py @@ -8,6 +8,11 @@ from dateparser.conf import apply_settings from dateparser.search.detection import AutoDetectLanguage, ExactLanguages from dateparser.utils import normalize_unicode +from dateparser import parse +from dateparser.date import DateDataParser +from dateparser.search import search_dates + +from datetime import datetime from tests import BaseTestCase @@ -2168,3 +2173,49 @@ def test_validate_extra_keys_when_invalid(self, lang_id, lang_info, log_msg): result = self.validator._validate_extra_keys(lang_id, lang_info) self.assertEqual(log_msg, self.get_log_str()) self.assertFalse(result) + + @parameterized.expand([ + param(date_string='3 de marzo 2019', languages=["en"], settings={ + "DEFAULT_LANGUAGES": 
["es"] + }, expected=datetime(2019, 3, 3, 0, 0)), + ]) + def test_parse_settings_default_languages(self, date_string, languages, settings, expected): + result = parse(date_string, languages=languages, settings=settings) + assert result == expected + + @parameterized.expand([ + param(date_string='3 de marzo 2019', languages=["en"], settings={ + "DEFAULT_LANGUAGES": ["es"] + }, expected=datetime(2019, 3, 3, 0, 0)), + ]) + def test_date_data_parser_settings_default_languages(self, date_string, languages, settings, expected): + ddp = DateDataParser(languages=languages, settings=settings) + result = ddp.get_date_data(date_string) + assert result.date_obj == expected + + @parameterized.expand([ + param(date_string='3 de marzo 2019', settings={ + "DEFAULT_LANGUAGES": ["es"] + }, expected=[('3 de marzo 2019', datetime(2019, 3, 3, 0, 0))]), + ]) + def test_search_dates_settings_default_languages(self, date_string, settings, expected): + result = search_dates(date_string, settings=settings) + assert result == expected + + @parameterized.expand([ + param(date_string='RANDOM_WORD ', settings={ + "DEFAULT_LANGUAGES": ["en"] + }) + ]) + def test_parse_settings_default_languages_no_language_detect(self, date_string, settings): + result = parse(date_string, settings=settings) + assert result is None + + @parameterized.expand([ + param(date_string='29 mai 2021', languages=["fr"], expected=datetime(2021, 5, 29, 0, 0), settings={ + "DEFAULT_LANGUAGES": ["en", "es"] + }), + ]) + def test_parse_settings_default_languages_with_detected_language(self, date_string, languages, expected, settings): + result = parse(date_string, languages=languages, settings=settings) + assert result == expected diff --git a/tests/test_settings.py b/tests/test_settings.py index 44c355c9c..171dd6a63 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -173,6 +173,8 @@ def test_check_settings_wrong_setting_name(self): param('NORMALIZE', 'true', '', True), param('FUZZY', 'true', '', False), 
param('PREFER_LOCALE_DATE_ORDER', 'false', '', True), + param('DEFAULT_LANGUAGES', 'en', '', ['en']), + param('LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD', '1', '', 0.5), ]) def test_check_settings(self, setting, wrong_type, wrong_value, valid_value): with self.assertRaisesRegex( @@ -212,6 +214,22 @@ def test_check_settings_extra_check_parsers(self): ): DateDataParser(settings={'PARSERS': ['absolute-time', 'timestamp', 'absolute-time']}) + def test_check_settings_extra_check_confidence_threshold(self): + with self.assertRaisesRegex( + SettingValidationError, + r'1.1 is not a valid value for ' + r'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD. It can take values ' + r'between 0 and 1' + ): + DateDataParser(settings={'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': 1.1}) + + def test_check_settings_extra_check_default_languages(self): + with self.assertRaisesRegex( + SettingValidationError, + "Found invalid languages in the 'DEFAULT_LANGUAGES' setting: 'abcd'" + ): + DateDataParser(settings={'DEFAULT_LANGUAGES': ["abcd"]}) + @pytest.mark.parametrize( "date_string,expected_result", [ @@ -227,3 +245,18 @@ def test_no_spaces_strict_parsing(date_string, expected_result): parser = DateDataParser(settings={'PARSERS': ['no-spaces-time'], 'STRICT_PARSING': True}) assert parser.get_date_data(date_string)['date_obj'] is None + + +def detect_languages(text, confidence_threshold): + if confidence_threshold > 0.5: + return ['en'] + else: + return ['fr'] + + +def test_confidence_threshold_setting_is_applied(): + ddp = DateDataParser(detect_languages_function=detect_languages, settings={'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': 0.6}) + assert ddp.get_date_data('21/06/2020').locale == 'en' + + ddp2 = DateDataParser(detect_languages_function=detect_languages, settings={'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': 0.4}) + assert ddp2.get_date_data('21/06/2020').locale == 'fr' diff --git a/tox.ini b/tox.ini index c9bc329d2..208d05c31 100644 --- a/tox.ini +++ b/tox.ini @@ -10,7 +10,7 @@ deps = deps = 
{[base]deps} tzlocal<3.0b1 -extras = calendars +extras = calendars, fasttext, langdetect commands = pytest --cov=dateparser --cov-report=xml {posargs: tests} @@ -21,7 +21,7 @@ deps = [testenv:flake8] basepython = python3 -extras = calendars +extras = calendars, fasttext, langdetect deps = {[testenv]deps} pytest-flake8