From f11fd004aa4231b62b1ec0c366bb8ae5ce44e0a7 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 14 Apr 2023 15:00:02 +0200 Subject: [PATCH] Normalize - Use language from Corpus --- orangecontrib/text/language.py | 2 +- orangecontrib/text/preprocess/normalize.py | 171 +++++++++++++------- orangecontrib/text/tests/test_preprocess.py | 106 +++++++++--- 3 files changed, 195 insertions(+), 84 deletions(-) diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py index d65c03d29..f4f8af28e 100644 --- a/orangecontrib/text/language.py +++ b/orangecontrib/text/language.py @@ -41,7 +41,7 @@ "ga": "Irish", "gl": "Galician", "got": "Gothic", - "grc": "Ancient greek", + "grc": "Ancient Greek", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py index 735666714..f8ab7f835 100644 --- a/orangecontrib/text/preprocess/normalize.py +++ b/orangecontrib/text/preprocess/normalize.py @@ -1,4 +1,5 @@ -from typing import List, Callable +import warnings +from typing import List, Callable, Optional import os import ufal.udpipe as udpipe from lemmagen3 import Lemmatizer @@ -10,6 +11,7 @@ from Orange.util import wrap_callback, dummy_callback from orangecontrib.text import Corpus +from orangecontrib.text.language import ISO2LANG, LANG2ISO from orangecontrib.text.misc import wait_nltk_data from orangecontrib.text.preprocess import Preprocessor, TokenizedPreprocessor @@ -23,18 +25,24 @@ class BaseNormalizer(TokenizedPreprocessor): normalizer. 
""" normalizer = NotImplemented + supported_languages = NotImplemented def __init__(self): # cache already normalized string to speedup normalization + self._language = None self._normalization_cache = {} def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: if callback is None: callback = dummy_callback + self.load_model(self.get_language(corpus.language)) corpus = super().__call__(corpus, wrap_callback(callback, end=0.2)) callback(0.2, "Normalizing...") return self._store_tokens(corpus, wrap_callback(callback, start=0.2)) + def load_model(self, iso_language: str): + pass + def _preprocess(self, string: str) -> str: """ Normalizes token to canonical form. """ if string in self._normalization_cache: @@ -54,10 +62,27 @@ def __setstate__(self, state): # _normalization_cache self._normalization_cache = {} + def get_language(self, corpus_language: Optional[str]) -> str: + language = self._language or corpus_language + if language not in self.supported_languages: + option = ["selected", "Corpus's"][self._language is None] + error_text = f"{self.name} does not support the {option} language." 
+ if len(self.supported_languages) == 1: + # model that support only single language does not have language + # attribute - language will always be extracted from Corpus + # it doesn't make sense to raise here since user does not have + # option to provide different language setting + language = next(iter(self.supported_languages)) + warnings.warn(error_text + f" It will use model's language: {language}.") + else: + raise ValueError(error_text) + return language + class WordNetLemmatizer(BaseNormalizer): name = 'WordNet Lemmatizer' normalizer = stem.WordNetLemmatizer().lemmatize + supported_languages = {"en"} @wait_nltk_data def __init__(self): @@ -67,42 +92,42 @@ def __init__(self): class PorterStemmer(BaseNormalizer): name = 'Porter Stemmer' normalizer = stem.PorterStemmer().stem + supported_languages = {"en"} class SnowballStemmer(BaseNormalizer): name = 'Snowball Stemmer' - supported_languages = [l.capitalize() for l in - stem.SnowballStemmer.languages] + supported_languages = { + LANG2ISO[la.capitalize()] + for la in stem.SnowballStemmer.languages + # porter is not language but porter stemmer that we implement separately + if la != "porter" + } - def __init__(self, language='English'): + def __init__(self, language: Optional[str] = None): super().__init__() - self.normalizer = stem.SnowballStemmer(language.lower()).stem - - -def language_to_name(language): - return language.lower().replace(' ', '') + 'ud' - - -def file_to_name(file): - return file.replace('-', '').replace('_', '') - + self._language = language -def file_to_language(file): - return file[:file.find('ud') - 1] \ - .replace('-', ' ').replace('_', ' ').capitalize() + def load_model(self, iso_language: str): + language = ISO2LANG[iso_language].lower() + self.normalizer = stem.SnowballStemmer(language).stem class UDPipeModels: server_url = "https://file.biolab.si/files/udpipe/" + # some languages differ between udpipe and iso standard + LANG2UDPIPE = {"Norwegian Bokmål": "Norwegian Bokmaal"} + UDPIPE2LANG = {v:
k for k, v in LANG2UDPIPE.items()} + def __init__(self): self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/') self.serverfiles = serverfiles.ServerFiles(self.server_url) self.localfiles = serverfiles.LocalFiles(self.local_data, serverfiles=self.serverfiles) - def __getitem__(self, language): - file_name = self._find_file(language_to_name(language)) + def __getitem__(self, iso_language): + file_name = self._find_file(self.iso_to_file(iso_language)) return self.localfiles.localpath_download(file_name) @property @@ -113,12 +138,14 @@ def model_files(self): return self.localfiles.listfiles() def _find_file(self, language): - return next(filter(lambda f: file_to_name(f).startswith(language), - map(lambda f: f[0], self.model_files))) + return next(f[0] for f in self.model_files if f[0].startswith(language)) @property def supported_languages(self): - return list(map(lambda f: file_to_language(f[0]), self.model_files)) + return sorted(map(lambda f: self.file_to_language(f[0]), self.model_files)) + + def supported_languages_iso(self): + return [self.language_to_iso(lg) for lg in self.supported_languages] @property def online(self): @@ -128,6 +155,46 @@ def online(self): except ConnectionError: return False + # use _ since - is already used in iso standard + VARIATION_DELIMITER = "_" + + def language_to_iso(self, language): + if "(" in language: + language, model = language.split("(") + language = LANG2ISO[language.strip()] + return UDPipeModels.VARIATION_DELIMITER.join((language, model.strip(")"))) + return LANG2ISO[language] + + @staticmethod + def iso_to_language(iso_language): + lg_var = iso_language.split(UDPipeModels.VARIATION_DELIMITER) + lg, model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], "") + lg = ISO2LANG[lg] + if model_variation: + model_variation = f"({model_variation})" + return " ".join((lg, model_variation)).strip() + + def iso_to_file(self, iso_language): + lg_var = iso_language.split(self.VARIATION_DELIMITER) + lg, 
model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], None) + lg = ISO2LANG[lg] + lg = [self.LANG2UDPIPE.get(lg, lg).lower().replace(" ", "_")] + if model_variation: + lg.append(model_variation) + return "-".join(lg + ["ud"]) + + def file_to_language(self, file): + lg = file[: file.find("ud") - 1].split("-") + # if filename includes "-" then variation is part of the name + lg, model_variation = lg if len(lg) == 2 else (lg[0], "") + # capitalize multi-word languages separated by _ + lg = " ".join(map(lambda x: x.capitalize(), lg.split("_"))) + # fix wrong spelling for Norwegian Bokmål + lg = self.UDPIPE2LANG.get(lg, lg) + if model_variation: + model_variation = f"({model_variation})" + return " ".join((lg, model_variation)).strip() + class UDPipeStopIteration(StopIteration): pass @@ -136,13 +203,17 @@ class UDPipeStopIteration(StopIteration): class UDPipeLemmatizer(BaseNormalizer): name = 'UDPipe Lemmatizer' - def __init__(self, language='English', use_tokenizer=False): + def __init__(self, language: Optional[str] = None, use_tokenizer=False): super().__init__() - self.__language = language + self._language = language self.__use_tokenizer = use_tokenizer self.models = UDPipeModels() self.__model = None + @property + def supported_languages(self): + return self.models.supported_languages_iso() + @property def use_tokenizer(self): return self.__use_tokenizer @@ -153,12 +224,8 @@ def normalizer(self): else self.__normalize_token def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: - try: - self.__model = udpipe.Model.load(self.models[self.__language]) - except StopIteration: - raise UDPipeStopIteration - if self.__use_tokenizer: + self.load_model(self.get_language(corpus.language)) corpus = Preprocessor.__call__(self, corpus) if callback is None: callback = dummy_callback @@ -167,6 +234,12 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: else: return super().__call__(corpus, callback) + def load_model(self, 
iso_language: str): + try: + self.__model = udpipe.Model.load(self.models[iso_language]) + except StopIteration: + raise UDPipeStopIteration + def __normalize_token(self, token: str) -> str: sentence = udpipe.Sentence() sentence.addWord(token) @@ -213,39 +286,15 @@ def __setstate__(self, state): class LemmagenLemmatizer(BaseNormalizer): name = 'Lemmagen Lemmatizer' - lemmagen_languages = { - "Bulgarian": "bg", - "Croatian": "hr", - "Czech": "cs", - "English": "en", - "Estonian": "et", - "Farsi/Persian": "fa", - "French": "fr", - "German": "de", - "Hungarian": "hu", - "Italian": "it", - "Macedonian": "mk", - "Polish": "pl", - "Romanian": "ro", - "Russian": "ru", - "Serbian": "sr", - "Slovak": "sk", - "Slovenian": "sl", - "Spanish": "es", - "Ukrainian": "uk" - } + supported_languages = Lemmatizer.list_supported_languages() - def __init__(self, language='English'): + def __init__(self, language: Optional[str] = None): super().__init__() - self.language = language + self._language = language self.lemmatizer = None - def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: - # lemmagen3 lemmatizer is not picklable, define it on call and discard it afterward - self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language]) - output_corpus = super().__call__(corpus, callback) - self.lemmatizer = None - return output_corpus + def load_model(self, iso_language: str): + self.lemmatizer = Lemmatizer(iso_language) def normalizer(self, token): assert self.lemmatizer is not None @@ -253,3 +302,9 @@ def normalizer(self, token): # sometimes Lemmagen returns an empty string, return original tokens # in this case return t if t else token + + def __getstate__(self): + """Return the pickling state with the lemmatizer, which cannot be pickled, set to None""" + state = super().__getstate__() + state["lemmatizer"] = None + return state diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py index 26dc54821..7d275b677 100644 ---
a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -115,6 +115,8 @@ def _preprocess(cls, string): def test_token_normalizer(self): class CapTokenNormalizer(preprocess.BaseNormalizer): + supported_languages = {"en"} + @classmethod def _preprocess(cls, token): return token.capitalize() @@ -270,7 +272,7 @@ def test_call_word_net(self): self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2) def test_call_UDPipe(self): - pp = preprocess.UDPipeLemmatizer(language="Lithuanian") + pp = preprocess.UDPipeLemmatizer(language="lt") self.assertFalse(self.corpus.has_tokens()) corpus = pp(self.corpus) self.assertTrue(corpus.has_tokens()) @@ -289,15 +291,19 @@ def test_function(self): self.assertEqual(stemmer._preprocess('token'), 'toke') def test_snowball(self): - stemmer = preprocess.SnowballStemmer('french') + stemmer = preprocess.SnowballStemmer("fr") token = 'voudrais' - self.assertEqual( - stemmer._preprocess(token), - nltk.SnowballStemmer(language='french').stem(token)) + with self.corpus.unlocked(): + self.corpus.metas[0, 0] = token + corpus = stemmer(self.corpus) + self.assertListEqual( + list(corpus.tokens[0]), + [nltk.SnowballStemmer(language="french").stem(token)], + ) def test_udpipe(self): """Test udpipe token lemmatization""" - normalizer = preprocess.UDPipeLemmatizer("Lithuanian") + normalizer = preprocess.UDPipeLemmatizer("lt") with self.corpus.unlocked(): self.corpus.metas[0, 0] = "esu" corpus = normalizer(self.corpus) @@ -306,7 +312,7 @@ def test_udpipe(self): def test_udpipe_doc(self): """Test udpipe lemmatization with its own tokenization""" - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) with self.corpus.unlocked(): self.corpus.metas[0, 0] = "Ant kalno dega namas" corpus = normalizer(self.corpus) @@ -314,12 +320,11 @@ def test_udpipe_doc(self): self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1) def 
test_udpipe_pickle(self): - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) # udpipe store model after first call - model is not picklable normalizer(self.corpus) loaded = pickle.loads(pickle.dumps(normalizer)) - self.assertEqual(normalizer._UDPipeLemmatizer__language, - loaded._UDPipeLemmatizer__language) + self.assertEqual(normalizer._language, loaded._language) self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer, loaded._UDPipeLemmatizer__use_tokenizer) with self.corpus.unlocked(): @@ -329,10 +334,9 @@ def test_udpipe_pickle(self): ) def test_udpipe_deepcopy(self): - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) copied = copy.deepcopy(normalizer) - self.assertEqual(normalizer._UDPipeLemmatizer__language, - copied._UDPipeLemmatizer__language) + self.assertEqual(normalizer._language, copied._language) self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer, copied._UDPipeLemmatizer__use_tokenizer) with self.corpus.unlocked(): @@ -342,7 +346,7 @@ def test_udpipe_deepcopy(self): ) def test_lemmagen(self): - normalizer = preprocess.LemmagenLemmatizer('Slovenian') + normalizer = preprocess.LemmagenLemmatizer("sl") sentence = 'Gori na gori hiša gori' with self.corpus.unlocked(): self.corpus.metas[0, 0] = sentence @@ -356,7 +360,7 @@ def test_normalizers_picklable(self): for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}: normalizer = getattr(preprocess.normalize, nm) normalizer = ( - normalizer(language="Lithuanian") + normalizer(language="lt") if normalizer is preprocess.UDPipeLemmatizer else normalizer() ) @@ -365,7 +369,7 @@ def test_normalizers_picklable(self): loaded(self.corpus) def test_cache(self): - normalizer = preprocess.UDPipeLemmatizer("Lithuanian") + normalizer = preprocess.UDPipeLemmatizer("lt") with self.corpus.unlocked(): self.corpus.metas[0, 0] = "esu" normalizer(self.corpus) @@ -376,26 
+380,69 @@ def test_cache(self): loaded_normalizer = pickle.loads(pickle.dumps(normalizer)) self.assertEqual(0, len(loaded_normalizer._normalization_cache)) + def test_language(self): + normalizers = ( + # preprocessor, valid language + (preprocess.WordNetLemmatizer, "en"), + (preprocess.PorterStemmer, "en"), + (preprocess.SnowballStemmer, "en"), + (preprocess.UDPipeLemmatizer, "lt"), + (preprocess.LemmagenLemmatizer, "en"), + ) + without_lang = (preprocess.WordNetLemmatizer, preprocess.PorterStemmer) + for n, valid_lg in normalizers: + for lg in (valid_lg, "tg"): # valid and not valid language + # try with language in constructor + m = n() if n in without_lang else n(language=lg) + if lg == valid_lg or n in without_lang: + res = m(self.corpus) + self.assertIsNotNone(res) + else: + with self.assertRaises(ValueError): + m(self.corpus) + + # try with valid language in corpus + m = n() + corpus = self.corpus.copy() + corpus.attributes["language"] = lg + if lg == valid_lg: + res = m(corpus) + self.assertIsNotNone(res) + else: + if n in without_lang: + with self.assertWarns(UserWarning): + res = m(corpus) + self.assertIsNotNone(res) + else: + with self.assertRaises(ValueError): + m(corpus) + @patch(SF_LIST, return_value=SERVER_FILES) class UDPipeModelsTests(unittest.TestCase): def test_label_transform(self, _): """Test helper functions for label transformation""" - self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'), - 'Slovenian sst') - self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'), - 'sloveniansstud2.0170801.udpipe') - self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud') + model = UDPipeModels() + self.assertEqual( + model.file_to_language("slovenian-sst-ud-2.0-170801.udpipe"), + "Slovenian (sst)" + ) + self.assertEqual(model.iso_to_file("sl_sst"), "slovenian-sst-ud") + self.assertEqual( + model.file_to_language("norwegian_bokmaal-sst-ud-2.0-170801.udpipe"), + "Norwegian Bokmål (sst)", + ) + 
self.assertEqual(model.iso_to_file("nb_sst"), "norwegian_bokmaal-sst-ud") @patch(SF_DOWNLOAD, download_patch) def test_udpipe_model(self, _): """Test udpipe models loading from server""" models = UDPipeModels() - self.assertIn("Lithuanian", models.supported_languages) + self.assertIn("lt", models.supported_languages_iso()) self.assertEqual(7, len(models.supported_languages)) local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe") - model = models["Lithuanian"] + model = models["lt"] self.assertEqual(model, local_file) self.assertTrue(os.path.isfile(local_file)) @@ -405,10 +452,11 @@ def test_udpipe_local_models(self, sf_mock): models = UDPipeModels() [models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()] # use Uyghur, it is the smallest model, we can have it in the repository - _ = models["Lithuanian"] + _ = models["lt"] sf_mock.side_effect = ConnectionError() + self.assertIn("lt", UDPipeModels().supported_languages_iso()) self.assertIn("Lithuanian", UDPipeModels().supported_languages) - self.assertEqual(1, len(UDPipeModels().supported_languages)) + self.assertEqual(1, len(UDPipeModels().supported_languages_iso())) def test_udpipe_offline(self, sf_mock): """Test if UDPipe works offline""" @@ -416,6 +464,14 @@ def test_udpipe_offline(self, sf_mock): sf_mock.side_effect = ConnectionError() self.assertFalse(UDPipeModels().online) + def test_language_to_iso(self, _): + self.assertEqual("en", UDPipeModels().language_to_iso("English")) + self.assertEqual("en_lines", UDPipeModels().language_to_iso("English (lines)")) + + def test_iso_to_language(self, _): + self.assertEqual("English", UDPipeModels.iso_to_language("en")) + self.assertEqual("English (lines)", UDPipeModels.iso_to_language("en_lines")) + class FilteringTests(unittest.TestCase): def setUp(self):