From b35a6758c0c210bedbe0e01b2b92bd40e1c168fc Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 14 Apr 2023 15:00:02 +0200 Subject: [PATCH] Normalize - Use language from Corpus --- orangecontrib/text/language.py | 2 +- orangecontrib/text/preprocess/normalize.py | 79 +++++++++++++++++---- orangecontrib/text/tests/test_preprocess.py | 49 ++++++++----- 3 files changed, 98 insertions(+), 32 deletions(-) diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py index d65c03d29..f4f8af28e 100644 --- a/orangecontrib/text/language.py +++ b/orangecontrib/text/language.py @@ -41,7 +41,7 @@ "ga": "Irish", "gl": "Galician", "got": "Gothic", - "grc": "Ancient greek", + "grc": "Ancient Greek", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py index 07f85f761..cb7666794 100644 --- a/orangecontrib/text/preprocess/normalize.py +++ b/orangecontrib/text/preprocess/normalize.py @@ -1,4 +1,5 @@ -from typing import List, Callable +import warnings +from typing import List, Callable, Optional import os import ufal.udpipe as udpipe from lemmagen3 import Lemmatizer @@ -100,14 +101,18 @@ def file_to_language(file): class UDPipeModels: server_url = "https://file.biolab.si/files/udpipe/" + # some languages differ between udpipe and iso standard + LANG2UDPIPE = {"Norwegian Bokmål": "Norwegian Bokmaal"} + UDPIPE2LANG = {v: k for k, v in LANG2UDPIPE.items()} + def __init__(self): self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/') self.serverfiles = serverfiles.ServerFiles(self.server_url) self.localfiles = serverfiles.LocalFiles(self.local_data, serverfiles=self.serverfiles) - def __getitem__(self, language): - file_name = self._find_file(language_to_name(language)) + def __getitem__(self, iso_language): + file_name = self._find_file(self.iso_to_file(iso_language)) return self.localfiles.localpath_download(file_name) @property @@ -118,12 +123,14 @@ def model_files(self): return self.localfiles.listfiles() def _find_file(self, language): - return next(filter(lambda f: file_to_name(f).startswith(language), - map(lambda f: f[0], self.model_files))) + return next(f[0] for f in self.model_files if f[0].startswith(language)) @property def supported_languages(self): - return list(map(lambda f: file_to_language(f[0]), self.model_files)) + return sorted(map(lambda f: self.file_to_language(f[0]), self.model_files)) + + def supported_languages_iso(self): + return [self.language_to_iso(lg) for lg in self.supported_languages] @property def online(self): @@ -133,6 +140,46 @@ def online(self): except ConnectionError: return False + # use _ since - is already used in iso standard + VARIATION_DELIMITER = "_" + + def language_to_iso(self, language): + if "(" in language: + language, model = language.split("(") + language = LANG2ISO[language.strip()] + return UDPipeModels.VARIATION_DELIMITER.join((language, model.strip(")"))) + return LANG2ISO[language] + + @staticmethod + def iso_to_language(iso_language): + lg_var = iso_language.split(UDPipeModels.VARIATION_DELIMITER) + lg, model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], "") + lg = ISO2LANG[lg] + if model_variation: + model_variation = f"({model_variation})" + return " ".join((lg, model_variation)).strip() + + def iso_to_file(self, iso_language): + lg_var = iso_language.split(self.VARIATION_DELIMITER) + lg, model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], None) + lg = ISO2LANG[lg] + lg = [self.LANG2UDPIPE.get(lg, lg).lower().replace(" ", "_")] + if model_variation: + lg.append(model_variation) + return "-".join(lg + ["ud"]) + + def file_to_language(self, file): + lg = file[: file.find("ud") - 1].split("-") + # if filename includes "-" then variation is part of the name + lg, model_variation = lg if len(lg) == 2 else (lg[0], "") + # capitalize multi-word languages separated by _ + lg = " ".join(map(lambda x: x.capitalize(), lg.split("_"))) + # fix wrong spelling for Norwegian Bokmål + lg = self.UDPIPE2LANG.get(lg, lg) + if model_variation: + model_variation = f"({model_variation})" + return " ".join((lg, model_variation)).strip() + class UDPipeStopIteration(StopIteration): pass @@ -141,13 +188,17 @@ class UDPipeStopIteration(StopIteration): class UDPipeLemmatizer(BaseNormalizer): name = 'UDPipe Lemmatizer' - def __init__(self, language='English', use_tokenizer=False): + def __init__(self, language: Optional[str] = None, use_tokenizer=False): super().__init__() - self.__language = language + self._language = language self.__use_tokenizer = use_tokenizer self.models = UDPipeModels() self.__model = None + @property + def supported_languages(self): + return self.models.supported_languages_iso() + @property def use_tokenizer(self): return self.__use_tokenizer @@ -158,12 +209,8 @@ def normalizer(self): else self.__normalize_token def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: - try: - self.__model = udpipe.Model.load(self.models[self.__language]) - except StopIteration: - raise UDPipeStopIteration - if self.__use_tokenizer: + self.load_model(self.get_language(corpus.language)) corpus = Preprocessor.__call__(self, corpus) if callback is None: callback = dummy_callback @@ -172,6 +219,12 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: else: return super().__call__(corpus, callback) + def load_model(self, iso_language: str): + try: + self.__model = udpipe.Model.load(self.models[iso_language]) + except StopIteration: + raise UDPipeStopIteration + def __normalize_token(self, token: str) -> str: sentence = udpipe.Sentence() sentence.addWord(token) diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py index 516c2627c..b3df54813 100644 --- a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -270,7 +270,7 @@ def test_call_word_net(self): self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2) def test_call_UDPipe(self): - pp = preprocess.UDPipeLemmatizer(language="Lithuanian") + pp = preprocess.UDPipeLemmatizer(language="lt") self.assertFalse(self.corpus.has_tokens()) corpus = pp(self.corpus) self.assertTrue(corpus.has_tokens()) @@ -304,7 +304,7 @@ def test_snowball_all_langs(self): def test_udpipe(self): """Test udpipe token lemmatization""" - normalizer = preprocess.UDPipeLemmatizer("Lithuanian") + normalizer = preprocess.UDPipeLemmatizer("lt") with self.corpus.unlocked(): self.corpus.metas[0, 0] = "esu" corpus = normalizer(self.corpus) @@ -313,7 +313,7 @@ def test_udpipe(self): def test_udpipe_doc(self): """Test udpipe lemmatization with its own tokenization""" - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) with self.corpus.unlocked(): self.corpus.metas[0, 0] = "Ant kalno dega namas" corpus = normalizer(self.corpus) @@ -321,12 +321,11 @@ def test_udpipe_doc(self): self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1) def test_udpipe_pickle(self): - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) # udpipe store model after first call - model is not picklable normalizer(self.corpus) loaded = pickle.loads(pickle.dumps(normalizer)) - self.assertEqual(normalizer._UDPipeLemmatizer__language, - loaded._UDPipeLemmatizer__language) + self.assertEqual(normalizer._language, loaded._language) self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer, loaded._UDPipeLemmatizer__use_tokenizer) with self.corpus.unlocked(): @@ -336,10 +335,9 @@ def test_udpipe_pickle(self): ) def test_udpipe_deepcopy(self): - normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True) + normalizer = preprocess.UDPipeLemmatizer("lt", True) copied = copy.deepcopy(normalizer) - self.assertEqual(normalizer._UDPipeLemmatizer__language, - copied._UDPipeLemmatizer__language) + self.assertEqual(normalizer._language, copied._language) self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer, copied._UDPipeLemmatizer__use_tokenizer) with self.corpus.unlocked(): @@ -395,21 +393,27 @@ def test_cache(self): class UDPipeModelsTests(unittest.TestCase): def test_label_transform(self, _): """Test helper functions for label transformation""" - self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'), - 'Slovenian sst') - self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'), - 'sloveniansstud2.0170801.udpipe') - self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud') + model = UDPipeModels() + self.assertEqual( + model.file_to_language("slovenian-sst-ud-2.0-170801.udpipe"), + "Slovenian (sst)" + ) + self.assertEqual(model.iso_to_file("sl_sst"), "slovenian-sst-ud") + self.assertEqual( + model.file_to_language("norwegian_bokmaal-sst-ud-2.0-170801.udpipe"), + "Norwegian Bokmål (sst)", + ) + self.assertEqual(model.iso_to_file("nb_sst"), "norwegian_bokmaal-sst-ud") @patch(SF_DOWNLOAD, download_patch) def test_udpipe_model(self, _): """Test udpipe models loading from server""" models = UDPipeModels() - self.assertIn("Lithuanian", models.supported_languages) + self.assertIn("lt", models.supported_languages_iso()) self.assertEqual(7, len(models.supported_languages)) local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe") - model = models["Lithuanian"] + model = models["lt"] self.assertEqual(model, local_file) self.assertTrue(os.path.isfile(local_file)) @@ -419,10 +423,11 @@ def test_udpipe_local_models(self, sf_mock): models = UDPipeModels() [models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()] # use Uyghur, it is the smallest model, we can have it in the repository - _ = models["Lithuanian"] + _ = models["lt"] sf_mock.side_effect = ConnectionError() + self.assertIn("lt", UDPipeModels().supported_languages_iso()) self.assertIn("Lithuanian", UDPipeModels().supported_languages) - self.assertEqual(1, len(UDPipeModels().supported_languages)) + self.assertEqual(1, len(UDPipeModels().supported_languages_iso())) def test_udpipe_offline(self, sf_mock): """Test if UDPipe works offline""" @@ -430,6 +435,14 @@ def test_udpipe_offline(self, sf_mock): sf_mock.side_effect = ConnectionError() self.assertFalse(UDPipeModels().online) + def test_language_to_iso(self, _): + self.assertEqual("en", UDPipeModels.language_to_iso("English")) + self.assertEqual("en_lines", UDPipeModels.language_to_iso("English (lines)")) + + def test_iso_to_language(self, _): + self.assertEqual("English", UDPipeModels.iso_to_language("en")) + self.assertEqual("English (lines)", UDPipeModels.iso_to_language("en_lines")) + class FilteringTests(unittest.TestCase): def setUp(self):