From f11fd004aa4231b62b1ec0c366bb8ae5ce44e0a7 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 15:00:02 +0200
Subject: [PATCH] Normalize - Use language from Corpus
---
orangecontrib/text/language.py | 2 +-
orangecontrib/text/preprocess/normalize.py | 171 +++++++++++++-------
orangecontrib/text/tests/test_preprocess.py | 106 +++++++++---
3 files changed, 195 insertions(+), 84 deletions(-)
diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py
index d65c03d29..f4f8af28e 100644
--- a/orangecontrib/text/language.py
+++ b/orangecontrib/text/language.py
@@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
- "grc": "Ancient greek",
+ "grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py
index 735666714..f8ab7f835 100644
--- a/orangecontrib/text/preprocess/normalize.py
+++ b/orangecontrib/text/preprocess/normalize.py
@@ -1,4 +1,5 @@
-from typing import List, Callable
+import warnings
+from typing import List, Callable, Optional
import os
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
@@ -10,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback
from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import Preprocessor, TokenizedPreprocessor
@@ -23,18 +25,24 @@ class BaseNormalizer(TokenizedPreprocessor):
normalizer.
"""
normalizer = NotImplemented
+ supported_languages = NotImplemented
def __init__(self):
# cache already normalized string to speedup normalization
+ self._language = None
self._normalization_cache = {}
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
if callback is None:
callback = dummy_callback
+ self.load_model(self.get_language(corpus.language))
corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
callback(0.2, "Normalizing...")
return self._store_tokens(corpus, wrap_callback(callback, start=0.2))
+ def load_model(self, iso_language: str):
+ pass
+
def _preprocess(self, string: str) -> str:
""" Normalizes token to canonical form. """
if string in self._normalization_cache:
@@ -54,10 +62,27 @@ def __setstate__(self, state):
# _normalization_cache
self._normalization_cache = {}
+ def get_language(self, corpus_language: Optional[str]) -> str:
+ language = self._language or corpus_language
+ if language not in self.supported_languages:
+ option = ["selected", "Corpus's"][self._language is None]
+ error_text = f"{self.name} does not support the {option} language."
+ if len(self.supported_languages) == 1:
+ # a normalizer that supports only a single language has no language
+ # setting - the language always comes from the Corpus, so raising
+ # makes no sense here: the user cannot choose a different language;
+ # warn and fall back to the only supported language instead
+ language = next(iter(self.supported_languages))
+ warnings.warn(error_text + f" It will use model's language: {language}.")
+ else:
+ raise ValueError(error_text)
+ return language
+
class WordNetLemmatizer(BaseNormalizer):
name = 'WordNet Lemmatizer'
normalizer = stem.WordNetLemmatizer().lemmatize
+ supported_languages = {"en"}
@wait_nltk_data
def __init__(self):
@@ -67,42 +92,42 @@ def __init__(self):
class PorterStemmer(BaseNormalizer):
name = 'Porter Stemmer'
normalizer = stem.PorterStemmer().stem
+ supported_languages = {"en"}
class SnowballStemmer(BaseNormalizer):
name = 'Snowball Stemmer'
- supported_languages = [l.capitalize() for l in
- stem.SnowballStemmer.languages]
+ supported_languages = {
+ LANG2ISO[la.capitalize()]
+ for la in stem.SnowballStemmer.languages
+ # porter is not a language but the Porter stemmer, which we implement separately
+ if la != "porter"
+ }
- def __init__(self, language='English'):
+ def __init__(self, language: Optional[str] = None):
super().__init__()
- self.normalizer = stem.SnowballStemmer(language.lower()).stem
-
-
-def language_to_name(language):
- return language.lower().replace(' ', '') + 'ud'
-
-
-def file_to_name(file):
- return file.replace('-', '').replace('_', '')
-
+ self._language = language
-def file_to_language(file):
- return file[:file.find('ud') - 1] \
- .replace('-', ' ').replace('_', ' ').capitalize()
+ def load_model(self, iso_language: str):
+ language = ISO2LANG[iso_language].lower()
+ self.normalizer = stem.SnowballStemmer(language).stem
class UDPipeModels:
server_url = "https://file.biolab.si/files/udpipe/"
+ # some language names differ between UDPipe and the ISO standard
+ LANG2UDPIPE = {"Norwegian Bokmål": "Norwegian Bokmaal"}
+ UDPIPE2LANG = {v: k for k, v in LANG2UDPIPE.items()}
+
def __init__(self):
self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/')
self.serverfiles = serverfiles.ServerFiles(self.server_url)
self.localfiles = serverfiles.LocalFiles(self.local_data,
serverfiles=self.serverfiles)
- def __getitem__(self, language):
- file_name = self._find_file(language_to_name(language))
+ def __getitem__(self, iso_language):
+ file_name = self._find_file(self.iso_to_file(iso_language))
return self.localfiles.localpath_download(file_name)
@property
@@ -113,12 +138,14 @@ def model_files(self):
return self.localfiles.listfiles()
def _find_file(self, language):
- return next(filter(lambda f: file_to_name(f).startswith(language),
- map(lambda f: f[0], self.model_files)))
+ return next(f[0] for f in self.model_files if f[0].startswith(language))
@property
def supported_languages(self):
- return list(map(lambda f: file_to_language(f[0]), self.model_files))
+ return sorted(map(lambda f: self.file_to_language(f[0]), self.model_files))
+
+ def supported_languages_iso(self):
+ return [self.language_to_iso(lg) for lg in self.supported_languages]
@property
def online(self):
@@ -128,6 +155,46 @@ def online(self):
except ConnectionError:
return False
+ # use _ since - is already used in iso standard
+ VARIATION_DELIMITER = "_"
+
+ def language_to_iso(self, language):
+ if "(" in language:
+ language, model = language.split("(")
+ language = LANG2ISO[language.strip()]
+ return UDPipeModels.VARIATION_DELIMITER.join((language, model.strip(")")))
+ return LANG2ISO[language]
+
+ @staticmethod
+ def iso_to_language(iso_language):
+ lg_var = iso_language.split(UDPipeModels.VARIATION_DELIMITER)
+ lg, model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], "")
+ lg = ISO2LANG[lg]
+ if model_variation:
+ model_variation = f"({model_variation})"
+ return " ".join((lg, model_variation)).strip()
+
+ def iso_to_file(self, iso_language):
+ lg_var = iso_language.split(self.VARIATION_DELIMITER)
+ lg, model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], None)
+ lg = ISO2LANG[lg]
+ lg = [self.LANG2UDPIPE.get(lg, lg).lower().replace(" ", "_")]
+ if model_variation:
+ lg.append(model_variation)
+ return "-".join(lg + ["ud"])
+
+ def file_to_language(self, file):
+ lg = file[: file.find("ud") - 1].split("-")
+ # if filename includes "-" then variation is part of the name
+ lg, model_variation = lg if len(lg) == 2 else (lg[0], "")
+ # capitalize multi-word languages separated by _
+ lg = " ".join(map(lambda x: x.capitalize(), lg.split("_")))
+ # fix wrong spelling for Norwegian Bokmål
+ lg = self.UDPIPE2LANG.get(lg, lg)
+ if model_variation:
+ model_variation = f"({model_variation})"
+ return " ".join((lg, model_variation)).strip()
+
class UDPipeStopIteration(StopIteration):
pass
@@ -136,13 +203,17 @@ class UDPipeStopIteration(StopIteration):
class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'
- def __init__(self, language='English', use_tokenizer=False):
+ def __init__(self, language: Optional[str] = None, use_tokenizer=False):
super().__init__()
- self.__language = language
+ self._language = language
self.__use_tokenizer = use_tokenizer
self.models = UDPipeModels()
self.__model = None
+ @property
+ def supported_languages(self):
+ return self.models.supported_languages_iso()
+
@property
def use_tokenizer(self):
return self.__use_tokenizer
@@ -153,12 +224,8 @@ def normalizer(self):
else self.__normalize_token
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
- try:
- self.__model = udpipe.Model.load(self.models[self.__language])
- except StopIteration:
- raise UDPipeStopIteration
-
if self.__use_tokenizer:
+ self.load_model(self.get_language(corpus.language))
corpus = Preprocessor.__call__(self, corpus)
if callback is None:
callback = dummy_callback
@@ -167,6 +234,12 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
else:
return super().__call__(corpus, callback)
+ def load_model(self, iso_language: str):
+ try:
+ self.__model = udpipe.Model.load(self.models[iso_language])
+ except StopIteration:
+ raise UDPipeStopIteration
+
def __normalize_token(self, token: str) -> str:
sentence = udpipe.Sentence()
sentence.addWord(token)
@@ -213,39 +286,15 @@ def __setstate__(self, state):
class LemmagenLemmatizer(BaseNormalizer):
name = 'Lemmagen Lemmatizer'
- lemmagen_languages = {
- "Bulgarian": "bg",
- "Croatian": "hr",
- "Czech": "cs",
- "English": "en",
- "Estonian": "et",
- "Farsi/Persian": "fa",
- "French": "fr",
- "German": "de",
- "Hungarian": "hu",
- "Italian": "it",
- "Macedonian": "mk",
- "Polish": "pl",
- "Romanian": "ro",
- "Russian": "ru",
- "Serbian": "sr",
- "Slovak": "sk",
- "Slovenian": "sl",
- "Spanish": "es",
- "Ukrainian": "uk"
- }
+ supported_languages = Lemmatizer.list_supported_languages()
- def __init__(self, language='English'):
+ def __init__(self, language: Optional[str] = None):
super().__init__()
- self.language = language
+ self._language = language
self.lemmatizer = None
- def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
- # lemmagen3 lemmatizer is not picklable, define it on call and discard it afterward
- self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
- output_corpus = super().__call__(corpus, callback)
- self.lemmatizer = None
- return output_corpus
+ def load_model(self, iso_language: str):
+ self.lemmatizer = Lemmatizer(iso_language)
def normalizer(self, token):
assert self.lemmatizer is not None
@@ -253,3 +302,9 @@ def normalizer(self, token):
# sometimes Lemmagen returns an empty string, return original tokens
# in this case
return t if t else token
+
+ def __getstate__(self):
+ """Remove the model from the state since it cannot be pickled"""
+ state = super().__getstate__()
+ state["lemmatizer"] = None
+ return state
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index 26dc54821..7d275b677 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -115,6 +115,8 @@ def _preprocess(cls, string):
def test_token_normalizer(self):
class CapTokenNormalizer(preprocess.BaseNormalizer):
+ supported_languages = {"en"}
+
@classmethod
def _preprocess(cls, token):
return token.capitalize()
@@ -270,7 +272,7 @@ def test_call_word_net(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
def test_call_UDPipe(self):
- pp = preprocess.UDPipeLemmatizer(language="Lithuanian")
+ pp = preprocess.UDPipeLemmatizer(language="lt")
self.assertFalse(self.corpus.has_tokens())
corpus = pp(self.corpus)
self.assertTrue(corpus.has_tokens())
@@ -289,15 +291,19 @@ def test_function(self):
self.assertEqual(stemmer._preprocess('token'), 'toke')
def test_snowball(self):
- stemmer = preprocess.SnowballStemmer('french')
+ stemmer = preprocess.SnowballStemmer("fr")
token = 'voudrais'
- self.assertEqual(
- stemmer._preprocess(token),
- nltk.SnowballStemmer(language='french').stem(token))
+ with self.corpus.unlocked():
+ self.corpus.metas[0, 0] = token
+ corpus = stemmer(self.corpus)
+ self.assertListEqual(
+ list(corpus.tokens[0]),
+ [nltk.SnowballStemmer(language="french").stem(token)],
+ )
def test_udpipe(self):
"""Test udpipe token lemmatization"""
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
+ normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
corpus = normalizer(self.corpus)
@@ -306,7 +312,7 @@ def test_udpipe(self):
def test_udpipe_doc(self):
"""Test udpipe lemmatization with its own tokenization"""
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+ normalizer = preprocess.UDPipeLemmatizer("lt", True)
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "Ant kalno dega namas"
corpus = normalizer(self.corpus)
@@ -314,12 +320,11 @@ def test_udpipe_doc(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 1)
def test_udpipe_pickle(self):
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+ normalizer = preprocess.UDPipeLemmatizer("lt", True)
# udpipe store model after first call - model is not picklable
normalizer(self.corpus)
loaded = pickle.loads(pickle.dumps(normalizer))
- self.assertEqual(normalizer._UDPipeLemmatizer__language,
- loaded._UDPipeLemmatizer__language)
+ self.assertEqual(normalizer._language, loaded._language)
self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
loaded._UDPipeLemmatizer__use_tokenizer)
with self.corpus.unlocked():
@@ -329,10 +334,9 @@ def test_udpipe_pickle(self):
)
def test_udpipe_deepcopy(self):
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian", True)
+ normalizer = preprocess.UDPipeLemmatizer("lt", True)
copied = copy.deepcopy(normalizer)
- self.assertEqual(normalizer._UDPipeLemmatizer__language,
- copied._UDPipeLemmatizer__language)
+ self.assertEqual(normalizer._language, copied._language)
self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
copied._UDPipeLemmatizer__use_tokenizer)
with self.corpus.unlocked():
@@ -342,7 +346,7 @@ def test_udpipe_deepcopy(self):
)
def test_lemmagen(self):
- normalizer = preprocess.LemmagenLemmatizer('Slovenian')
+ normalizer = preprocess.LemmagenLemmatizer("sl")
sentence = 'Gori na gori hiša gori'
with self.corpus.unlocked():
self.corpus.metas[0, 0] = sentence
@@ -356,7 +360,7 @@ def test_normalizers_picklable(self):
for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
normalizer = getattr(preprocess.normalize, nm)
normalizer = (
- normalizer(language="Lithuanian")
+ normalizer(language="lt")
if normalizer is preprocess.UDPipeLemmatizer
else normalizer()
)
@@ -365,7 +369,7 @@ def test_normalizers_picklable(self):
loaded(self.corpus)
def test_cache(self):
- normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
+ normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
normalizer(self.corpus)
@@ -376,26 +380,69 @@ def test_cache(self):
loaded_normalizer = pickle.loads(pickle.dumps(normalizer))
self.assertEqual(0, len(loaded_normalizer._normalization_cache))
+ def test_language(self):
+ normalizers = (
+ # preprocessor, valid language
+ (preprocess.WordNetLemmatizer, "en"),
+ (preprocess.PorterStemmer, "en"),
+ (preprocess.SnowballStemmer, "en"),
+ (preprocess.UDPipeLemmatizer, "lt"),
+ (preprocess.LemmagenLemmatizer, "en"),
+ )
+ without_lang = (preprocess.WordNetLemmatizer, preprocess.PorterStemmer)
+ for n, valid_lg in normalizers:
+ for lg in (valid_lg, "tg"): # valid and not valid language
+ # try with language in constructor
+ m = n() if n in without_lang else n(language=lg)
+ if lg == valid_lg or n in without_lang:
+ res = m(self.corpus)
+ self.assertIsNotNone(res)
+ else:
+ with self.assertRaises(ValueError):
+ m(self.corpus)
+
+ # try with valid language in corpus
+ m = n()
+ corpus = self.corpus.copy()
+ corpus.attributes["language"] = lg
+ if lg == valid_lg:
+ res = m(corpus)
+ self.assertIsNotNone(res)
+ else:
+ if n in without_lang:
+ with self.assertWarns(UserWarning):
+ res = m(corpus)
+ self.assertIsNotNone(res)
+ else:
+ with self.assertRaises(ValueError):
+ m(corpus)
+
@patch(SF_LIST, return_value=SERVER_FILES)
class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self, _):
"""Test helper functions for label transformation"""
- self.assertEqual(file_to_language('slovenian-sst-ud-2.0-170801.udpipe'),
- 'Slovenian sst')
- self.assertEqual(file_to_name('slovenian-sst-ud-2.0-170801.udpipe'),
- 'sloveniansstud2.0170801.udpipe')
- self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud')
+ model = UDPipeModels()
+ self.assertEqual(
+ model.file_to_language("slovenian-sst-ud-2.0-170801.udpipe"),
+ "Slovenian (sst)"
+ )
+ self.assertEqual(model.iso_to_file("sl_sst"), "slovenian-sst-ud")
+ self.assertEqual(
+ model.file_to_language("norwegian_bokmaal-sst-ud-2.0-170801.udpipe"),
+ "Norwegian Bokmål (sst)",
+ )
+ self.assertEqual(model.iso_to_file("nb_sst"), "norwegian_bokmaal-sst-ud")
@patch(SF_DOWNLOAD, download_patch)
def test_udpipe_model(self, _):
"""Test udpipe models loading from server"""
models = UDPipeModels()
- self.assertIn("Lithuanian", models.supported_languages)
+ self.assertIn("lt", models.supported_languages_iso())
self.assertEqual(7, len(models.supported_languages))
local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe")
- model = models["Lithuanian"]
+ model = models["lt"]
self.assertEqual(model, local_file)
self.assertTrue(os.path.isfile(local_file))
@@ -405,10 +452,11 @@ def test_udpipe_local_models(self, sf_mock):
models = UDPipeModels()
[models.localfiles.remove(f[0]) for f in models.localfiles.listfiles()]
# use Uyghur, it is the smallest model, we can have it in the repository
- _ = models["Lithuanian"]
+ _ = models["lt"]
sf_mock.side_effect = ConnectionError()
+ self.assertIn("lt", UDPipeModels().supported_languages_iso())
self.assertIn("Lithuanian", UDPipeModels().supported_languages)
- self.assertEqual(1, len(UDPipeModels().supported_languages))
+ self.assertEqual(1, len(UDPipeModels().supported_languages_iso()))
def test_udpipe_offline(self, sf_mock):
"""Test if UDPipe works offline"""
@@ -416,6 +464,14 @@ def test_udpipe_offline(self, sf_mock):
sf_mock.side_effect = ConnectionError()
self.assertFalse(UDPipeModels().online)
+ def test_language_to_iso(self, _):
+ self.assertEqual("en", UDPipeModels().language_to_iso("English"))
+ self.assertEqual("en_lines", UDPipeModels().language_to_iso("English (lines)"))
+
+ def test_iso_to_language(self, _):
+ self.assertEqual("English", UDPipeModels.iso_to_language("en"))
+ self.assertEqual("English (lines)", UDPipeModels.iso_to_language("en_lines"))
+
class FilteringTests(unittest.TestCase):
def setUp(self):