Skip to content

Commit

Permalink
Normalize - Use language from Corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 1, 2023
1 parent c0edee3 commit f11fd00
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 84 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
"grc": "Ancient greek",
"grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
Expand Down
171 changes: 113 additions & 58 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Callable
import warnings
from typing import List, Callable, Optional
import os
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
Expand All @@ -10,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import Preprocessor, TokenizedPreprocessor

Expand All @@ -23,18 +25,24 @@ class BaseNormalizer(TokenizedPreprocessor):
normalizer.
"""
normalizer = NotImplemented
supported_languages = NotImplemented

def __init__(self):
    # language set explicitly on the normalizer; while it is None the
    # language of the Corpus is used instead (see get_language)
    self._language = None
    # cache already normalized strings to speed up normalization
    self._normalization_cache = {}

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    """Load the model for the resolved language and normalize ``corpus``.

    The parent preprocessor call reports the first 20 % of the progress,
    storing the normalized tokens reports the remaining 80 %.
    """
    progress = dummy_callback if callback is None else callback
    # the model must be ready before any token is normalized
    self.load_model(self.get_language(corpus.language))
    preprocessed = super().__call__(corpus, wrap_callback(progress, end=0.2))
    progress(0.2, "Normalizing...")
    return self._store_tokens(preprocessed, wrap_callback(progress, start=0.2))

def load_model(self, iso_language: str):
    """Prepare the normalizer for ``iso_language``.

    The base implementation does nothing; normalizers that need a
    language-specific model override this method.
    """

def _preprocess(self, string: str) -> str:
""" Normalizes token to canonical form. """
if string in self._normalization_cache:
Expand All @@ -54,10 +62,27 @@ def __setstate__(self, state):
# _normalization_cache
self._normalization_cache = {}

def get_language(self, corpus_language: Optional[str]) -> str:
    """Resolve the language the normalizer is asked to work with.

    The language set on the normalizer (``self._language``) takes precedence
    over ``corpus_language``.

    :param corpus_language: language of the Corpus, may be None
    :return: the resolved language
    :raises ValueError: when the resolved language is not supported and the
        normalizer supports more than one language
    """
    language = self._language or corpus_language
    if language in self.supported_languages:
        return language

    option = "Corpus's" if self._language is None else "selected"
    error_text = f"{self.name} does not support the {option} language."
    if len(self.supported_languages) != 1:
        raise ValueError(error_text)

    # A model that supports only a single language does not have a language
    # attribute - the language is always extracted from the Corpus. It does
    # not make sense to raise here since the user has no option to provide
    # a different language setting, so only warn.
    la = next(iter(self.supported_languages))
    warnings.warn(error_text + f" It will use model's language: {la}.")
    # NOTE: the unsupported (possibly None) language is still returned
    return language


class WordNetLemmatizer(BaseNormalizer):
name = 'WordNet Lemmatizer'
normalizer = stem.WordNetLemmatizer().lemmatize
supported_languages = {"en"}

@wait_nltk_data
def __init__(self):
Expand All @@ -67,42 +92,42 @@ def __init__(self):
class PorterStemmer(BaseNormalizer):
name = 'Porter Stemmer'
normalizer = stem.PorterStemmer().stem
supported_languages = {"en"}


class SnowballStemmer(BaseNormalizer):
name = 'Snowball Stemmer'
supported_languages = [l.capitalize() for l in
stem.SnowballStemmer.languages]
supported_languages = {
LANG2ISO[la.capitalize()]
for la in stem.SnowballStemmer.languages
# porter is not language but porter stemmer that we implement separately
if la != "porter"
}

def __init__(self, language='English'):
def __init__(self, language: Optional[str] = None):
super().__init__()
self.normalizer = stem.SnowballStemmer(language.lower()).stem


def language_to_name(language):
return language.lower().replace(' ', '') + 'ud'


def file_to_name(file):
return file.replace('-', '').replace('_', '')

self._language = language

def file_to_language(file):
return file[:file.find('ud') - 1] \
.replace('-', ' ').replace('_', ' ').capitalize()
def load_model(self, iso_language: str):
    """Create the Snowball stemmer for ``iso_language``.

    :param iso_language: ISO code of the language; it is translated to the
        language name through ``ISO2LANG``
    """
    # the stemmer is selected by the lower-cased language name
    language = ISO2LANG[iso_language].lower()
    self.normalizer = stem.SnowballStemmer(language).stem


class UDPipeModels:
server_url = "https://file.biolab.si/files/udpipe/"

# some languages differ between udpipe and iso standard
LANG2UDPIPE = {"Norwegian Bokmål": "Norwegian Bokmaal"}
UDPIPE2LANG = {v: k for k, v in LANG2UDPIPE.items()}

def __init__(self):
self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/')
self.serverfiles = serverfiles.ServerFiles(self.server_url)
self.localfiles = serverfiles.LocalFiles(self.local_data,
serverfiles=self.serverfiles)

def __getitem__(self, language):
file_name = self._find_file(language_to_name(language))
def __getitem__(self, iso_language):
file_name = self._find_file(self.iso_to_file(iso_language))
return self.localfiles.localpath_download(file_name)

@property
Expand All @@ -113,12 +138,14 @@ def model_files(self):
return self.localfiles.listfiles()

def _find_file(self, language):
return next(filter(lambda f: file_to_name(f).startswith(language),
map(lambda f: f[0], self.model_files)))
return next(f[0] for f in self.model_files if f[0].startswith(language))

@property
def supported_languages(self):
return list(map(lambda f: file_to_language(f[0]), self.model_files))
return sorted(map(lambda f: self.file_to_language(f[0]), self.model_files))

def supported_languages_iso(self):
    """Return ``supported_languages`` converted with ``language_to_iso``."""
    return list(map(self.language_to_iso, self.supported_languages))

@property
def online(self):
Expand All @@ -128,6 +155,46 @@ def online(self):
except ConnectionError:
return False

# use _ since - is already used in iso standard
VARIATION_DELIMITER = "_"

def language_to_iso(self, language):
    """Convert a language name, optionally followed by a model variation in
    parentheses, to an ISO code with the variation appended after
    ``VARIATION_DELIMITER``.
    """
    if "(" not in language:
        return LANG2ISO[language]
    # "Name (variation)" -> name part and variation part
    name, variation = language.split("(")
    iso_code = LANG2ISO[name.strip()]
    return UDPipeModels.VARIATION_DELIMITER.join((iso_code, variation.strip(")")))

@staticmethod
def iso_to_language(iso_language):
    """Convert an ISO code, optionally carrying a model variation after
    ``VARIATION_DELIMITER``, to a language name with the variation in
    parentheses.
    """
    parts = iso_language.split(UDPipeModels.VARIATION_DELIMITER)
    if len(parts) == 2:
        iso_code, variation = parts
    else:
        # no variation: only the first part is used
        iso_code, variation = parts[0], ""
    name = ISO2LANG[iso_code]
    suffix = f"({variation})" if variation else ""
    return " ".join((name, suffix)).strip()

def iso_to_file(self, iso_language):
    """Build the beginning of the model file name for ``iso_language``.

    The result is the lower-cased, underscore-separated UDPipe language
    name, the optional model variation and ``ud``, joined with ``-``.
    """
    parts = iso_language.split(self.VARIATION_DELIMITER)
    if len(parts) == 2:
        iso_code, variation = parts
    else:
        # no variation: only the first part is used
        iso_code, variation = parts[0], None
    name = ISO2LANG[iso_code]
    # some languages are spelled differently in UDPipe file names
    udpipe_name = self.LANG2UDPIPE.get(name, name)
    segments = [udpipe_name.lower().replace(" ", "_")]
    if variation:
        segments.append(variation)
    segments.append("ud")
    return "-".join(segments)

def file_to_language(self, file):
    """Derive the language name (with the model variation in parentheses,
    when the file name has one) from a model file name.
    """
    # text before the first "ud", without the character preceding it
    pieces = file[: file.find("ud") - 1].split("-")
    if len(pieces) == 2:
        # a "-" in that text separates the language from the variation
        raw_name, variation = pieces
    else:
        raw_name, variation = pieces[0], ""
    # capitalize each word of multi-word languages separated by _
    name = " ".join(word.capitalize() for word in raw_name.split("_"))
    # map UDPipe spelling back (e.g. Norwegian Bokmål)
    name = self.UDPIPE2LANG.get(name, name)
    suffix = f"({variation})" if variation else ""
    return f"{name} {suffix}".strip()


class UDPipeStopIteration(StopIteration):
pass
Expand All @@ -136,13 +203,17 @@ class UDPipeStopIteration(StopIteration):
class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'

def __init__(self, language='English', use_tokenizer=False):
def __init__(self, language: Optional[str] = None, use_tokenizer=False):
super().__init__()
self.__language = language
self._language = language
self.__use_tokenizer = use_tokenizer
self.models = UDPipeModels()
self.__model = None

@property
def supported_languages(self):
    """Languages reported by ``self.models.supported_languages_iso()``."""
    iso_codes = self.models.supported_languages_iso()
    return iso_codes

@property
def use_tokenizer(self):
return self.__use_tokenizer
Expand All @@ -153,12 +224,8 @@ def normalizer(self):
else self.__normalize_token

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
try:
self.__model = udpipe.Model.load(self.models[self.__language])
except StopIteration:
raise UDPipeStopIteration

if self.__use_tokenizer:
self.load_model(self.get_language(corpus.language))
corpus = Preprocessor.__call__(self, corpus)
if callback is None:
callback = dummy_callback
Expand All @@ -167,6 +234,12 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
else:
return super().__call__(corpus, callback)

def load_model(self, iso_language: str):
    """Load the UDPipe model for ``iso_language``.

    A ``StopIteration`` raised while obtaining or loading the model is
    re-raised as ``UDPipeStopIteration``.
    """
    try:
        model_path = self.models[iso_language]
        self.__model = udpipe.Model.load(model_path)
    except StopIteration:
        raise UDPipeStopIteration

def __normalize_token(self, token: str) -> str:
sentence = udpipe.Sentence()
sentence.addWord(token)
Expand Down Expand Up @@ -213,43 +286,25 @@ def __setstate__(self, state):

class LemmagenLemmatizer(BaseNormalizer):
name = 'Lemmagen Lemmatizer'
lemmagen_languages = {
"Bulgarian": "bg",
"Croatian": "hr",
"Czech": "cs",
"English": "en",
"Estonian": "et",
"Farsi/Persian": "fa",
"French": "fr",
"German": "de",
"Hungarian": "hu",
"Italian": "it",
"Macedonian": "mk",
"Polish": "pl",
"Romanian": "ro",
"Russian": "ru",
"Serbian": "sr",
"Slovak": "sk",
"Slovenian": "sl",
"Spanish": "es",
"Ukrainian": "uk"
}
supported_languages = Lemmatizer.list_supported_languages()

def __init__(self, language='English'):
def __init__(self, language: Optional[str] = None):
super().__init__()
self.language = language
self._language = language
self.lemmatizer = None

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
# lemmagen3 lemmatizer is not picklable, define it on call and discard it afterward
self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
output_corpus = super().__call__(corpus, callback)
self.lemmatizer = None
return output_corpus
def load_model(self, iso_language: str):
    """Create the Lemmagen lemmatizer for ``iso_language``."""
    lemmatizer = Lemmatizer(iso_language)
    self.lemmatizer = lemmatizer

def normalizer(self, token):
    """Lemmatize ``token``; the lemmatizer must already be set."""
    assert self.lemmatizer is not None
    lemma = self.lemmatizer.lemmatize(token)
    # Lemmagen sometimes returns an empty string - keep the original token
    # in that case
    return lemma or token

def __getstate__(self):
    """Return the parent's state with the model, which cannot be pickled,
    replaced by None."""
    pickle_state = super().__getstate__()
    pickle_state["lemmatizer"] = None
    return pickle_state
Loading

0 comments on commit f11fd00

Please sign in to comment.