
Commit 0495fd5

Merge pull request #1025 from PrimozGodec/language-normalizers
[ENH] Lemmagen - Use ISO language codes
2 parents c0edee3 + 6ee6269 · commit 0495fd5

4 files changed: +89 −78 lines

orangecontrib/text/preprocess/normalize.py (+15 −31)
@@ -213,43 +213,27 @@ def __setstate__(self, state):
 
 class LemmagenLemmatizer(BaseNormalizer):
     name = 'Lemmagen Lemmatizer'
-    lemmagen_languages = {
-        "Bulgarian": "bg",
-        "Croatian": "hr",
-        "Czech": "cs",
-        "English": "en",
-        "Estonian": "et",
-        "Farsi/Persian": "fa",
-        "French": "fr",
-        "German": "de",
-        "Hungarian": "hu",
-        "Italian": "it",
-        "Macedonian": "mk",
-        "Polish": "pl",
-        "Romanian": "ro",
-        "Russian": "ru",
-        "Serbian": "sr",
-        "Slovak": "sk",
-        "Slovenian": "sl",
-        "Spanish": "es",
-        "Ukrainian": "uk"
-    }
+    supported_languages = set(Lemmatizer.list_supported_languages())
 
-    def __init__(self, language='English'):
+    def __init__(self, language="en"):
         super().__init__()
-        self.language = language
-        self.lemmatizer = None
-
-    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
-        # lemmagen3 lemmatizer is not picklable, define it on call and discard it afterward
-        self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
-        output_corpus = super().__call__(corpus, callback)
-        self.lemmatizer = None
-        return output_corpus
+        self.language = language  # used only for unpickling
+        self.lemmatizer = Lemmatizer(language)
 
     def normalizer(self, token):
         assert self.lemmatizer is not None
         t = self.lemmatizer.lemmatize(token)
         # sometimes Lemmagen returns an empty string, return original tokens
         # in this case
         return t if t else token
+
+    def __getstate__(self):
+        """Remove the model that cannot be pickled"""
+        state = super().__getstate__()
+        state["lemmatizer"] = None
+        return state
+
+    def __setstate__(self, state):
+        """Reinstate the model when unpickled"""
+        super().__setstate__(state)
+        self.lemmatizer = Lemmatizer(self.language)
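
The key change in this file is the pickling strategy: instead of building the lemmagen3 model on every __call__ and discarding it afterwards, the model now lives on the instance, is dropped in __getstate__, and is rebuilt from the stored ISO code in __setstate__. Below is a minimal, self-contained sketch of that pattern; the _UnpicklableModel class is a hypothetical stand-in for the lemmagen3 Lemmatizer, which cannot be pickled.

import pickle


class _UnpicklableModel:
    """Hypothetical stand-in for a C-backed model that cannot be pickled."""

    def __init__(self, language: str):
        self.language = language

    def __reduce__(self):
        raise TypeError("this model cannot be pickled")


class Normalizer:
    def __init__(self, language: str = "en"):
        self.language = language                  # small, picklable state
        self.model = _UnpicklableModel(language)  # heavy, unpicklable state

    def __getstate__(self):
        # Drop the model before pickling; only the language code is kept.
        state = self.__dict__.copy()
        state["model"] = None
        return state

    def __setstate__(self, state):
        # Restore attributes, then rebuild the model from the stored code.
        self.__dict__.update(state)
        self.model = _UnpicklableModel(self.language)


clone = pickle.loads(pickle.dumps(Normalizer("sl")))
assert clone.language == "sl" and clone.model is not None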

orangecontrib/text/tests/test_preprocess.py (+9 −2)
@@ -342,15 +342,22 @@ def test_udpipe_deepcopy(self):
         )
 
     def test_lemmagen(self):
-        normalizer = preprocess.LemmagenLemmatizer('Slovenian')
-        sentence = 'Gori na gori hiša gori'
+        normalizer = preprocess.LemmagenLemmatizer("sl")
+        sentence = "Gori na gori hiša gori"
         with self.corpus.unlocked():
             self.corpus.metas[0, 0] = sentence
         self.assertEqual(
             [Lemmatizer("sl").lemmatize(t) for t in sentence.split()],
             normalizer(self.corpus).tokens[0],
         )
 
+    def test_lemmagen_all_langs(self):
+        for language in preprocess.LemmagenLemmatizer.supported_languages:
+            normalizer = preprocess.LemmagenLemmatizer(language)
+            tokens = normalizer(self.corpus).tokens
+            self.assertEqual(len(self.corpus), len(tokens))
+            self.assertTrue(all(tokens))
+
     def test_normalizers_picklable(self):
         """ Normalizers must be picklable, tests if it is true"""
         for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
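
For callers, the visible effect of this commit is the constructor argument: full language names are replaced by ISO 639-1 codes. A rough usage sketch under the new API follows; the "deerwester" sample corpus name is an assumption about datasets bundled with the add-on.

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import LemmagenLemmatizer

corpus = Corpus.from_file("deerwester")  # assumed bundled sample corpus
# Before this commit: LemmagenLemmatizer("English")
lemmatizer = LemmagenLemmatizer("en")    # now: ISO 639-1 code
print(lemmatizer(corpus).tokens[0])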

orangecontrib/text/widgets/owpreprocess.py (+22 −14)
@@ -24,7 +24,7 @@
 from Orange.widgets.widget import Input, Output, Msg, Message
 
 from orangecontrib.text import Corpus
-from orangecontrib.text.language import ISO2LANG
+from orangecontrib.text.language import ISO2LANG, LANG2ISO
 from orangecontrib.text.misc import nltk_data_dir
 from orangecontrib.text.preprocess import *
 from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
@@ -475,13 +475,15 @@ class NormalizationModule(SingleMethodModule):
               UDPipe: UDPipeLemmatizer,
               Lemmagen: LemmagenLemmatizer}
     DEFAULT_METHOD = Porter
-    DEFAULT_LANGUAGE = "English"
+    DEFAULT_SNOWBALL_LANG = "English"  # todo: remove when snowball use iso
+    DEFAULT_UDPIPE_LANG = "English"  # todo: remove when udpipe use iso
+    DEFAULT_LANGUAGE = "en"
     DEFAULT_USE_TOKE = False
 
     def __init__(self, parent=None, **kwargs):
         super().__init__(parent, **kwargs)
-        self.__snowball_lang = self.DEFAULT_LANGUAGE
-        self.__udpipe_lang = self.DEFAULT_LANGUAGE
+        self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG
+        self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
         self.__lemmagen_lang = self.DEFAULT_LANGUAGE
         self.__use_tokenizer = self.DEFAULT_USE_TOKE
 
@@ -490,15 +492,17 @@ def __init__(self, parent=None, **kwargs):
             self.__snowball_lang, self.__set_snowball_lang
         )
         self.__combo_udl = UDPipeComboBox(
-            self, self.__udpipe_lang, self.DEFAULT_LANGUAGE,
-            self.__set_udpipe_lang
+            self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
         )
         self.__check_use = QCheckBox("UDPipe tokenizer",
                                      checked=self.DEFAULT_USE_TOKE)
         self.__check_use.clicked.connect(self.__set_use_tokenizer)
-        self.__combo_lemm = ComboBox(
-            self, LemmagenLemmatizer.lemmagen_languages,
-            self.__lemmagen_lang, self.__set_lemmagen_lang
+        self.__combo_lemm = LanguageComboBox(
+            self,
+            LemmagenLemmatizer.supported_languages,
+            self.__lemmagen_lang,
+            False,
+            self.__set_lemmagen_lang,
         )
 
         label = QLabel("Language:")
@@ -530,9 +534,9 @@ def __enable_udpipe(self):
 
     def setParameters(self, params: Dict):
         super().setParameters(params)
-        snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
+        snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG)
         self.__set_snowball_lang(snowball_lang)
-        udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
+        udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
         self.__set_udpipe_lang(udpipe_lang)
         use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
         self.__set_use_tokenizer(use_tokenizer)
@@ -562,7 +566,7 @@ def __set_udpipe_lang(self, language: str):
     def __set_lemmagen_lang(self, language: str):
         if self.__lemmagen_lang != language:
             self.__lemmagen_lang = language
-            self.__combo_lemm.setCurrentText(language)
+            self.__combo_lemm.set_current_language(language)
             self.changed.emit()
             if self.method == self.Lemmagen:
                 self.edited.emit()
@@ -587,12 +591,14 @@ def parameters(self) -> Dict:
     def createinstance(params: Dict) -> BaseNormalizer:
         method = params.get("method", NormalizationModule.DEFAULT_METHOD)
         args = {}
+        def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG
+        def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
         def_lang = NormalizationModule.DEFAULT_LANGUAGE
         if method == NormalizationModule.Snowball:
-            args = {"language": params.get("snowball_language", def_lang)}
+            args = {"language": params.get("snowball_language", def_snowball)}
         elif method == NormalizationModule.UDPipe:
             def_use = NormalizationModule.DEFAULT_USE_TOKE
-            args = {"language": params.get("udpipe_language", def_lang),
+            args = {"language": params.get("udpipe_language", def_udpipe),
                     "use_tokenizer": params.get("udpipe_tokenizer", def_use)}
         elif method == NormalizationModule.Lemmagen:
             args = {"language": params.get("lemmagen_language", def_lang)}
@@ -1384,6 +1390,8 @@ def str_into_paths(label):
                    pp["language"] = None
                else:
                    pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])
+            if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
+                pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]
 
 
 if __name__ == "__main__":
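
The settings-migration hunk above rewrites stored lemmagen_language values from full language names to ISO codes via LANG2ISO. A standalone sketch of that mapping step is shown below; the small dictionary merely stands in for the project's orangecontrib.text.language.LANG2ISO table.

# Illustrative subset only; the real table is orangecontrib.text.language.LANG2ISO.
LANG2ISO = {"English": "en", "French": "fr", "German": "de", "Slovenian": "sl"}


def migrate_lemmagen_language(preprocessors):
    """Rewrite stored lemmagen_language values from full names to ISO codes."""
    for pp_name, pp in preprocessors:
        if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
            pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]


preprocessors = [("preprocess.normalize", {"lemmagen_language": "Slovenian"})]
migrate_lemmagen_language(preprocessors)
assert preprocessors[0][1]["lemmagen_language"] == "sl"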

orangecontrib/text/widgets/tests/test_owpreprocess.py (+43 −31)
@@ -271,30 +271,16 @@ def test_migrate_settings(self):
         }
         self.create_widget(OWPreprocess, stored_settings=settings)
 
-    def test_migrate_language_settings(self):
+    def test_migrate_filter_language_settings(self):
         """Test migration to iso langauge codes"""
         settings = {
             "__version__": 3,
             "storedsettings": {
-                "preprocessors": [
-                    (
-                        "preprocess.normalize",
-                        {
-                            "snowball_language": "French",
-                            "udpipe_language": "German",
-                            "lemmagen_language": "Slovenian",
-                        },
-                    ),
-                    ("preprocess.filter", {"language": "Finnish"}),
-                ]
+                "preprocessors": [("preprocess.filter", {"language": "Finnish"})]
             },
         }
         widget = self.create_widget(OWPreprocess, stored_settings=settings)
-        normalize_settings = widget.storedsettings["preprocessors"][0][1]
-        filter_settings = widget.storedsettings["preprocessors"][1][1]
-        self.assertEqual("Slovenian", normalize_settings["lemmagen_language"])
-        self.assertEqual("French", normalize_settings["snowball_language"])
-        self.assertEqual("German", normalize_settings["udpipe_language"])
+        filter_settings = widget.storedsettings["preprocessors"][0][1]
         self.assertEqual("fi", filter_settings["language"])
 
         # NLTK uses Slovene instead of Slovenian, this is also the reason
@@ -320,6 +306,32 @@ def test_migrate_language_settings(self):
         filter_settings = widget.storedsettings["preprocessors"][0][1]
         self.assertIsNone(filter_settings["language"])
 
+    def test_migrate_lemmagen_language_settings(self):
+        """Test migration to iso language codes"""
+        settings = {
+            "__version__": 3,
+            "storedsettings": {
+                "preprocessors": [
+                    ("preprocess.normalize", {"lemmagen_language": "Slovenian"}),
+                ]
+            },
+        }
+        widget = self.create_widget(OWPreprocess, stored_settings=settings)
+        normalize_settings = widget.storedsettings["preprocessors"][0][1]
+        self.assertEqual("sl", normalize_settings["lemmagen_language"])
+
+        settings = {
+            "__version__": 3,
+            "storedsettings": {
+                "preprocessors": [
+                    ("preprocess.normalize", {"lemmagen_language": "English"}),
+                ]
+            },
+        }
+        widget = self.create_widget(OWPreprocess, stored_settings=settings)
+        normalize_settings = widget.storedsettings["preprocessors"][0][1]
+        self.assertEqual("en", normalize_settings["lemmagen_language"])
+
 
 class TestTransformationModule(WidgetTest):
     def setUp(self):
@@ -459,19 +471,23 @@ def test_init(self):
         self.assertFalse(self.check_use.isChecked())
 
     def test_parameters(self):
-        params = {"method": NormalizationModule.Porter,
-                  "snowball_language": "English",
-                  "udpipe_language": "English",
-                  "lemmagen_language": "English",
-                  "udpipe_tokenizer": False}
+        params = {
+            "method": NormalizationModule.Porter,
+            "snowball_language": "English",
+            "udpipe_language": "English",
+            "lemmagen_language": "en",
+            "udpipe_tokenizer": False,
+        }
         self.assertDictEqual(self.editor.parameters(), params)
 
     def test_set_parameters(self):
-        params = {"method": NormalizationModule.UDPipe,
-                  "snowball_language": "Dutch",
-                  "udpipe_language": "Slovenian",
-                  "lemmagen_language": "Bulgarian",
-                  "udpipe_tokenizer": True}
+        params = {
+            "method": NormalizationModule.UDPipe,
+            "snowball_language": "Dutch",
+            "udpipe_language": "Slovenian",
+            "lemmagen_language": "bg",
+            "udpipe_tokenizer": True,
+        }
         self.editor.setParameters(params)
         self.assertDictEqual(self.editor.parameters(), params)
         self.assertEqual(self.combo_sbl.currentText(), "Dutch")
@@ -738,10 +754,6 @@ def test_createinstance(self):
         pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt})
         self.assertIsInstance(pp, MaxEntTagger)
 
-        # TODO - implement StanfordPOSTagger
-        # pp = self.editor.createinstance({"method": POSTaggingModule.Stanford})
-        # self.assertIsInstance(pp, StanfordPOSTagger)
-
     def test_repr(self):
         self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")