diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py index 16a9527dd..bdfb44bdd 100644 --- a/orangecontrib/text/keywords/__init__.py +++ b/orangecontrib/text/keywords/__init__.py @@ -20,6 +20,7 @@ # all available languages for RAKE from orangecontrib.text.vectorization import BowVectorizer +# todo RAKE_LANGUAGES = StopwordsFilter.supported_languages() # all available languages for YAKE! YAKE_LANGUAGE_MAPPING = { diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py index 189d49da7..f4f8af28e 100644 --- a/orangecontrib/text/language.py +++ b/orangecontrib/text/language.py @@ -46,6 +46,9 @@ "he": "Hebrew", "hi": "Hindi", "hi-Latn": "Hindi (latin)", + # https://en.wikipedia.org/wiki/Hinglish - since it doesn't really have ISO + # code we made one up to be able to use it for stopwords (supported in NLTK) + "hi_eng": "Hinglish", "hr": "Croatian", "ht": "Haitian", "hu": "Hungarian", diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py index 4c24043bb..b02c100d0 100644 --- a/orangecontrib/text/preprocess/filter.py +++ b/orangecontrib/text/preprocess/filter.py @@ -11,7 +11,7 @@ from Orange.util import wrap_callback, dummy_callback from orangecontrib.text import Corpus -from orangecontrib.text.language import ISO2LANG +from orangecontrib.text.language import ISO2LANG, LANG2ISO from orangecontrib.text.misc import wait_nltk_data from orangecontrib.text.preprocess import TokenizedPreprocessor @@ -91,7 +91,8 @@ class StopwordsFilter(BaseTokenFilter, FileWordListMixin): name = 'Stopwords' # nltk uses different language nams for some languages - nltk_mapping = {"Slovenian": "Slovene"} + nltk_mapping = {"Slovene": "Slovenian"} + nltk_mapping_inv = {v: k for k, v in nltk_mapping.items()} def __init__( self, @@ -107,12 +108,13 @@ def __init__( @wait_nltk_data def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: - # use language set in init if not None and 
Corpus's language otherwise - la = ISO2LANG[self.__language or corpus.language] - la = self.nltk_mapping.get(la, la) if self.__use_default_stopwords: - if la in self.supported_languages(): - self.__stopwords = set(x.strip() for x in stopwords.words(la.lower())) + # use language from attr if not None and Corpus's language otherwise + lang = self.__language or corpus.language + if lang in self.supported_languages(): + lang = ISO2LANG[lang] + lang = self.nltk_mapping_inv.get(lang, lang).lower() + self.__stopwords = set(x.strip() for x in stopwords.words(lang)) else: raise ValueError( "The stopwords filter does not support the Corpus's or " @@ -124,15 +126,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus: @wait_nltk_data def supported_languages(): # get NLTK list of stopwords - stopwords_listdir = [] try: - stopwords_listdir = [file for file in - os.listdir(stopwords._get_root()) - if file.islower()] + stopwords_listdir = [ + file for file in os.listdir(stopwords._get_root()) if file.islower() + ] except LookupError: # when no NLTK data is available - pass + stopwords_listdir = [] - return sorted(file.capitalize() for file in stopwords_listdir) + def to_iso(lang): + return LANG2ISO[StopwordsFilter.nltk_mapping.get(lang, lang)] + return {to_iso(file.capitalize()) for file in stopwords_listdir} def _check(self, token): return token not in self.__stopwords and token not in self._lexicon diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py index 141e42e10..e17f3de91 100644 --- a/orangecontrib/text/preprocess/normalize.py +++ b/orangecontrib/text/preprocess/normalize.py @@ -144,7 +144,6 @@ def _find_file(self, language): def supported_languages(self): return sorted(map(lambda f: self.file_to_language(f[0]), self.model_files)) - @property def supported_languages_iso(self): return [self.language_to_iso(lg) for lg in self.supported_languages] @@ -156,17 +155,28 @@ def online(self): except 
ConnectionError: return False + # todo: cleanup # use _ since - is already used in iso standard VARIATION_DELIMITER = "_" - def language_to_iso(self, language): + @staticmethod + def language_to_iso(language): if "(" in language: language, model = language.split("(") - return self.VARIATION_DELIMITER.join((language, model.strip(")"))) + return UDPipeModels.VARIATION_DELIMITER.join((language, model.strip(")"))) return LANG2ISO[language] - def iso_to_file(self, iso_langauge): - lg_var = iso_langauge.split(self.VARIATION_DELIMITER) + @staticmethod + def iso_to_language(iso_language): + lg_var = iso_language.split(UDPipeModels.VARIATION_DELIMITER) + lg, model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], "") + lg = ISO2LANG[lg] + if model_variation: + model_variation = f"({model_variation})" + return " ".join((lg, model_variation)).strip() + + def iso_to_file(self, iso_language): + lg_var = iso_language.split(self.VARIATION_DELIMITER) lg, model_variation = lg_var if len(lg_var) == 2 else (lg_var[0], None) lg = ISO2LANG[lg] lg = [self.LANG2UDPIPE.get(lg, lg).lower().replace(" ", "_")] diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index d31af4911..8c69d42ce 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -1,10 +1,11 @@ -from typing import Dict, Optional, List, Callable, Tuple, Type, Union +from collections import defaultdict +from typing import Dict, Optional, List, Callable, Tuple, Type, Union, Iterable from types import SimpleNamespace import os import random import pkg_resources -from AnyQt.QtCore import Qt, pyqtSignal +from AnyQt.QtCore import Qt, pyqtSignal, QModelIndex from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \ QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \ QDoubleSpinBox, QFileDialog, QAbstractSpinBox @@ -12,6 +13,8 @@ from AnyQt.QtGui import QBrush, QValidator from Orange.util 
import wrap_callback +from orangecanvas.gui.utils import disconnected +from orangewidget.settings import SettingsHandler from orangewidget.utils.filedialogs import RecentPath import Orange.widgets.data.owpreprocess @@ -24,12 +27,12 @@ from Orange.widgets.widget import Input, Output, Msg, Message from orangecontrib.text import Corpus +from orangecontrib.text.language import ISO2LANG, LANG2ISO from orangecontrib.text.misc import nltk_data_dir from orangecontrib.text.preprocess import * -from orangecontrib.text.preprocess.normalize import UDPipeStopIteration +from orangecontrib.text.preprocess.normalize import UDPipeStopIteration, UDPipeModels from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \ POSTagger -from orangecontrib.text.tag.pos import StanfordPOSTaggerError _DEFAULT_NONE = "(none)" @@ -54,14 +57,25 @@ def __validate(self): self.setStyleSheet(f"QLineEdit {{ border : 1px solid {color};}}") -class ComboBox(QComboBox): - def __init__(self, master: BaseEditor, items: List[str], value: str, +class LanguageComboBox(QComboBox): + def __init__(self, master: BaseEditor, items: Iterable[Optional[str]], value: str, callback: Callable): super().__init__(master) self.setMinimumWidth(80) - self.addItems(items) - self.setCurrentText(value) - self.currentTextChanged.connect(callback) + self.__add_items(items) + self.set_current_language(value) + self.currentIndexChanged.connect(self.__index_changed) + self.callback = callback + + def __add_items(self, items: Iterable[Optional[str]]): + for itm in sorted(items, key=lambda x: ISO2LANG.get(x) or ""): + self.addItem(ISO2LANG[itm] if itm else _DEFAULT_NONE, itm) + + def __index_changed(self, index: QModelIndex): + self.callback(self.itemData(index)) + + def set_current_language(self, iso_language: str): + self.setCurrentText(ISO2LANG[iso_language]) class UDPipeComboBox(QComboBox): @@ -70,8 +84,9 @@ def __init__(self, master: BaseEditor, value: str, default: str, super().__init__(master) self.__items = [] # 
type: List self.__default_lang = default - self.add_items(value) - self.currentTextChanged.connect(callback) + self.add_items(UDPipeModels.iso_to_language(value)) + self.currentTextChanged.connect(self.__text_changed) + self.callback = callback self.setMinimumWidth(80) @property @@ -94,6 +109,12 @@ def showPopup(self): self.add_items(self.currentText()) super().showPopup() + def __text_changed(self, language): + self.callback(UDPipeLemmatizer().models.language_to_iso(language)) + + def set_current_language(self, iso_language: str): + self.setCurrentText(UDPipeModels.iso_to_language(iso_language)) + class RangeSpins(QHBoxLayout): SpinBox = QSpinBox @@ -424,9 +445,12 @@ class NormalizationModule(SingleMethodModule): UDPipe: UDPipeLemmatizer, Lemmagen: LemmagenLemmatizer} DEFAULT_METHOD = Porter - DEFAULT_LANGUAGE = "English" + DEFAULT_LANGUAGE = "en" DEFAULT_USE_TOKE = False + SNOWBALL_LANGUAGES = SnowballStemmer.supported_languages + LEMMAGEN_LANGUAGES = LemmagenLemmatizer.supported_languages + def __init__(self, parent=None, **kwargs): super().__init__(parent, **kwargs) self.__snowball_lang = self.DEFAULT_LANGUAGE @@ -434,9 +458,11 @@ def __init__(self, parent=None, **kwargs): self.__lemmagen_lang = self.DEFAULT_LANGUAGE self.__use_tokenizer = self.DEFAULT_USE_TOKE - self.__combo_sbl = ComboBox( - self, SnowballStemmer.supported_languages, - self.__snowball_lang, self.__set_snowball_lang + self.__combo_sbl = LanguageComboBox( + self, + self.SNOWBALL_LANGUAGES, + self.__snowball_lang, + self.__set_snowball_lang, ) self.__combo_udl = UDPipeComboBox( self, self.__udpipe_lang, self.DEFAULT_LANGUAGE, @@ -445,9 +471,11 @@ def __init__(self, parent=None, **kwargs): self.__check_use = QCheckBox("UDPipe tokenizer", checked=self.DEFAULT_USE_TOKE) self.__check_use.clicked.connect(self.__set_use_tokenizer) - self.__combo_lemm = ComboBox( - self, LemmagenLemmatizer.lemmagen_languages, - self.__lemmagen_lang, self.__set_lemmagen_lang + self.__combo_lemm = LanguageComboBox( + 
self, + self.LEMMAGEN_LANGUAGES, + self.__lemmagen_lang, + self.__set_lemmagen_lang, ) label = QLabel("Language:") @@ -495,7 +523,7 @@ def _set_method(self, method: int): def __set_snowball_lang(self, language: str): if self.__snowball_lang != language: self.__snowball_lang = language - self.__combo_sbl.setCurrentText(language) + self.__combo_sbl.set_current_language(language) self.changed.emit() if self.method == self.Snowball: self.edited.emit() @@ -503,7 +531,7 @@ def __set_snowball_lang(self, language: str): def __set_udpipe_lang(self, language: str): if self.__udpipe_lang != language: self.__udpipe_lang = language - self.__combo_udl.setCurrentText(language) + self.__combo_udl.set_current_language(language) self.changed.emit() if self.method == self.UDPipe: self.edited.emit() @@ -511,7 +539,7 @@ def __set_udpipe_lang(self, language: str): def __set_lemmagen_lang(self, language: str): if self.__lemmagen_lang != language: self.__lemmagen_lang = language - self.__combo_lemm.setCurrentText(language) + self.__combo_lemm.set_current_language(language) self.changed.emit() if self.method == self.Lemmagen: self.edited.emit() @@ -571,7 +599,7 @@ class FilteringModule(MultipleMethodModule): MostFreq: MostFrequentTokensFilter, PosTag: PosTagFilter} DEFAULT_METHODS = [Stopwords] - DEFAULT_LANG = "English" + DEFAULT_LANG = "en" DEFAULT_NONE = None DEFAULT_INCL_NUM = False DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \ @@ -582,6 +610,8 @@ class FilteringModule(MultipleMethodModule): DEFAULT_N_TOKEN = 100 DEFAULT_POS_TAGS = "NOUN,VERB" + STOP_WORDS_LANGUAGES = StopwordsFilter.supported_languages() + def __init__(self, parent=None, **kwargs): super().__init__(parent, **kwargs) self.__sw_lang = self.DEFAULT_LANG @@ -598,9 +628,11 @@ def __init__(self, parent=None, **kwargs): self.__pos_tag = self.DEFAULT_POS_TAGS self.__invalidated = False - self.__combo = ComboBox( - self, [_DEFAULT_NONE] + StopwordsFilter.supported_languages(), - self.__sw_lang, 
self.__set_language + self.__combo = LanguageComboBox( + self, + [None] + list(self.STOP_WORDS_LANGUAGES), + self.__sw_lang, + self.__set_language, ) self.__sw_loader = FileLoader() self.__sw_loader.set_file_list() @@ -759,7 +791,7 @@ def setParameters(self, params: Dict): def __set_language(self, language: str): if self.__sw_lang != language: self.__sw_lang = language - self.__combo.setCurrentText(language) + self.__combo.setCurrentText(ISO2LANG[language]) self.changed.emit() if self.Stopwords in self.methods: self.edited.emit() @@ -984,6 +1016,21 @@ def createinstance(params: Dict) -> POSTagger: return POSTaggingModule.Methods[method]() +class PreprocessSettingsHandler(SettingsHandler): + """ + A bit modified settings handler, that makes all language settings, which are + a part of common preprocess settings, schema_only. It removes them when + settings are not loaded from schema but from common settings. + """ + def _remove_schema_only(self, settings_dict): + super()._remove_schema_only(settings_dict) + for setting, data, _ in self.provider.traverse_settings(data=settings_dict): + for pp_name, settings in data["storedsettings"]["preprocessors"]: + for key in list(settings): + if "language" in key: + settings.pop(key) + + PREPROCESS_ACTIONS = [ PreprocessAction( "Transformation", "preprocess.transform", "", @@ -1027,7 +1074,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess, priority = 200 keywords = "preprocess text, text" - settings_version = 3 + settings_version = 4 class Inputs: corpus = Input("Corpus", Corpus) @@ -1068,12 +1115,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning): ("preprocess.tokenize", {}), ("preprocess.filter", {})] } # type: Dict[str, List[Tuple[str, Dict]]] + settingsHandler = PreprocessSettingsHandler() storedsettings = Setting(DEFAULT_PP) buttons_area_orientation = Qt.Vertical def __init__(self): ConcurrentWidgetMixin.__init__(self) Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self) + 
self.__store_pending_languages() box = gui.vBox(self.controlArea, "Preview") self.preview = "" @@ -1091,6 +1140,12 @@ def load(self, saved: Dict) -> StandardItemModel: saved["preprocessors"][i] = (name, params) return super().load(saved) + def set_model(self, pmodel): + if pmodel: + pmodel.rowsInserted.connect(self.__on_item_inserted) + super().set_model(pmodel) + + def __update_filtering_params(self, params: Dict): params["sw_path"] = self.__relocate_file(params.get("sw_path")) params["sw_list"] = self.__relocate_files(params.get("sw_list", [])) @@ -1116,10 +1171,56 @@ def __relocate_file(self, path: RecentPath) -> RecentPath: search_paths, **kwargs) return path + def __on_item_inserted(self, _, first: int, last: int): + assert first == last + self.__set_languages_single_item(first) + self.storedsettings = self.save(self.preprocessormodel) + @Inputs.corpus def set_data(self, data: Corpus): self.cancel() self.data = data + self.__set_languages() + + LANG_PARAMS = { + "preprocess.normalize": [ + ("snowball_language", NormalizationModule.SNOWBALL_LANGUAGES), + ("udpipe_language", UDPipeLemmatizer().models.supported_languages_iso), + ("lemmagen_language", NormalizationModule.LEMMAGEN_LANGUAGES), + ], + "preprocess.filter": [("language", FilteringModule.STOP_WORDS_LANGUAGES)], + } + + def __store_pending_languages(self): + self.__pending_languages = defaultdict(dict) + for pp_name, params in self.storedsettings["preprocessors"]: + for p, _ in self.LANG_PARAMS.get(pp_name, []): + if p in params: + self.__pending_languages[pp_name][p] = params[p] + + def __set_languages(self): + if self.data is not None: + for i in range(self.preprocessormodel.rowCount()): + self.__set_languages_single_item(i) + self.__pending_languages = {} + self.storedsettings = self.save(self.preprocessormodel) + + def __set_languages_single_item(self, item_index: int): + item = self.preprocessormodel.item(item_index) + pp_name = item.data(DescriptionRole).qualname + params = 
item.data(ParametersRole) + pending = self.__pending_languages.get(pp_name, {}) + for param, sup_lang in self.LANG_PARAMS.get(pp_name, []): + if param in pending: + params[param] = pending[param] + else: + sup_lang = sup_lang() if callable(sup_lang) else sup_lang + if self.data.language and self.data.language in sup_lang: + params[param] = self.data.language + with disconnected(self.preprocessormodel.dataChanged, self.__on_modelchanged): + # dataChange must be disconnected to prevent double apply call + # both calls of this method call apply after + item.setData(params, ParametersRole) def buildpreproc(self) -> PreprocessorList: plist = [] @@ -1162,8 +1263,6 @@ def apply(self): self.Error.file_not_found() except UnicodeError as e: self.Error.invalid_encoding(e) - except StanfordPOSTaggerError as e: - self.Error.stanford_tagger(e) except Exception as e: self.Error.unknown_error(str(e)) @@ -1325,6 +1424,20 @@ def str_into_paths(label): del pp_settings["start"] del pp_settings["end"] + # before version 4 languages were saved as full-word language strings + if version < 4: + preprocessors = settings["storedsettings"]["preprocessors"] + for pp_name, pp in preprocessors: + if pp_name == "preprocess.normalize": + for k in ("snowball_language", "lemmagen_language"): + if k in pp: + pp[k] = LANG2ISO[pp[k]] + up_lang = "udpipe_language" + if up_lang in pp: + pp[up_lang] = UDPipeModels.language_to_iso(pp[up_lang]) + if pp_name == "preprocess.filter" and "language" in pp: + pp["language"] = LANG2ISO[pp["language"]] + if __name__ == "__main__": from Orange.widgets.utils.widgetpreview import WidgetPreview diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py index c4f1e47ab..d4646cc90 100644 --- a/orangecontrib/text/widgets/tests/test_owpreprocess.py +++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py @@ -139,6 +139,70 @@ def fun(*_): widget = self.create_widget(OWPreprocess) 
self.assertTrue(widget.Error.invalid_encoding.is_shown()) + def test_language_from_corpus(self): + """Language from corpus is set correctly""" + initial = { + "name": "", + "preprocessors": [ + ("preprocess.transform", {}), + ("preprocess.tokenize", {}), + ("preprocess.normalize", {}), + ("preprocess.filter", {}), + ], + } + self.widget.storedsettings = initial + self.widget._initialize() + self.assertDictEqual(initial, self.widget.storedsettings) + + self.corpus.attributes["language"] = None + self.send_signal(self.widget.Inputs.corpus, self.corpus) + # nothing should change since language is missing in corpus + self.assertDictEqual(initial, self.widget.storedsettings) + + self.corpus.attributes["language"] = "en" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + normalize_settings = self.widget.storedsettings["preprocessors"][2][1] + filter_settings = self.widget.storedsettings["preprocessors"][3][1] + self.assertEqual("en", normalize_settings["lemmagen_language"]) + self.assertEqual("en", normalize_settings["snowball_language"]) + self.assertEqual("en", normalize_settings["udpipe_language"]) + self.assertEqual("en", filter_settings["language"]) + + # language not supported by all preprocessors + self.corpus.attributes["language"] = "nl" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + normalize_settings = self.widget.storedsettings["preprocessors"][2][1] + filter_settings = self.widget.storedsettings["preprocessors"][3][1] + self.assertEqual("en", normalize_settings["lemmagen_language"]) + self.assertEqual("nl", normalize_settings["snowball_language"]) + self.assertEqual("en", normalize_settings["udpipe_language"]) + self.assertEqual("nl", filter_settings["language"]) + + def test_language_from_schema(self): + """Test language from schema/workflow is retained""" + initial = { + "name": "", + "preprocessors": [ + ("preprocess.transform", {}), + ("preprocess.tokenize", {}), + ( + "preprocess.normalize", + { + "lemmagen_language": "sl", + 
"snowball_language": "nl", + "udpipe_language": "lt", + }, + ), + ("preprocess.filter", {"language": "nl"}), + ], + } + self.widget.storedsettings = initial + + settings = self.widget.settingsHandler.pack_data(self.widget) + widget = self.create_widget(OWPreprocess, stored_settings=settings) + self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget) + self.assertDictEqual(initial, widget.storedsettings) + @patch(SF_LIST, new=Mock(return_value=SERVER_FILES)) class TestOWPreprocessMigrateSettings(WidgetTest): @@ -165,8 +229,8 @@ def test_migrate_settings_normalize(self): "udpipe_tokenizer": True}} widget = self.create_widget(OWPreprocess, stored_settings=settings) params = [("preprocess.normalize", - {"method": 2, "snowball_language": "French", - "udpipe_language": "German", "udpipe_tokenizer": True})] + {"method": 2, "snowball_language": "fr", + "udpipe_language": "de", "udpipe_tokenizer": True})] self.assertEqual(widget.storedsettings["preprocessors"], params) def test_migrate_settings_filter(self): @@ -180,7 +244,7 @@ def test_migrate_settings_filter(self): "use_df": False, "use_keep_n": False}} widget = self.create_widget(OWPreprocess, stored_settings=settings) params = [("preprocess.filter", - {"methods": [0, 2, 4], "language": "Finnish", + {"methods": [0, 2, 4], "language": "fi", "sw_path": None, "sw_list": [], "lx_path": None, "lx_list": [], "pattern": "foo", "rel_start": 0.3, @@ -231,6 +295,52 @@ def test_migrate_settings(self): } self.create_widget(OWPreprocess, stored_settings=settings) + def test_migrate_language_settings(self): + """Test migration to iso langauge codes""" + settings = { + "__version__": 3, + "storedsettings": { + "preprocessors": [ + ( + "preprocess.normalize", + { + "snowball_language": "French", + "udpipe_language": "German", + "lemmagen_language": "Slovenian", + }, + ), + ("preprocess.filter", {"language": "Finnish"}), + ] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + normalize_settings = 
widget.storedsettings["preprocessors"][0][1] + filter_settings = widget.storedsettings["preprocessors"][1][1] + self.assertEqual("sl", normalize_settings["lemmagen_language"]) + self.assertEqual("fr", normalize_settings["snowball_language"]) + self.assertEqual("de", normalize_settings["udpipe_language"]) + self.assertEqual("fi", filter_settings["language"]) + + settings = { + "__version__": 3, + "storedsettings": { + "preprocessors": [ + ( + "preprocess.normalize", + { + "snowball_language": "French", + "lemmagen_language": "Slovenian", + }, + ), + ("preprocess.filter", {}), + ] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + normalize_settings = widget.storedsettings["preprocessors"][0][1] + self.assertEqual("sl", normalize_settings["lemmagen_language"]) + self.assertEqual("fr", normalize_settings["snowball_language"]) + class TestTransformationModule(WidgetTest): def setUp(self): @@ -371,17 +481,17 @@ def test_init(self): def test_parameters(self): params = {"method": NormalizationModule.Porter, - "snowball_language": "English", - "udpipe_language": "English", - "lemmagen_language": "English", + "snowball_language": "en", + "udpipe_language": "en", + "lemmagen_language": "en", "udpipe_tokenizer": False} self.assertDictEqual(self.editor.parameters(), params) def test_set_parameters(self): params = {"method": NormalizationModule.UDPipe, - "snowball_language": "Dutch", - "udpipe_language": "Slovenian", - "lemmagen_language": "Bulgarian", + "snowball_language": "nl", + "udpipe_language": "sl", + "lemmagen_language": "bg", "udpipe_tokenizer": True} self.editor.setParameters(params) self.assertDictEqual(self.editor.parameters(), params) @@ -397,20 +507,19 @@ def test_createinstance(self): params = {"method": NormalizationModule.Snowball} pp = self.editor.createinstance(params) self.assertIsInstance(pp, SnowballStemmer) - self.assertIn("", str(pp.normalizer)) + self.assertEqual("en", pp._language) - params = {"method": 
NormalizationModule.Snowball, - "snowball_language": "Dutch"} + params = {"method": NormalizationModule.Snowball, "snowball_language": "nl"} pp = self.editor.createinstance(params) self.assertIsInstance(pp, SnowballStemmer) - self.assertIn("", str(pp.normalizer)) + self.assertEqual("nl", pp._language) params = {"method": NormalizationModule.UDPipe, - "udpipe_language": "Finnish", + "udpipe_language": "fi", "udpipe_tokenizer": True} pp = self.editor.createinstance(params) self.assertIsInstance(pp, UDPipeLemmatizer) - self.assertEqual(pp._UDPipeLemmatizer__language, "Finnish") + self.assertEqual(pp._language, "fi") self.assertEqual(pp._UDPipeLemmatizer__use_tokenizer, True) def test_repr(self): @@ -491,7 +600,7 @@ def test_init(self): def test_parameters(self): params = {"methods": [FilteringModule.Stopwords], - "language": "English", "sw_path": None, "lx_path": None, + "language": "en", "sw_path": None, "lx_path": None, "sw_list": [], "lx_list": [], "incl_num": False, "pattern": FilteringModule.DEFAULT_PATTERN, @@ -506,7 +615,7 @@ def test_set_parameters(self): sw_path = RecentPath.create("Foo", []) lx_path = RecentPath.create("Bar", []) params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp], - "language": "Finnish", + "language": "fi", "sw_path": sw_path, "lx_path": lx_path, "sw_list": [sw_path], "lx_list": [lx_path], "incl_num": False, @@ -551,7 +660,7 @@ def test_createinstance(self): def test_repr(self): self.assertEqual(str(self.editor), - "Stopwords (Language: English, File: None)") + "Stopwords (Language: en, File: None)") params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp]} self.editor.setParameters(params) @@ -646,10 +755,6 @@ def test_createinstance(self): pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt}) self.assertIsInstance(pp, MaxEntTagger) - # TODO - implement StanfordPOSTagger - # pp = self.editor.createinstance({"method": POSTaggingModule.Stanford}) - # self.assertIsInstance(pp, 
StanfordPOSTagger) - def test_repr(self): self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")