From a2a84662e9ec4b22ce732d3f8a3c8d77954bedd0 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 15:33:34 +0200
Subject: [PATCH] Preprocess widget - Language from corpus
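
Preprocessors now take their language from the input corpus: when a corpus
arrives, each preprocessor's language setting is set to the corpus language
if that preprocessor supports it. Languages explicitly stored in a workflow
(schema) are kept; to that end all language settings become schema-only and
are dropped from the common (default) settings. Stored settings are migrated
from full language names to ISO codes (settings_version 3 -> 4), roughly:

    # before (v3): {"snowball_language": "French", "language": "Finnish"}
    # after  (v4): {"snowball_language": "fr", "language": "fi"}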
---
orangecontrib/text/keywords/__init__.py | 1 +
orangecontrib/text/language.py | 3 +
orangecontrib/text/preprocess/filter.py | 29 +--
orangecontrib/text/preprocess/normalize.py | 4 +-
orangecontrib/text/widgets/owpreprocess.py | 171 +++++++++++++++---
.../text/widgets/tests/test_owpreprocess.py | 149 ++++++++++++---
6 files changed, 292 insertions(+), 65 deletions(-)
diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py
index 16a9527dd..bdfb44bdd 100644
--- a/orangecontrib/text/keywords/__init__.py
+++ b/orangecontrib/text/keywords/__init__.py
@@ -20,6 +20,7 @@
# all available languages for RAKE
from orangecontrib.text.vectorization import BowVectorizer
+# TODO: supported_languages now returns ISO codes; check RAKE language handling
RAKE_LANGUAGES = StopwordsFilter.supported_languages()
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
diff --git a/orangecontrib/text/language.py b/orangecontrib/text/language.py
index 189d49da7..f4f8af28e 100644
--- a/orangecontrib/text/language.py
+++ b/orangecontrib/text/language.py
@@ -46,6 +46,9 @@
"he": "Hebrew",
"hi": "Hindi",
"hi-Latn": "Hindi (latin)",
+ # https://en.wikipedia.org/wiki/Hinglish - since it doesn't really have an ISO
+ # code, we made one up to be able to use it for stopwords (supported in NLTK)
+ "hi_eng": "Hinglish",
"hr": "Croatian",
"ht": "Haitian",
"hu": "Hungarian",
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 8adf988bd..25c98466d 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -11,7 +11,7 @@
from Orange.util import wrap_callback, dummy_callback
from orangecontrib.text import Corpus
-from orangecontrib.text.language import ISO2LANG
+from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor
@@ -87,7 +87,8 @@ class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
name = 'Stopwords'
# nltk uses different language names for some languages
- nltk_mapping = {"Slovenian": "Slovene"}
+ nltk_mapping = {"Slovene": "Slovenian"}
+ nltk_mapping_inv = {v: k for k, v in nltk_mapping.items()}
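+ # nltk_mapping maps NLTK's language name to the name used in ISO2LANG;
+ # nltk_mapping_inv maps our name back to NLTK's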
def __init__(
self,
@@ -103,12 +104,13 @@ def __init__(
@wait_nltk_data
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
- # use language set in init if not None and Corpus's language otherwise
- la = ISO2LANG[self.__language or corpus.language]
- la = self.nltk_mapping.get(la, la)
if self.__use_default_stopwords:
- if la in self.supported_languages():
- self.__stopwords = set(x.strip() for x in stopwords.words(la.lower()))
+ # use the language set in init if not None, otherwise the corpus's language
+ lang = self.__language or corpus.language
+ if lang in self.supported_languages():
+ lang = ISO2LANG[lang]
+ lang = self.nltk_mapping_inv.get(lang, lang).lower()
+ self.__stopwords = set(x.strip() for x in stopwords.words(lang))
else:
raise ValueError(
"The stopwords filter does not support the Corpus's or "
@@ -120,15 +122,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
@wait_nltk_data
def supported_languages():
# get NLTK list of stopwords
- stopwords_listdir = []
try:
- stopwords_listdir = [file for file in
- os.listdir(stopwords._get_root())
- if file.islower()]
+ stopwords_listdir = [
+ file for file in os.listdir(stopwords._get_root()) if file.islower()
+ ]
except LookupError: # when no NLTK data is available
- pass
+ stopwords_listdir = []
- return sorted(file.capitalize() for file in stopwords_listdir)
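+ # map NLTK's capitalized name back to an ISO code (e.g. "Slovene" -> "sl")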
+ def to_iso(lang):
+ return LANG2ISO[StopwordsFilter.nltk_mapping.get(lang, lang)]
+ return {to_iso(file.capitalize()) for file in stopwords_listdir}
def _check(self, token):
return token not in self.__stopwords and token not in self._lexicon
diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py
index f8ab7f835..d6acc1ba8 100644
--- a/orangecontrib/text/preprocess/normalize.py
+++ b/orangecontrib/text/preprocess/normalize.py
@@ -155,10 +155,12 @@ def online(self):
except ConnectionError:
return False
+ # TODO: clean up
# use _ since - is already used in the ISO standard
VARIATION_DELIMITER = "_"
- def language_to_iso(self, language):
+ @staticmethod
+ def language_to_iso(language):
if "(" in language:
language, model = language.split("(")
language = LANG2ISO[language.strip()]
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index adbb4d024..4b83931d0 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -1,10 +1,11 @@
-from typing import Dict, Optional, List, Callable, Tuple, Type, Union
+from collections import defaultdict
+from typing import Dict, Optional, List, Callable, Tuple, Type, Union, Iterable
from types import SimpleNamespace
import os
import random
import pkg_resources
from AnyQt.QtCore import Qt, pyqtSignal
from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \
QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \
QDoubleSpinBox, QFileDialog, QAbstractSpinBox
@@ -12,6 +13,8 @@
from AnyQt.QtGui import QBrush, QValidator
from Orange.util import wrap_callback
+from orangecanvas.gui.utils import disconnected
+from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath
import Orange.widgets.data.owpreprocess
@@ -24,12 +27,12 @@
from Orange.widgets.widget import Input, Output, Msg, Message
from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
-from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
+from orangecontrib.text.preprocess.normalize import UDPipeStopIteration, UDPipeModels
from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \
POSTagger
-from orangecontrib.text.tag.pos import StanfordPOSTaggerError
_DEFAULT_NONE = "(none)"
@@ -54,14 +57,25 @@ def __validate(self):
self.setStyleSheet(f"QLineEdit {{ border : 1px solid {color};}}")
-class ComboBox(QComboBox):
- def __init__(self, master: BaseEditor, items: List[str], value: str,
+class LanguageComboBox(QComboBox):
+ def __init__(self, master: BaseEditor, items: Iterable[Optional[str]], value: str,
callback: Callable):
super().__init__(master)
self.setMinimumWidth(80)
- self.addItems(items)
- self.setCurrentText(value)
- self.currentTextChanged.connect(callback)
+ self.__add_items(items)
+ self.set_current_language(value)
+ self.currentIndexChanged.connect(self.__index_changed)
+ self.callback = callback
+
+ def __add_items(self, items: Iterable[Optional[str]]):
+ for itm in sorted(items, key=lambda x: ISO2LANG.get(x) or ""):
+ self.addItem(ISO2LANG[itm] if itm else _DEFAULT_NONE, itm)
+
+ def __index_changed(self, index: int):
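+ # the item's data holds the ISO code (None for the "(none)" entry)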
+ self.callback(self.itemData(index))
+
+ def set_current_language(self, iso_language: Optional[str]):
+ self.setCurrentText(ISO2LANG[iso_language] if iso_language else _DEFAULT_NONE)
class UDPipeComboBox(QComboBox):
@@ -70,8 +84,9 @@ def __init__(self, master: BaseEditor, value: str, default: str,
super().__init__(master)
self.__items = [] # type: List
self.__default_lang = default
- self.add_items(value)
- self.currentTextChanged.connect(callback)
+ self.add_items(UDPipeModels.iso_to_language(value))
+ self.currentTextChanged.connect(self.__text_changed)
+ self.callback = callback
self.setMinimumWidth(80)
@property
@@ -94,6 +109,12 @@ def showPopup(self):
self.add_items(self.currentText())
super().showPopup()
+ def __text_changed(self, language):
+ self.callback(UDPipeLemmatizer().models.language_to_iso(language))
+
+ def set_current_language(self, iso_language: str):
+ self.setCurrentText(UDPipeModels.iso_to_language(iso_language))
+
class RangeSpins(QHBoxLayout):
SpinBox = QSpinBox
@@ -424,9 +445,12 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
- DEFAULT_LANGUAGE = "English"
+ DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False
+ SNOWBALL_LANGUAGES = SnowballStemmer.supported_languages
+ LEMMAGEN_LANGUAGES = LemmagenLemmatizer.supported_languages
+
def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
@@ -434,9 +458,11 @@ def __init__(self, parent=None, **kwargs):
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE
- self.__combo_sbl = ComboBox(
- self, SnowballStemmer.supported_languages,
- self.__snowball_lang, self.__set_snowball_lang
+ self.__combo_sbl = LanguageComboBox(
+ self,
+ self.SNOWBALL_LANGUAGES,
+ self.__snowball_lang,
+ self.__set_snowball_lang,
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_LANGUAGE,
@@ -445,9 +471,11 @@ def __init__(self, parent=None, **kwargs):
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
- self.__combo_lemm = ComboBox(
- self, LemmagenLemmatizer.lemmagen_languages,
- self.__lemmagen_lang, self.__set_lemmagen_lang
+ self.__combo_lemm = LanguageComboBox(
+ self,
+ self.LEMMAGEN_LANGUAGES,
+ self.__lemmagen_lang,
+ self.__set_lemmagen_lang,
)
label = QLabel("Language:")
@@ -495,7 +523,7 @@ def _set_method(self, method: int):
def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
- self.__combo_sbl.setCurrentText(language)
+ self.__combo_sbl.set_current_language(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()
@@ -503,7 +531,7 @@ def __set_snowball_lang(self, language: str):
def __set_udpipe_lang(self, language: str):
if self.__udpipe_lang != language:
self.__udpipe_lang = language
- self.__combo_udl.setCurrentText(language)
+ self.__combo_udl.set_current_language(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()
@@ -511,7 +539,7 @@ def __set_udpipe_lang(self, language: str):
def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
- self.__combo_lemm.setCurrentText(language)
+ self.__combo_lemm.set_current_language(language)
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()
@@ -571,7 +599,7 @@ class FilteringModule(MultipleMethodModule):
MostFreq: MostFrequentTokensFilter,
PosTag: PosTagFilter}
DEFAULT_METHODS = [Stopwords]
- DEFAULT_LANG = "English"
+ DEFAULT_LANG = "en"
DEFAULT_NONE = None
DEFAULT_INCL_NUM = False
DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \
@@ -582,6 +610,8 @@ class FilteringModule(MultipleMethodModule):
DEFAULT_N_TOKEN = 100
DEFAULT_POS_TAGS = "NOUN,VERB"
+ STOP_WORDS_LANGUAGES = StopwordsFilter.supported_languages()
+
def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__sw_lang = self.DEFAULT_LANG
@@ -598,9 +628,11 @@ def __init__(self, parent=None, **kwargs):
self.__pos_tag = self.DEFAULT_POS_TAGS
self.__invalidated = False
- self.__combo = ComboBox(
- self, [_DEFAULT_NONE] + StopwordsFilter.supported_languages(),
- self.__sw_lang, self.__set_language
+ self.__combo = LanguageComboBox(
+ self,
+ [None] + list(self.STOP_WORDS_LANGUAGES),
+ self.__sw_lang,
+ self.__set_language,
)
self.__sw_loader = FileLoader()
self.__sw_loader.set_file_list()
@@ -759,7 +791,7 @@ def setParameters(self, params: Dict):
def __set_language(self, language: str):
if self.__sw_lang != language:
self.__sw_lang = language
- self.__combo.setCurrentText(language)
+ self.__combo.set_current_language(language)
self.changed.emit()
if self.Stopwords in self.methods:
self.edited.emit()
@@ -984,6 +1016,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()
+class PreprocessSettingsHandler(SettingsHandler):
+ """
+ A bit modified settings handler, that makes all language settings, which are
+ a part of common preprocess settings, schema_only. It removes them when
+ settings are not loaded from schema but from common settings.
+ """
+ def _remove_schema_only(self, settings_dict):
+ super()._remove_schema_only(settings_dict)
+ for setting, data, _ in self.provider.traverse_settings(data=settings_dict):
+ for pp_name, settings in data["storedsettings"]["preprocessors"]:
+ for key in list(settings):
+ if "language" in key:
+ settings.pop(key)
+
+
PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
@@ -1027,7 +1074,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess,
priority = 200
keywords = "preprocess text, text"
- settings_version = 3
+ settings_version = 4
class Inputs:
corpus = Input("Corpus", Corpus)
@@ -1068,12 +1115,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
+ settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical
def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
+ self.__store_pending_languages()
box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
@@ -1091,6 +1140,12 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)
+ def set_model(self, pmodel):
+ if pmodel:
+ pmodel.rowsInserted.connect(self.__on_item_inserted)
+ super().set_model(pmodel)
+
def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
@@ -1116,10 +1171,56 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
search_paths, **kwargs)
return path
+ def __on_item_inserted(self, _, first: int, last: int):
+ assert first == last
+ self.__set_languages_single_item(first)
+ self.storedsettings = self.save(self.preprocessormodel)
+
@Inputs.corpus
def set_data(self, data: Corpus):
self.cancel()
self.data = data
+ self.__set_languages()
+
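+ # preprocessor qualname -> [(language setting name, supported ISO codes)];
+ # the second element may be a callable returning the supported codes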
+ LANG_PARAMS = {
+ "preprocess.normalize": [
+ ("snowball_language", NormalizationModule.SNOWBALL_LANGUAGES),
+ ("udpipe_language", UDPipeLemmatizer().models.supported_languages_iso),
+ ("lemmagen_language", NormalizationModule.LEMMAGEN_LANGUAGES),
+ ],
+ "preprocess.filter": [("language", FilteringModule.STOP_WORDS_LANGUAGES)],
+ }
+
+ def __store_pending_languages(self):
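+ # languages restored from a schema are pending: they take precedence
+ # over the corpus language when the corpus arrives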
+ self.__pending_languages = defaultdict(dict)
+ for pp_name, params in self.storedsettings["preprocessors"]:
+ for p, _ in self.LANG_PARAMS.get(pp_name, []):
+ if p in params:
+ self.__pending_languages[pp_name][p] = params[p]
+
+ def __set_languages(self):
+ if self.data is not None:
+ for i in range(self.preprocessormodel.rowCount()):
+ self.__set_languages_single_item(i)
+ self.__pending_languages = {}
+ self.storedsettings = self.save(self.preprocessormodel)
+
+ def __set_languages_single_item(self, item_index: int):
+ item = self.preprocessormodel.item(item_index)
+ pp_name = item.data(DescriptionRole).qualname
+ params = item.data(ParametersRole)
+ pending = self.__pending_languages.get(pp_name, {})
+ for param, sup_lang in self.LANG_PARAMS.get(pp_name, []):
+ if param in pending:
+ params[param] = pending[param]
+ else:
+ sup_lang = sup_lang() if callable(sup_lang) else sup_lang
+ if self.data.language and self.data.language in sup_lang:
+ params[param] = self.data.language
+ with disconnected(self.preprocessormodel.dataChanged, self.__on_modelchanged):
+ # dataChanged must be disconnected to prevent a double apply call;
+ # both callers of this method invoke apply afterwards
+ item.setData(params, ParametersRole)
def buildpreproc(self) -> PreprocessorList:
plist = []
@@ -1162,8 +1263,6 @@ def apply(self):
self.Error.file_not_found()
except UnicodeError as e:
self.Error.invalid_encoding(e)
- except StanfordPOSTaggerError as e:
- self.Error.stanford_tagger(e)
except Exception as e:
self.Error.unknown_error(str(e))
@@ -1324,6 +1423,20 @@ def str_into_paths(label):
del pp_settings["start"]
del pp_settings["end"]
+ # before version 4 languages were saved as full-word language strings
+ if version < 4:
+ preprocessors = settings["storedsettings"]["preprocessors"]
+ for pp_name, pp in preprocessors:
+ if pp_name == "preprocess.normalize":
+ for k in ("snowball_language", "lemmagen_language"):
+ if k in pp:
+ pp[k] = LANG2ISO[pp[k]]
+ up_lang = "udpipe_language"
+ if up_lang in pp:
+ pp[up_lang] = UDPipeModels.language_to_iso(pp[up_lang])
+ if pp_name == "preprocess.filter" and "language" in pp:
+ pp["language"] = LANG2ISO[pp["language"]]
+
if __name__ == "__main__":
from Orange.widgets.utils.widgetpreview import WidgetPreview
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index d1e47f2a3..98a6929ae 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -170,6 +170,70 @@ def test_no_tokens_left(self):
self.wait_until_finished()
self.assertFalse(self.widget.Warning.no_token_left.is_shown())
+ def test_language_from_corpus(self):
+ """Languege from corpus is set correctly"""
+ initial = {
+ "name": "",
+ "preprocessors": [
+ ("preprocess.transform", {}),
+ ("preprocess.tokenize", {}),
+ ("preprocess.normalize", {}),
+ ("preprocess.filter", {}),
+ ],
+ }
+ self.widget.storedsettings = initial
+ self.widget._initialize()
+ self.assertDictEqual(initial, self.widget.storedsettings)
+
+ self.corpus.attributes["language"] = None
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ # nothing should change since language is missing in corpus
+ self.assertDictEqual(initial, self.widget.storedsettings)
+
+ self.corpus.attributes["language"] = "en"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ normalize_settings = self.widget.storedsettings["preprocessors"][2][1]
+ filter_settings = self.widget.storedsettings["preprocessors"][3][1]
+ self.assertEqual("en", normalize_settings["lemmagen_language"])
+ self.assertEqual("en", normalize_settings["snowball_language"])
+ self.assertEqual("en", normalize_settings["udpipe_language"])
+ self.assertEqual("en", filter_settings["language"])
+
+ # language not supported by all preprocessors
+ self.corpus.attributes["language"] = "nl"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ normalize_settings = self.widget.storedsettings["preprocessors"][2][1]
+ filter_settings = self.widget.storedsettings["preprocessors"][3][1]
+ self.assertEqual("en", normalize_settings["lemmagen_language"])
+ self.assertEqual("nl", normalize_settings["snowball_language"])
+ self.assertEqual("en", normalize_settings["udpipe_language"])
+ self.assertEqual("nl", filter_settings["language"])
+
+ def test_language_from_schema(self):
+ """Test language from schema/workflow is retained"""
+ initial = {
+ "name": "",
+ "preprocessors": [
+ ("preprocess.transform", {}),
+ ("preprocess.tokenize", {}),
+ (
+ "preprocess.normalize",
+ {
+ "lemmagen_language": "sl",
+ "snowball_language": "nl",
+ "udpipe_language": "lt",
+ },
+ ),
+ ("preprocess.filter", {"language": "nl"}),
+ ],
+ }
+ self.widget.storedsettings = initial
+
+ settings = self.widget.settingsHandler.pack_data(self.widget)
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
+ self.assertDictEqual(initial, widget.storedsettings)
+
@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):
@@ -196,8 +260,8 @@ def test_migrate_settings_normalize(self):
"udpipe_tokenizer": True}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.normalize",
- {"method": 2, "snowball_language": "French",
- "udpipe_language": "German", "udpipe_tokenizer": True})]
+ {"method": 2, "snowball_language": "fr",
+ "udpipe_language": "de", "udpipe_tokenizer": True})]
self.assertEqual(widget.storedsettings["preprocessors"], params)
def test_migrate_settings_filter(self):
@@ -211,7 +275,7 @@ def test_migrate_settings_filter(self):
"use_df": False, "use_keep_n": False}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.filter",
- {"methods": [0, 2, 4], "language": "Finnish",
+ {"methods": [0, 2, 4], "language": "fi",
"sw_path": None, "sw_list": [],
"lx_path": None, "lx_list": [],
"pattern": "foo", "rel_start": 0.3,
@@ -262,6 +326,52 @@ def test_migrate_settings(self):
}
self.create_widget(OWPreprocess, stored_settings=settings)
+ def test_migrate_language_settings(self):
+ """Test migration to iso langauge codes"""
+ settings = {
+ "__version__": 3,
+ "storedsettings": {
+ "preprocessors": [
+ (
+ "preprocess.normalize",
+ {
+ "snowball_language": "French",
+ "udpipe_language": "German",
+ "lemmagen_language": "Slovenian",
+ },
+ ),
+ ("preprocess.filter", {"language": "Finnish"}),
+ ]
+ },
+ }
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ normalize_settings = widget.storedsettings["preprocessors"][0][1]
+ filter_settings = widget.storedsettings["preprocessors"][1][1]
+ self.assertEqual("sl", normalize_settings["lemmagen_language"])
+ self.assertEqual("fr", normalize_settings["snowball_language"])
+ self.assertEqual("de", normalize_settings["udpipe_language"])
+ self.assertEqual("fi", filter_settings["language"])
+
+ settings = {
+ "__version__": 3,
+ "storedsettings": {
+ "preprocessors": [
+ (
+ "preprocess.normalize",
+ {
+ "snowball_language": "French",
+ "lemmagen_language": "Slovenian",
+ },
+ ),
+ ("preprocess.filter", {}),
+ ]
+ },
+ }
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ normalize_settings = widget.storedsettings["preprocessors"][0][1]
+ self.assertEqual("sl", normalize_settings["lemmagen_language"])
+ self.assertEqual("fr", normalize_settings["snowball_language"])
+
class TestTransformationModule(WidgetTest):
def setUp(self):
@@ -402,17 +512,17 @@ def test_init(self):
def test_parameters(self):
params = {"method": NormalizationModule.Porter,
- "snowball_language": "English",
- "udpipe_language": "English",
- "lemmagen_language": "English",
+ "snowball_language": "en",
+ "udpipe_language": "en",
+ "lemmagen_language": "en",
"udpipe_tokenizer": False}
self.assertDictEqual(self.editor.parameters(), params)
def test_set_parameters(self):
params = {"method": NormalizationModule.UDPipe,
- "snowball_language": "Dutch",
- "udpipe_language": "Slovenian",
- "lemmagen_language": "Bulgarian",
+ "snowball_language": "nl",
+ "udpipe_language": "sl",
+ "lemmagen_language": "bg",
"udpipe_tokenizer": True}
self.editor.setParameters(params)
self.assertDictEqual(self.editor.parameters(), params)
@@ -428,20 +538,19 @@ def test_createinstance(self):
params = {"method": NormalizationModule.Snowball}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
- self.assertIn("", str(pp.normalizer))
+ self.assertEqual("en", pp._language)
- params = {"method": NormalizationModule.Snowball,
- "snowball_language": "Dutch"}
+ params = {"method": NormalizationModule.Snowball, "snowball_language": "nl"}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
- self.assertIn("", str(pp.normalizer))
+ self.assertEqual("nl", pp._language)
params = {"method": NormalizationModule.UDPipe,
- "udpipe_language": "Finnish",
+ "udpipe_language": "fi",
"udpipe_tokenizer": True}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, UDPipeLemmatizer)
- self.assertEqual(pp._UDPipeLemmatizer__language, "Finnish")
+ self.assertEqual(pp._language, "fi")
self.assertEqual(pp._UDPipeLemmatizer__use_tokenizer, True)
def test_repr(self):
@@ -522,7 +631,7 @@ def test_init(self):
def test_parameters(self):
params = {"methods": [FilteringModule.Stopwords],
- "language": "English", "sw_path": None, "lx_path": None,
+ "language": "en", "sw_path": None, "lx_path": None,
"sw_list": [], "lx_list": [],
"incl_num": False,
"pattern": FilteringModule.DEFAULT_PATTERN,
@@ -537,7 +646,7 @@ def test_set_parameters(self):
sw_path = RecentPath.create("Foo", [])
lx_path = RecentPath.create("Bar", [])
params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp],
- "language": "Finnish",
+ "language": "fi",
"sw_path": sw_path, "lx_path": lx_path,
"sw_list": [sw_path], "lx_list": [lx_path],
"incl_num": False,
@@ -582,7 +691,7 @@ def test_createinstance(self):
def test_repr(self):
self.assertEqual(str(self.editor),
- "Stopwords (Language: English, File: None)")
+ "Stopwords (Language: en, File: None)")
params = {"methods": [FilteringModule.Lexicon,
FilteringModule.Regexp]}
self.editor.setParameters(params)
@@ -677,10 +786,6 @@ def test_createinstance(self):
pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt})
self.assertIsInstance(pp, MaxEntTagger)
- # TODO - implement StanfordPOSTagger
- # pp = self.editor.createinstance({"method": POSTaggingModule.Stanford})
- # self.assertIsInstance(pp, StanfordPOSTagger)
-
def test_repr(self):
self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")