From 392232cb84446da796567b46b9eb2bd4d7e19da2 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 15:33:34 +0200
Subject: [PATCH] Preprocess widget - Language from corpus
---
orangecontrib/text/widgets/owpreprocess.py | 74 +++++++++++++++++++
.../text/widgets/tests/test_owpreprocess.py | 64 ++++++++++++++++
2 files changed, 138 insertions(+)
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index 8f939612b..60f0e0a5f 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -12,6 +12,8 @@
from AnyQt.QtGui import QBrush, QValidator
from Orange.util import wrap_callback
+from orangecanvas.gui.utils import disconnected
+from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath
import Orange.widgets.data.owpreprocess
@@ -147,6 +149,9 @@ def showPopup(self):
self.add_items(None, False, self.itemData(self.currentIndex()))
super().showPopup()
+    def set_current_language(self, iso_language: str):
+        """Make the language given by its ISO code the current text.
+
+        ``iso_language`` is converted with ``UDPipeModels.iso_to_language``
+        and the result is passed to ``setCurrentText``.
+        """
+        language_name = UDPipeModels.iso_to_language(iso_language)
+        self.setCurrentText(language_name)
+
class RangeSpins(QHBoxLayout):
SpinBox = QSpinBox
@@ -1045,6 +1050,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()
+class PreprocessSettingsHandler(SettingsHandler):
+    """
+    Settings handler that treats the language settings of preprocessors as
+    schema-only.
+
+    The language parameters are part of the common preprocess settings
+    (``storedsettings``). They are removed whenever schema-only settings are
+    removed, i.e. when settings are loaded from common settings and not
+    from a schema.
+    """
+
+    def _remove_schema_only(self, settings_dict):
+        """
+        Remove schema-only settings and every preprocessor parameter whose
+        name contains "language" from ``settings_dict`` (modified in place).
+        """
+        super()._remove_schema_only(settings_dict)
+        for _, data, _ in self.provider.traverse_settings(data=settings_dict):
+            # The traversed dict may lack "storedsettings" (or its list of
+            # preprocessors); treat that as nothing to strip instead of
+            # raising KeyError.
+            stored = data.get("storedsettings") or {}
+            for _, settings in stored.get("preprocessors", []):
+                # collect the keys first: the dict is changed while cleaning
+                for key in [k for k in settings if "language" in k]:
+                    del settings[key]
+
+
PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
@@ -1128,12 +1148,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
+ settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical
def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
+ self.__store_pending_languages()
box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
@@ -1151,6 +1173,12 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)
+ def set_model(self, pmodel):
+ """
+ Connect ``pmodel.rowsInserted`` to ``__on_item_inserted`` (when
+ ``pmodel`` is truthy) and pass the model on to the parent ``set_model``.
+ """
+ if pmodel:
+ pmodel.rowsInserted.connect(self.__on_item_inserted)
+ super().set_model(pmodel)
+
+
def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
@@ -1176,10 +1204,56 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
search_paths, **kwargs)
return path
+    def __on_item_inserted(self, _, first: int, last: int):
+        """
+        Slot for the model's ``rowsInserted`` signal.
+
+        Sets the language parameters on every inserted row (``first`` to
+        ``last``, both inclusive) and then stores the state of the model in
+        ``storedsettings``.
+        """
+        # Handle the whole inserted range instead of asserting a single row:
+        # an assert is skipped under ``python -O`` and would leave the rows
+        # after ``first`` without language settings.
+        for row in range(first, last + 1):
+            self.__set_languages_single_item(row)
+        self.storedsettings = self.save(self.preprocessormodel)
+
@Inputs.corpus
def set_data(self, data: Corpus):
+ """
+ Handle the corpus input: call ``self.cancel()``, store ``data`` in
+ ``self.data`` and update the language settings of the preprocessors.
+ """
self.cancel()
self.data = data
+ self.__set_languages()
+
+ # Language parameters of preprocessors, keyed by the preprocessor's
+ # qualname. Each entry is a (parameter name, supported languages) pair.
+ # The second element may also be a callable; in that case
+ # __set_languages_single_item calls it to obtain the supported languages.
+ LANG_PARAMS = {
+ "preprocess.normalize": [
+ ("snowball_language", NormalizationModule.SNOWBALL_LANGUAGES),
+ ("udpipe_language", UDPipeLemmatizer().models.supported_languages_iso),
+ ("lemmagen_language", NormalizationModule.LEMMAGEN_LANGUAGES),
+ ],
+ "preprocess.filter": [("language", FilteringModule.STOP_WORDS_LANGUAGES)],
+ }
+
+    def __store_pending_languages(self):
+        """
+        Collect the language parameters found in ``storedsettings``.
+
+        For every stored preprocessor, the parameters named in
+        ``LANG_PARAMS`` that are present among its stored parameters are
+        copied to ``self.__pending_languages[<preprocessor name>]``.
+        """
+        pending = self.__pending_languages = defaultdict(dict)
+        for qualname, stored_params in self.storedsettings["preprocessors"]:
+            known_lang_params = self.LANG_PARAMS.get(qualname, [])
+            for name, _ in known_lang_params:
+                if name not in stored_params:
+                    continue
+                pending[qualname][name] = stored_params[name]
+
+ def __set_languages(self):
+ """
+ Set language parameters on all items of ``preprocessormodel``.
+
+ When ``self.data`` is not None, ``__set_languages_single_item`` is called
+ for each row of the model. The pending languages are reset to an empty
+ dict and ``storedsettings`` is refreshed from the model.
+ """
+ # NOTE(review): indentation is not preserved in this patch text; confirm
+ # that the last two statements belong to the body of the ``if``.
+ if self.data is not None:
+ for i in range(self.preprocessormodel.rowCount()):
+ self.__set_languages_single_item(i)
+ self.__pending_languages = {}
+ self.storedsettings = self.save(self.preprocessormodel)
+
+    def __set_languages_single_item(self, item_index: int):
+        """
+        Set the language parameters of the preprocessor at ``item_index``.
+
+        For each language parameter listed in ``LANG_PARAMS`` for this
+        preprocessor, a pending value takes precedence. Otherwise the corpus
+        language is used, provided that a corpus is present, it has a
+        language and that language is among the supported ones.
+        """
+        item = self.preprocessormodel.item(item_index)
+        pp_name = item.data(DescriptionRole).qualname
+        params = item.data(ParametersRole)
+        pending = self.__pending_languages.get(pp_name, {})
+        # __on_item_inserted calls this method without checking for a corpus
+        # and self.data may be None, so the language must be read guardedly.
+        language = self.data.language if self.data is not None else None
+        for param, sup_lang in self.LANG_PARAMS.get(pp_name, []):
+            if param in pending:
+                params[param] = pending[param]
+            elif language:
+                # supported languages may be given as a callable
+                sup_lang = sup_lang() if callable(sup_lang) else sup_lang
+                if language in sup_lang:
+                    params[param] = language
+        with disconnected(
+            self.preprocessormodel.dataChanged, self.__on_modelchanged
+        ):
+            # dataChanged must be disconnected to prevent a double apply
+            # call; both callers of this method call apply afterwards
+            item.setData(params, ParametersRole)
def buildpreproc(self) -> PreprocessorList:
plist = []
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index 49ccb1229..3a39853e7 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -180,6 +180,70 @@ def test_no_tokens_left(self):
self.wait_until_finished()
self.assertFalse(self.widget.Warning.no_token_left.is_shown())
+ def test_language_from_corpus(self):
+ """Language from corpus is set correctly"""
+ # start from preprocessors that have no language parameter stored
+ initial = {
+ "name": "",
+ "preprocessors": [
+ ("preprocess.transform", {}),
+ ("preprocess.tokenize", {}),
+ ("preprocess.normalize", {}),
+ ("preprocess.filter", {}),
+ ],
+ }
+ self.widget.storedsettings = initial
+ self.widget._initialize()
+ self.assertDictEqual(initial, self.widget.storedsettings)
+
+ self.corpus.attributes["language"] = None
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ # nothing should change since language is missing in corpus
+ self.assertDictEqual(initial, self.widget.storedsettings)
+
+ # "en" is expected to be set for every language parameter
+ self.corpus.attributes["language"] = "en"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ normalize_settings = self.widget.storedsettings["preprocessors"][2][1]
+ filter_settings = self.widget.storedsettings["preprocessors"][3][1]
+ self.assertEqual("en", normalize_settings["lemmagen_language"])
+ self.assertEqual("en", normalize_settings["snowball_language"])
+ self.assertEqual("en", normalize_settings["udpipe_language"])
+ self.assertEqual("en", filter_settings["language"])
+
+ # language not supported by all preprocessors: "nl" is expected only for
+ # snowball and the filter, lemmagen and udpipe keep the previous "en"
+ self.corpus.attributes["language"] = "nl"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ normalize_settings = self.widget.storedsettings["preprocessors"][2][1]
+ filter_settings = self.widget.storedsettings["preprocessors"][3][1]
+ self.assertEqual("en", normalize_settings["lemmagen_language"])
+ self.assertEqual("nl", normalize_settings["snowball_language"])
+ self.assertEqual("en", normalize_settings["udpipe_language"])
+ self.assertEqual("nl", filter_settings["language"])
+
+ def test_language_from_schema(self):
+ """Test language from schema/workflow is retained"""
+ # every language parameter is explicitly present in the settings
+ initial = {
+ "name": "",
+ "preprocessors": [
+ ("preprocess.transform", {}),
+ ("preprocess.tokenize", {}),
+ (
+ "preprocess.normalize",
+ {
+ "lemmagen_language": "sl",
+ "snowball_language": "nl",
+ "udpipe_language": "lt",
+ },
+ ),
+ ("preprocess.filter", {"language": "nl"}),
+ ],
+ }
+ self.widget.storedsettings = initial
+
+ # create a new widget from the packed settings and send it a corpus
+ settings = self.widget.settingsHandler.pack_data(self.widget)
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
+ # the languages from the settings must not be replaced by the corpus one
+ self.assertDictEqual(initial, widget.storedsettings)
+
@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):