From d22d1c55e74be33f874cc6ddd933df5c18461945 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 15:33:34 +0200
Subject: [PATCH] Preprocess widget - Language from corpus
---
orangecontrib/text/widgets/owpreprocess.py | 95 ++++++++++---
.../text/widgets/tests/test_owpreprocess.py | 133 +++++++++++++++++-
2 files changed, 208 insertions(+), 20 deletions(-)
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index 8f939612b..592c85c85 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -12,6 +12,8 @@
from AnyQt.QtGui import QBrush, QValidator
from Orange.util import wrap_callback
+from orangecanvas.gui.utils import disconnected
+from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath
import Orange.widgets.data.owpreprocess
@@ -113,7 +115,8 @@ def set_current_language(self, iso_language: Optional[str]):
The ISO language code of element to be selected.
"""
index = self.findData(iso_language)
- self.setCurrentIndex(index)
+ if index >= 0:
+ self.setCurrentIndex(index)
class UDPipeComboBox(LanguageComboBox):
@@ -131,15 +134,9 @@ def items(self) -> List:
def add_items(self, _, include_none: bool, language: str):
self.__items = self.items
super().add_items(self.__items, include_none, language)
-
- def set_current_language(self, iso_language: Optional[str]):
iso_items = {iso for _, iso in self.__items}
- if iso_language in iso_items:
- super().set_current_language(iso_language)
- elif self.__default_lang in iso_items:
+ if language not in iso_items and self.__default_lang in iso_items:
super().set_current_language(self.__default_lang)
- elif self.__items:
- self.setCurrentIndex(0)
def showPopup(self):
if self.__items != self.items:
@@ -538,13 +535,13 @@ def __enable_udpipe(self):
def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
- self.__set_snowball_lang(snowball_lang)
+ self.__combo_sbl.set_current_language(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
- self.__set_udpipe_lang(udpipe_lang)
+ self.__combo_udl.set_current_language(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
lemmagen_lang = params.get("lemmagen_language", self.DEFAULT_LANGUAGE)
- self.__set_lemmagen_lang(lemmagen_lang)
+ self.__combo_lemm.set_current_language(lemmagen_lang)
def _set_method(self, method: int):
super()._set_method(method)
@@ -553,7 +550,6 @@ def _set_method(self, method: int):
def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
- self.__combo_sbl.set_current_language(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()
@@ -561,7 +557,6 @@ def __set_snowball_lang(self, language: str):
def __set_udpipe_lang(self, language: str):
if self.__udpipe_lang != language:
self.__udpipe_lang = language
- self.__combo_udl.set_current_language(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()
@@ -569,7 +564,6 @@ def __set_udpipe_lang(self, language: str):
def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
- self.__combo_lemm.set_current_language(language)
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()
@@ -795,7 +789,7 @@ def __spin_n_edited(self):
def setParameters(self, params: Dict):
super().setParameters(params)
- self.__set_language(params.get("language", self.DEFAULT_LANG))
+ self.__combo.set_current_language(params.get("language", self.DEFAULT_LANG))
self.__set_sw_path(params.get("sw_path", self.DEFAULT_NONE),
params.get("sw_list", []))
self.__set_lx_path(params.get("lx_path", self.DEFAULT_NONE),
@@ -820,7 +814,6 @@ def setParameters(self, params: Dict):
def __set_language(self, language: Optional[str]):
if self.__sw_lang != language:
self.__sw_lang = language
- self.__combo.set_current_language(language)
self.changed.emit()
if self.Stopwords in self.methods:
self.edited.emit()
@@ -1045,6 +1038,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()
+class PreprocessSettingsHandler(SettingsHandler):
+ """
+ Settings handler that treats all language settings, which are stored
+ as part of the common preprocess settings, as schema_only. It removes
+ them when settings are loaded from common settings instead of a schema.
+ """
+ def _remove_schema_only(self, settings_dict):
+ super()._remove_schema_only(settings_dict)
+ for setting, data, _ in self.provider.traverse_settings(data=settings_dict):
+ for pp_name, settings in data["storedsettings"]["preprocessors"]:
+ for key in list(settings):
+ if "language" in key:
+ settings.pop(key)
+
+
PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
@@ -1128,12 +1136,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
+ settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical
def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
+ self.__store_pending_languages()
box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
@@ -1151,6 +1161,16 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)
+ def set_model(self, pmodel):
+ """Connect the signal that handles setting the language from the corpus"""
+ super().set_model(pmodel)
+ if pmodel:
+ pmodel.rowsInserted.connect(self.__on_item_inserted)
+
+ def __on_item_inserted(self, _, first: int, last: int):
+ assert first == last
+ self.__set_languages_single_editor(first)
+
def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
@@ -1180,6 +1200,49 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
def set_data(self, data: Corpus):
self.cancel()
self.data = data
+ self.__set_languages()
+
+ LANG_PARAMS = {
+ "preprocess.normalize": [
+ "snowball_language",
+ "udpipe_language",
+ "lemmagen_language"
+ ],
+ "preprocess.filter": ["language"],
+ }
+
+ def __store_pending_languages(self):
+ settings = self.storedsettings["preprocessors"]
+ self.__pending_languages = {
+ pp_name: {p for p in par if "language" in p} for pp_name, par in settings
+ }
+
+ def __set_languages(self):
+ if self.data is not None:
+ for i in range(self.preprocessormodel.rowCount()):
+ self.__set_languages_single_editor(i)
+ self.__pending_languages = {}
+
+ def __set_languages_single_editor(self, item_index: int):
+ """
+ Set language from corpus for single editor/module,
+ keep language unchanged if it comes from schema (pending).
+ """
+ if self.data and self.data.language:
+ model = self.preprocessormodel
+ item = model.item(item_index)
+ pp_name = item.data(DescriptionRole).qualname
+ params = item.data(ParametersRole)
+ pending = self.__pending_languages.get(pp_name, set())
+ for param in self.LANG_PARAMS.get(pp_name, []):
+ if param not in pending:
+ # set language only if not pending - a pending parameter
+ # came from the schema and should not be changed
+ params[param] = self.data.language
+ with disconnected(model.dataChanged, self.__on_modelchanged):
+ # disconnecting prevents a double apply call; apply is already
+ # triggered by both callers: on new data and on row insertion
+ item.setData(params, ParametersRole)
def buildpreproc(self) -> PreprocessorList:
plist = []
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index 49ccb1229..e5e979e29 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -2,7 +2,9 @@
from unittest.mock import patch, PropertyMock, MagicMock, Mock
import numpy as np
+from AnyQt.QtGui import QStandardItem, QIcon
from Orange.data import Domain, StringVariable
+from Orange.widgets.data.utils.preprocess import DescriptionRole, ParametersRole
from orangewidget.utils.filedialogs import RecentPath
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate
@@ -180,6 +182,123 @@ def test_no_tokens_left(self):
self.wait_until_finished()
self.assertFalse(self.widget.Warning.no_token_left.is_shown())
+ def test_language_from_corpus(self):
+ """Test language from corpus is set correctly"""
+ initial = {
+ "name": "",
+ "preprocessors": [("preprocess.normalize", {}), ("preprocess.filter", {})],
+ }
+ self.widget.storedsettings = initial
+ self.widget._initialize()
+ self.assertDictEqual(initial, self.widget.storedsettings)
+ combos = self.widget.mainArea.findChildren(LanguageComboBox)
+ self.assertEqual(
+ ["English", "English", "English", "English"],
+ [c.currentText() for c in combos]
+ )
+
+ # test with Slovenian - language should be set for all preprocessors
+ # except Snowball, which doesn't support Slovenian
+ self.corpus.attributes["language"] = "sl"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["English", "Slovenian", "Slovenian", "Slovenian"],
+ [c.currentText() for c in combos]
+ )
+
+ # test with Dutch, which is supported by two preprocessors
+ self.corpus.attributes["language"] = "nl"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["Dutch", "Slovenian", "Slovenian", "Dutch"],
+ [c.currentText() for c in combos]
+ )
+
+ # language not supported by any preprocessor - language shouldn't change
+ self.corpus.attributes["language"] = "bo"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["Dutch", "Slovenian", "Slovenian", "Dutch"],
+ [c.currentText() for c in combos]
+ )
+
+ # test with missing language - language shouldn't change
+ self.corpus.attributes["language"] = None
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["Dutch", "Slovenian", "Slovenian", "Dutch"],
+ [c.currentText() for c in combos]
+ )
+
+ def test_language_from_schema(self):
+ """Test language from schema/workflow is retained"""
+ initial = {
+ "name": "",
+ "preprocessors": [
+ (
+ "preprocess.normalize",
+ {
+ "lemmagen_language": "sl",
+ "snowball_language": "nl",
+ "udpipe_language": "lt",
+ },
+ ),
+ ("preprocess.filter", {"language": "nl"}),
+ ],
+ }
+ self.widget.storedsettings = initial
+
+ settings = self.widget.settingsHandler.pack_data(self.widget)
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
+ self.assertDictEqual(initial, widget.storedsettings)
+ combos = widget.mainArea.findChildren(LanguageComboBox)
+ self.assertEqual(
+ ["Dutch", "Lithuanian", "Slovenian", "Dutch"],
+ [c.currentText() for c in combos]
+ )
+
+ def test_language_from_corpus_editor_inserted(self):
+ """Test language from corpus is set to new editor too"""
+ initial = {
+ "name": "",
+ "preprocessors": [("preprocess.filter", {})],
+ }
+ self.widget.storedsettings = initial
+ self.widget._initialize()
+ self.assertDictEqual(initial, self.widget.storedsettings)
+ combos = self.widget.mainArea.findChildren(LanguageComboBox)
+ self.assertEqual(
+ ["English"],
+ [c.currentText() for c in combos]
+ )
+
+ # insert data - language of stopwords combo should change to Slovenian
+ self.corpus.attributes["language"] = "sl"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["Slovenian"],
+ [c.currentText() for c in combos]
+ )
+
+ # insert new editor - all languages except snowball should be set to Slovenian
+ pp_def = self.widget._qname2ppdef["preprocess.normalize"]
+ description = pp_def.description
+ item = QStandardItem(description.title)
+ icon = QIcon(description.icon)
+ item.setIcon(icon)
+ item.setToolTip(description.summary)
+ item.setData(pp_def, DescriptionRole)
+ item.setData({}, ParametersRole)
+ self.widget.preprocessormodel.insertRow(0, [item])
+ self.wait_until_finished()
+
+ combos = self.widget.mainArea.findChildren(LanguageComboBox)
+ self.assertEqual(
+ ['Slovenian', 'English', 'Slovenian', 'Slovenian'],
+ [c.currentText() for c in combos]
+ )
+
@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):
@@ -983,14 +1102,20 @@ def test_set_current_language(self):
self.assertEqual("Portuguese", cb.currentText())
cb.set_current_language("sl")
self.assertEqual("Slovenian", cb.currentText())
- cb.set_current_language("abc") # should set to default
- self.assertEqual("English", cb.currentText())
+ cb.set_current_language("abc") # language not in list - keep current selection
+ self.assertEqual("Slovenian", cb.currentText())
+
+ def test_set_language_to_default(self):
+ """Set language to default when current item is no longer in dropdown"""
+ mock = Mock()
+ cb = UDPipeComboBox(None, "pt", "en", mock)
+ self.assertEqual("Portuguese", cb.currentText())
# when no default language in the dropdown set to first
cb.removeItem(0)
x = cb._UDPipeComboBox__items
cb._UDPipeComboBox__items = x[:3] + x[4:]
- cb.set_current_language("abc")
- self.assertEqual("English (lines)", cb.currentText())
+ cb.showPopup()
+ self.assertEqual("English", cb.currentText())
def test_change_item(self):
mock = Mock()