From 2c16beb23c14f2442978498a0cbc2c3e47e750c3 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 15:33:34 +0200
Subject: [PATCH] Preprocess widget - Language from corpus
---
orangecontrib/text/keywords/__init__.py | 2 +-
orangecontrib/text/preprocess/filter.py | 5 +-
orangecontrib/text/preprocess/normalize.py | 4 +
orangecontrib/text/tests/test_preprocess.py | 2 +-
orangecontrib/text/widgets/owpreprocess.py | 85 ++++++++-
.../text/widgets/tests/test_owpreprocess.py | 163 +++++++++++++++++-
6 files changed, 244 insertions(+), 17 deletions(-)
diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py
index deb22c2c0..6b72bad76 100644
--- a/orangecontrib/text/keywords/__init__.py
+++ b/orangecontrib/text/keywords/__init__.py
@@ -20,7 +20,7 @@
from orangecontrib.text.vectorization import BowVectorizer
# all available languages for RAKE
-RAKE_LANGUAGES = StopwordsFilter.supported_languages()
+RAKE_LANGUAGES = StopwordsFilter.supported_languages
# all available languages for YAKE!
# fmt: off
YAKE_LANGUAGES = [
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index ae8ccfc77..74e71b0cd 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -117,9 +117,10 @@ def lang_to_iso(language: str) -> str:
"""
return LANG2ISO[StopwordsFilter.NLTK2LANG.get(language, language)]
- @staticmethod
+ @classmethod
+ @property
@wait_nltk_data
- def supported_languages() -> Set[str]:
+ def supported_languages(_) -> Set[str]:
"""
List all languages supported by NLTK
diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py
index cf58a5b5b..11d6a3ae0 100644
--- a/orangecontrib/text/preprocess/normalize.py
+++ b/orangecontrib/text/preprocess/normalize.py
@@ -122,6 +122,10 @@ def __files_to_dict(self, files: List[Tuple[str]]) -> Dict[str, Tuple[str, str]]
def supported_languages(self) -> List[Tuple[str, str]]:
return [(name, iso) for iso, (name, _) in self.model_files.items()]
+    @property
+    def supported_languages_iso(self) -> set:
+        """Return the set of ISO codes listed in ``supported_languages``."""
+        return {iso for _, iso in self.supported_languages}
+
@property
def online(self) -> bool:
try:
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index 0a91a49aa..dac743fae 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -486,7 +486,7 @@ def test_stopwords_slovene(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
def test_supported_languages(self):
- langs = preprocess.StopwordsFilter.supported_languages()
+ langs = preprocess.StopwordsFilter.supported_languages
self.assertIsInstance(langs, set)
# just testing few of most important languages since I want for test to be
# resistant for any potentially newly introduced languages by NLTK
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index fc973e33c..19265c086 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -11,6 +11,8 @@
from AnyQt.QtGui import QBrush, QValidator
from Orange.util import wrap_callback
+from orangecanvas.gui.utils import disconnected
+from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath
import Orange.widgets.data.owpreprocess
@@ -112,7 +114,8 @@ def set_current_language(self, iso_language: Optional[str]):
The ISO language code of element to be selected.
"""
index = self.findData(iso_language)
- self.setCurrentIndex(index)
+ if index >= 0:
+ self.setCurrentIndex(index)
class UDPipeComboBox(LanguageComboBox):
@@ -130,15 +133,9 @@ def items(self) -> List:
def add_items(self, _, include_none: bool, language: str):
self.__items = self.items
super().add_items(self.__items, include_none, language)
-
- def set_current_language(self, iso_language: Optional[str]):
iso_items = {iso for _, iso in self.__items}
- if iso_language in iso_items:
- super().set_current_language(iso_language)
- elif self.__default_lang in iso_items:
+ # the requested language is not among the items: select the default
+ # language instead, provided that it is among them
+ if language not in iso_items and self.__default_lang in iso_items:
super().set_current_language(self.__default_lang)
- elif self.__items:
- self.setCurrentIndex(0)
def showPopup(self):
if self.__items != self.items:
@@ -657,7 +654,7 @@ def __init__(self, parent=None, **kwargs):
self.__combo = LanguageComboBox(
self,
- StopwordsFilter.supported_languages(),
+ StopwordsFilter.supported_languages,
self.__sw_lang,
True,
self.__set_language,
@@ -1044,6 +1041,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()
+class PreprocessSettingsHandler(SettingsHandler):
+ """
+ Settings handler that makes all language settings, which are part of
+ the common preprocess settings, behave as schema_only: they are removed
+ when settings are loaded from the common settings rather than from
+ a schema.
+ """
+ def _remove_schema_only(self, settings_dict):
+ # run the base implementation first, then strip the language parameters
+ super()._remove_schema_only(settings_dict)
+ for setting, data, _ in self.provider.traverse_settings(data=settings_dict):
+ # "preprocessors" holds (preprocessor name, parameters dict) pairs
+ for pp_name, settings in data["storedsettings"]["preprocessors"]:
+ # iterate over a copy of the keys - the dict is modified in the loop
+ for key in list(settings):
+ if "language" in key:
+ settings.pop(key)
+
+
PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
@@ -1127,12 +1139,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
+ settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical
def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
+ self.__store_pending_languages()
box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
@@ -1150,6 +1164,16 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)
+ def set_model(self, pmodel):
+ """Connect the signal that handles setting the language from the corpus"""
+ super().set_model(pmodel)
+ if pmodel:
+ # rows inserted into the model are reported to __on_item_inserted
+ pmodel.rowsInserted.connect(self.__on_item_inserted)
+
+    def __on_item_inserted(self, _, first: int, last: int):
+        """Set the corpus language on every editor row reported as inserted."""
+        # rowsInserted reports an inclusive row range; iterate over it instead
+        # of asserting a single row, since asserts are stripped under -O
+        for row in range(first, last + 1):
+            self.__set_languages_single_editor(row)
+
def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
@@ -1179,6 +1203,49 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
def set_data(self, data: Corpus):
self.cancel()
self.data = data
+ self.__set_languages()
+
+ # Language parameters per preprocessor: maps the preprocessor's qualname to
+ # (parameter name, languages supported by the method) pairs; consulted in
+ # __set_languages_single_editor
+ LANG_PARAMS = {
+ "preprocess.normalize": [
+ ("snowball_language", SnowballStemmer.supported_languages),
+ ("udpipe_language", UDPipeModels().supported_languages_iso),
+ ("lemmagen_language", LemmagenLemmatizer.supported_languages),
+ ],
+ "preprocess.filter": [("language", StopwordsFilter.supported_languages)],
+ }
+
+ def __store_pending_languages(self):
+ """
+ Remember, per preprocessor name, which language parameters are present
+ in the stored settings.
+ """
+ settings = self.storedsettings["preprocessors"]
+ # parameter names containing "language", grouped by preprocessor name
+ self.__pending_languages = {
+ pp_name: {p for p in par if "language" in p} for pp_name, par in settings
+ }
+
+ def __set_languages(self):
+ """Set the language from the corpus for each editor in the model."""
+ if self.data is not None:
+ for i in range(self.preprocessormodel.rowCount()):
+ self.__set_languages_single_editor(i)
+ # forget the languages remembered by __store_pending_languages
+ self.__pending_languages = {}
+
+ def __set_languages_single_editor(self, item_index: int):
+ """
+ Set the language from the corpus for a single editor/module;
+ keep the language unchanged if it comes from the schema (pending).
+ """
+ if self.data and self.data.language:
+ model = self.preprocessormodel
+ item = model.item(item_index)
+ pp_name = item.data(DescriptionRole).qualname
+ params = item.data(ParametersRole)
+ pending = self.__pending_languages.get(pp_name, set())
+ for param, available_langs in self.LANG_PARAMS.get(pp_name, []):
+ if param not in pending and self.data.language in available_langs:
+ # set the language only if it is not pending from the schema (such
+ # a language should not be changed) and the method supports it
+ params[param] = self.data.language
+ with disconnected(model.dataChanged, self.__on_modelchanged):
+ # disconnecting prevents a double apply call; apply is already called
+ # on new data and when a row is inserted, i.e. by both callers of this method
+ item.setData(params, ParametersRole)
def buildpreproc(self) -> PreprocessorList:
plist = []
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index 49ccb1229..5b2aeb660 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -2,7 +2,9 @@
from unittest.mock import patch, PropertyMock, MagicMock, Mock
import numpy as np
+from AnyQt.QtGui import QStandardItem, QIcon
from Orange.data import Domain, StringVariable
+from Orange.widgets.data.utils.preprocess import DescriptionRole, ParametersRole
from orangewidget.utils.filedialogs import RecentPath
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate
@@ -180,6 +182,153 @@ def test_no_tokens_left(self):
self.wait_until_finished()
self.assertFalse(self.widget.Warning.no_token_left.is_shown())
+ def test_language_from_corpus(self):
+ """Test language from corpus is set correctly"""
+ initial = {
+ "name": "",
+ "preprocessors": [("preprocess.normalize", {}), ("preprocess.filter", {})],
+ }
+ self.widget.storedsettings = initial
+ self.widget._initialize()
+ self.assertDictEqual(initial, self.widget.storedsettings)
+ combos = self.widget.mainArea.findChildren(LanguageComboBox)
+ self.assertEqual(
+ ["English", "English", "English", "English"],
+ [c.currentText() for c in combos]
+ )
+
+ # test with Slovenian - the language should be set for all preprocessors
+ # except Snowball, which doesn't support Slovenian
+ self.corpus.attributes["language"] = "sl"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["English", "Slovenian", "Slovenian", "Slovenian"],
+ [c.currentText() for c in combos]
+ )
+ settings = self.widget.storedsettings["preprocessors"]
+ self.assertEqual("sl", settings[0][1]["udpipe_language"])
+ self.assertEqual("sl", settings[0][1]["lemmagen_language"])
+ self.assertEqual("sl", settings[1][1]["language"])
+
+ # test with Lithuanian, which is supported by only one preprocessor
+ self.corpus.attributes["language"] = "lt"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["English", "Lithuanian", "Slovenian", "Slovenian"],
+ [c.currentText() for c in combos]
+ )
+ settings = self.widget.storedsettings["preprocessors"]
+ self.assertEqual("lt", settings[0][1]["udpipe_language"])
+ self.assertEqual("sl", settings[0][1]["lemmagen_language"])
+ self.assertEqual("sl", settings[1][1]["language"])
+
+ # test with Portuguese - only the lemmagen language should stay unchanged
+ self.corpus.attributes["language"] = "pt"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
+ [c.currentText() for c in combos]
+ )
+ settings = self.widget.storedsettings["preprocessors"]
+ self.assertEqual("pt", settings[0][1]["snowball_language"])
+ self.assertEqual("pt", settings[0][1]["udpipe_language"])
+ self.assertEqual("sl", settings[0][1]["lemmagen_language"])
+ self.assertEqual("pt", settings[1][1]["language"])
+
+ # language not supported by any preprocessor - languages shouldn't change
+ self.corpus.attributes["language"] = "bo"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
+ [c.currentText() for c in combos]
+ )
+ settings = self.widget.storedsettings["preprocessors"]
+ self.assertEqual("pt", settings[0][1]["snowball_language"])
+ self.assertEqual("pt", settings[0][1]["udpipe_language"])
+ self.assertEqual("sl", settings[0][1]["lemmagen_language"])
+ self.assertEqual("pt", settings[1][1]["language"])
+
+ # test with a missing language - languages shouldn't change
+ self.corpus.attributes["language"] = None
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
+ [c.currentText() for c in combos]
+ )
+ settings = self.widget.storedsettings["preprocessors"]
+ self.assertEqual("pt", settings[0][1]["snowball_language"])
+ self.assertEqual("pt", settings[0][1]["udpipe_language"])
+ self.assertEqual("sl", settings[0][1]["lemmagen_language"])
+ self.assertEqual("pt", settings[1][1]["language"])
+
+ def test_language_from_schema(self):
+ """Test language from schema/workflow is retained"""
+ initial = {
+ "name": "",
+ "preprocessors": [
+ (
+ "preprocess.normalize",
+ {
+ "lemmagen_language": "sl",
+ "snowball_language": "nl",
+ "udpipe_language": "lt",
+ },
+ ),
+ ("preprocess.filter", {"language": "nl"}),
+ ],
+ }
+ self.widget.storedsettings = initial
+
+ # pack the settings and create a new widget from them
+ settings = self.widget.settingsHandler.pack_data(self.widget)
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
+ # the corpus on the input must not change the languages stored in settings
+ self.assertDictEqual(initial, widget.storedsettings)
+ combos = widget.mainArea.findChildren(LanguageComboBox)
+ self.assertEqual(
+ ["Dutch", "Lithuanian", "Slovenian", "Dutch"],
+ [c.currentText() for c in combos]
+ )
+
+ def test_language_from_corpus_editor_inserted(self):
+ """Test language from corpus is set to new editor too"""
+ initial = {
+ "name": "",
+ "preprocessors": [("preprocess.filter", {})],
+ }
+ self.widget.storedsettings = initial
+ self.widget._initialize()
+ self.assertDictEqual(initial, self.widget.storedsettings)
+ combos = self.widget.mainArea.findChildren(LanguageComboBox)
+ self.assertEqual(
+ ["English"],
+ [c.currentText() for c in combos]
+ )
+
+ # insert data - language of stopwords combo should change to Slovenian
+ self.corpus.attributes["language"] = "sl"
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.assertEqual(
+ ["Slovenian"],
+ [c.currentText() for c in combos]
+ )
+
+ # insert new editor - all languages except snowball should be set to Slovenian
+ pp_def = self.widget._qname2ppdef["preprocess.normalize"]
+ description = pp_def.description
+ item = QStandardItem(description.title)
+ icon = QIcon(description.icon)
+ item.setIcon(icon)
+ item.setToolTip(description.summary)
+ item.setData(pp_def, DescriptionRole)
+ item.setData({}, ParametersRole)
+ self.widget.preprocessormodel.insertRow(0, [item])
+ self.wait_until_finished()
+
+ combos = self.widget.mainArea.findChildren(LanguageComboBox)
+ self.assertEqual(
+ ['Slovenian', 'English', 'Slovenian', 'Slovenian'],
+ [c.currentText() for c in combos]
+ )
+
@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):
@@ -983,14 +1132,20 @@ def test_set_current_language(self):
self.assertEqual("Portuguese", cb.currentText())
cb.set_current_language("sl")
self.assertEqual("Slovenian", cb.currentText())
- cb.set_current_language("abc") # should set to default
- self.assertEqual("English", cb.currentText())
+ cb.set_current_language("abc") # language not in list - keep current selection
+ self.assertEqual("Slovenian", cb.currentText())
+
+ def test_set_language_to_default(self):
+ """When the current item is no longer in the dropdown, set the language to default"""
+ mock = Mock()
+ cb = UDPipeComboBox(None, "pt", "en", mock)
+ self.assertEqual("Portuguese", cb.currentText())
# when no default language in the dropdown set to first
cb.removeItem(0)
x = cb._UDPipeComboBox__items
cb._UDPipeComboBox__items = x[:3] + x[4:]
- cb.set_current_language("abc")
- self.assertEqual("English (lines)", cb.currentText())
+ cb.showPopup()
+ self.assertEqual("English", cb.currentText())
def test_change_item(self):
mock = Mock()