From 9a47c0cbe55721a449e648b684e5c2ac6cf03a2a Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 14 Apr 2023 15:33:34 +0200 Subject: [PATCH] Preprocess widget - Language from corpus --- orangecontrib/text/keywords/__init__.py | 2 +- orangecontrib/text/preprocess/filter.py | 5 +- orangecontrib/text/preprocess/normalize.py | 4 + orangecontrib/text/tests/test_preprocess.py | 2 +- orangecontrib/text/widgets/owpreprocess.py | 108 +++++++++--- .../text/widgets/tests/test_owpreprocess.py | 163 +++++++++++++++++- 6 files changed, 254 insertions(+), 30 deletions(-) diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py index 783ecad13..2900abbc8 100644 --- a/orangecontrib/text/keywords/__init__.py +++ b/orangecontrib/text/keywords/__init__.py @@ -24,7 +24,7 @@ # todo: refactor when refactoring language for keywords module # this is a temporary solution since supported_languages now returns lang ISO codes -RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()] +RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages] # all available languages for YAKE! YAKE_LANGUAGE_MAPPING = { "Arabic": "ar", diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py index ae8ccfc77..74e71b0cd 100644 --- a/orangecontrib/text/preprocess/filter.py +++ b/orangecontrib/text/preprocess/filter.py @@ -117,9 +117,10 @@ def lang_to_iso(language: str) -> str: """ return LANG2ISO[StopwordsFilter.NLTK2LANG.get(language, language)] - @staticmethod + @classmethod + @property @wait_nltk_data - def supported_languages() -> Set[str]: + def supported_languages(_) -> Set[str]: """ List all languages supported by NLTK diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py index cf58a5b5b..11d6a3ae0 100644 --- a/orangecontrib/text/preprocess/normalize.py +++ b/orangecontrib/text/preprocess/normalize.py @@ -122,6 +122,10 @@ def __files_to_dict(self, files: List[Tuple[str]]) -> Dict[str, Tuple[str, str]] def supported_languages(self) -> List[Tuple[str, str]]: return [(name, iso) for iso, (name, _) in self.model_files.items()] + @property + def supported_languages_iso(self) -> List[Tuple[str, str]]: + return {iso for _, iso in self.supported_languages} + @property def online(self) -> bool: try: diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py index 0a91a49aa..dac743fae 100644 --- a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -486,7 +486,7 @@ def test_stopwords_slovene(self): self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2) def test_supported_languages(self): - langs = preprocess.StopwordsFilter.supported_languages() + langs = preprocess.StopwordsFilter.supported_languages self.assertIsInstance(langs, set) # just testing few of most important languages since I want for test to be # resistant for any potentially newly introduced languages by NLTK diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index 8f939612b..6d052fa95 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -12,6 +12,8 @@ from AnyQt.QtGui import QBrush, QValidator from Orange.util import wrap_callback +from orangecanvas.gui.utils import disconnected +from orangewidget.settings import SettingsHandler from orangewidget.utils.filedialogs import RecentPath import Orange.widgets.data.owpreprocess @@ -113,7 +115,10 @@ def set_current_language(self, iso_language: Optional[str]): The ISO language code of element to be selected. """ index = self.findData(iso_language) - self.setCurrentIndex(index) + if index >= 0: + self.setCurrentIndex(index) + else: + self.index_changed(self.currentIndex()) class UDPipeComboBox(LanguageComboBox): @@ -131,15 +136,9 @@ def items(self) -> List: def add_items(self, _, include_none: bool, language: str): self.__items = self.items super().add_items(self.__items, include_none, language) - - def set_current_language(self, iso_language: Optional[str]): iso_items = {iso for _, iso in self.__items} - if iso_language in iso_items: - super().set_current_language(iso_language) - elif self.__default_lang in iso_items: + if language not in iso_items and self.__default_lang in iso_items: super().set_current_language(self.__default_lang) - elif self.__items: - self.setCurrentIndex(0) def showPopup(self): if self.__items != self.items: @@ -538,13 +537,13 @@ def __enable_udpipe(self): def setParameters(self, params: Dict): super().setParameters(params) snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE) - self.__set_snowball_lang(snowball_lang) + self.__combo_sbl.set_current_language(snowball_lang) udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE) - self.__set_udpipe_lang(udpipe_lang) + self.__combo_udl.set_current_language(udpipe_lang) use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE) self.__set_use_tokenizer(use_tokenizer) lemmagen_lang = params.get("lemmagen_language", self.DEFAULT_LANGUAGE) - self.__set_lemmagen_lang(lemmagen_lang) + self.__combo_lemm.set_current_language(lemmagen_lang) def _set_method(self, method: int): super()._set_method(method) @@ -553,7 +552,6 @@ def _set_method(self, method: int): def __set_snowball_lang(self, language: str): if self.__snowball_lang != language: self.__snowball_lang = language - self.__combo_sbl.set_current_language(language) self.changed.emit() if self.method == self.Snowball: self.edited.emit() @@ -561,18 +559,15 @@ def __set_snowball_lang(self, language: str): def __set_udpipe_lang(self, language: str): if self.__udpipe_lang != language: self.__udpipe_lang = language - self.__combo_udl.set_current_language(language) self.changed.emit() if self.method == self.UDPipe: self.edited.emit() def __set_lemmagen_lang(self, language: str): - if self.__lemmagen_lang != language: - self.__lemmagen_lang = language - self.__combo_lemm.set_current_language(language) - self.changed.emit() - if self.method == self.Lemmagen: - self.edited.emit() + self.__lemmagen_lang = language + self.changed.emit() + if self.method == self.Lemmagen: + self.edited.emit() def __set_use_tokenizer(self, use: bool): if self.__use_tokenizer != use: @@ -658,7 +653,7 @@ def __init__(self, parent=None, **kwargs): self.__combo = LanguageComboBox( self, - StopwordsFilter.supported_languages(), + StopwordsFilter.supported_languages, self.__sw_lang, True, self.__set_language, @@ -795,7 +790,7 @@ def __spin_n_edited(self): def setParameters(self, params: Dict): super().setParameters(params) - self.__set_language(params.get("language", self.DEFAULT_LANG)) + self.__combo.set_current_language(params.get("language", self.DEFAULT_LANG)) self.__set_sw_path(params.get("sw_path", self.DEFAULT_NONE), params.get("sw_list", [])) self.__set_lx_path(params.get("lx_path", self.DEFAULT_NONE), @@ -820,7 +815,6 @@ def setParameters(self, params: Dict): def __set_language(self, language: Optional[str]): if self.__sw_lang != language: self.__sw_lang = language - self.__combo.set_current_language(language) self.changed.emit() if self.Stopwords in self.methods: self.edited.emit() @@ -1045,6 +1039,21 @@ def createinstance(params: Dict) -> POSTagger: return POSTaggingModule.Methods[method]() +class PreprocessSettingsHandler(SettingsHandler): + """ + Settings handler, that makes all language settings, which are + a part of common preprocess settings, schema_only. It removes them when + settings are not loaded from schema but from common settings. + """ + def _remove_schema_only(self, settings_dict): + super()._remove_schema_only(settings_dict) + for setting, data, _ in self.provider.traverse_settings(data=settings_dict): + for pp_name, settings in data["storedsettings"]["preprocessors"]: + for key in list(settings): + if "language" in key: + settings.pop(key) + + PREPROCESS_ACTIONS = [ PreprocessAction( "Transformation", "preprocess.transform", "", @@ -1128,12 +1137,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning): ("preprocess.tokenize", {}), ("preprocess.filter", {})] } # type: Dict[str, List[Tuple[str, Dict]]] + settingsHandler = PreprocessSettingsHandler() storedsettings = Setting(DEFAULT_PP) buttons_area_orientation = Qt.Vertical def __init__(self): ConcurrentWidgetMixin.__init__(self) Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self) + self.__store_pending_languages() box = gui.vBox(self.controlArea, "Preview") self.preview = "" @@ -1151,6 +1162,16 @@ def load(self, saved: Dict) -> StandardItemModel: saved["preprocessors"][i] = (name, params) return super().load(saved) + def set_model(self, pmodel): + """Connect signal which handle setting language from corpus""" + super().set_model(pmodel) + if pmodel: + pmodel.rowsInserted.connect(self.__on_item_inserted) + + def __on_item_inserted(self, _, first: int, last: int): + assert first == last + self.__set_languages_single_editor(first) + def __update_filtering_params(self, params: Dict): params["sw_path"] = self.__relocate_file(params.get("sw_path")) params["sw_list"] = self.__relocate_files(params.get("sw_list", [])) @@ -1180,6 +1201,49 @@ def __relocate_file(self, path: RecentPath) -> RecentPath: def set_data(self, data: Corpus): self.cancel() self.data = data + self.__set_languages() + + LANG_PARAMS = { + "preprocess.normalize": [ + ("snowball_language", SnowballStemmer.supported_languages), + ("udpipe_language", UDPipeModels().supported_languages_iso), + ("lemmagen_language", LemmagenLemmatizer.supported_languages), + ], + "preprocess.filter": [("language", StopwordsFilter.supported_languages)], + } + + def __store_pending_languages(self): + settings = self.storedsettings["preprocessors"] + self.__pending_languages = { + pp_name: {p for p in par if "language" in p} for pp_name, par in settings + } + + def __set_languages(self): + if self.data is not None: + for i in range(self.preprocessormodel.rowCount()): + self.__set_languages_single_editor(i) + self.__pending_languages = {} + + def __set_languages_single_editor(self, item_index: int): + """ + Set language from corpus for single editor/module, + keep language unchanged if it comes from schema (pending). + """ + if self.data and self.data.language: + model = self.preprocessormodel + item = model.item(item_index) + pp_name = item.data(DescriptionRole).qualname + params = item.data(ParametersRole) + pending = self.__pending_languages.get(pp_name, set()) + for param, available_langs in self.LANG_PARAMS.get(pp_name, []): + if param not in pending and self.data.language in available_langs: + # set language if not pending from schema - should not be changed + # and if available for the method + params[param] = self.data.language + with disconnected(model.dataChanged, self.__on_modelchanged): + # disconnection prevent double apply call, it is already called + # on new data and when row inserted, both caller of this method + item.setData(params, ParametersRole) def buildpreproc(self) -> PreprocessorList: plist = [] diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py index 49ccb1229..5b2aeb660 100644 --- a/orangecontrib/text/widgets/tests/test_owpreprocess.py +++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py @@ -2,7 +2,9 @@ from unittest.mock import patch, PropertyMock, MagicMock, Mock import numpy as np +from AnyQt.QtGui import QStandardItem, QIcon from Orange.data import Domain, StringVariable +from Orange.widgets.data.utils.preprocess import DescriptionRole, ParametersRole from orangewidget.utils.filedialogs import RecentPath from Orange.widgets.tests.base import WidgetTest from Orange.widgets.tests.utils import simulate @@ -180,6 +182,153 @@ def test_no_tokens_left(self): self.wait_until_finished() self.assertFalse(self.widget.Warning.no_token_left.is_shown()) + def test_language_from_corpus(self): + """Test language from corpus is set correctly""" + initial = { + "name": "", + "preprocessors": [("preprocess.normalize", {}), ("preprocess.filter", {})], + } + self.widget.storedsettings = initial + self.widget._initialize() + self.assertDictEqual(initial, self.widget.storedsettings) + combos = self.widget.mainArea.findChildren(LanguageComboBox) + self.assertEqual( + ["English", "English", "English", "English"], + [c.currentText() for c in combos] + ) + + # test with Slovenian - language should set for all preprocessors except + # Snowball that doesn't support Slovenian + self.corpus.attributes["language"] = "sl" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual( + ["English", "Slovenian", "Slovenian", "Slovenian"], + [c.currentText() for c in combos] + ) + settings = self.widget.storedsettings["preprocessors"] + self.assertEqual("sl", settings[0][1]["udpipe_language"]) + self.assertEqual("sl", settings[0][1]["lemmagen_language"]) + self.assertEqual("sl", settings[1][1]["language"]) + + # test with Lithuanian that is support by one preprocessors + self.corpus.attributes["language"] = "lt" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual( + ["English", "Lithuanian", "Slovenian", "Slovenian"], + [c.currentText() for c in combos] + ) + settings = self.widget.storedsettings["preprocessors"] + self.assertEqual("lt", settings[0][1]["udpipe_language"]) + self.assertEqual("sl", settings[0][1]["lemmagen_language"]) + self.assertEqual("sl", settings[1][1]["language"]) + + self.corpus.attributes["language"] = "pt" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual( + ["Portuguese", "Portuguese", "Slovenian", "Portuguese"], + [c.currentText() for c in combos] + ) + settings = self.widget.storedsettings["preprocessors"] + self.assertEqual("pt", settings[0][1]["snowball_language"]) + self.assertEqual("pt", settings[0][1]["udpipe_language"]) + self.assertEqual("sl", settings[0][1]["lemmagen_language"]) + self.assertEqual("pt", settings[1][1]["language"]) + + # language not supported by any preprocessor - language shouldn't change + self.corpus.attributes["language"] = "bo" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual( + ["Portuguese", "Portuguese", "Slovenian", "Portuguese"], + [c.currentText() for c in combos] + ) + settings = self.widget.storedsettings["preprocessors"] + self.assertEqual("pt", settings[0][1]["snowball_language"]) + self.assertEqual("pt", settings[0][1]["udpipe_language"]) + self.assertEqual("sl", settings[0][1]["lemmagen_language"]) + self.assertEqual("pt", settings[1][1]["language"]) + + # test with missing language - language shouldn't change + self.corpus.attributes["language"] = None + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual( + ["Portuguese", "Portuguese", "Slovenian", "Portuguese"], + [c.currentText() for c in combos] + ) + settings = self.widget.storedsettings["preprocessors"] + self.assertEqual("pt", settings[0][1]["snowball_language"]) + self.assertEqual("pt", settings[0][1]["udpipe_language"]) + self.assertEqual("sl", settings[0][1]["lemmagen_language"]) + self.assertEqual("pt", settings[1][1]["language"]) + + def test_language_from_schema(self): + """Test language from schema/workflow is retained""" + initial = { + "name": "", + "preprocessors": [ + ( + "preprocess.normalize", + { + "lemmagen_language": "sl", + "snowball_language": "nl", + "udpipe_language": "lt", + }, + ), + ("preprocess.filter", {"language": "nl"}), + ], + } + self.widget.storedsettings = initial + + settings = self.widget.settingsHandler.pack_data(self.widget) + widget = self.create_widget(OWPreprocess, stored_settings=settings) + self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget) + self.assertDictEqual(initial, widget.storedsettings) + combos = widget.mainArea.findChildren(LanguageComboBox) + self.assertEqual( + ["Dutch", "Lithuanian", "Slovenian", "Dutch"], + [c.currentText() for c in combos] + ) + + def test_language_from_corpus_editor_inserted(self): + """Test language from corpus is set to new editor too""" + initial = { + "name": "", + "preprocessors": [("preprocess.filter", {})], + } + self.widget.storedsettings = initial + self.widget._initialize() + self.assertDictEqual(initial, self.widget.storedsettings) + combos = self.widget.mainArea.findChildren(LanguageComboBox) + self.assertEqual( + ["English"], + [c.currentText() for c in combos] + ) + + # insert data - language of stopwords combo should change to italian + self.corpus.attributes["language"] = "sl" + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.assertEqual( + ["Slovenian"], + [c.currentText() for c in combos] + ) + + # insert new editor - all languages except snowball should be set to Slovenian + pp_def = self.widget._qname2ppdef["preprocess.normalize"] + description = pp_def.description + item = QStandardItem(description.title) + icon = QIcon(description.icon) + item.setIcon(icon) + item.setToolTip(description.summary) + item.setData(pp_def, DescriptionRole) + item.setData({}, ParametersRole) + self.widget.preprocessormodel.insertRow(0, [item]) + self.wait_until_finished() + + combos = self.widget.mainArea.findChildren(LanguageComboBox) + self.assertEqual( + ['Slovenian', 'English', 'Slovenian', 'Slovenian'], + [c.currentText() for c in combos] + ) + @patch(SF_LIST, new=Mock(return_value=SERVER_FILES)) class TestOWPreprocessMigrateSettings(WidgetTest): @@ -983,14 +1132,20 @@ def test_set_current_language(self): self.assertEqual("Portuguese", cb.currentText()) cb.set_current_language("sl") self.assertEqual("Slovenian", cb.currentText()) - cb.set_current_language("abc") # should set to default - self.assertEqual("English", cb.currentText()) + cb.set_current_language("abc") # language not in list - keep current seleciton + self.assertEqual("Slovenian", cb.currentText()) + + def test_set_language_to_default(self): + """In case current item not in dropdown anymore set language to default""" + mock = Mock() + cb = UDPipeComboBox(None, "pt", "en", mock) + self.assertEqual("Portuguese", cb.currentText()) # when no default language in the dropdown set to first cb.removeItem(0) x = cb._UDPipeComboBox__items cb._UDPipeComboBox__items = x[:3] + x[4:] - cb.set_current_language("abc") - self.assertEqual("English (lines)", cb.currentText()) + cb.showPopup() + self.assertEqual("English", cb.currentText()) def test_change_item(self): mock = Mock()