From 723e44be692f716c3619b50fa8180e7e27930e1d Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 15:33:34 +0200
Subject: [PATCH] Preprocess widget - Language from corpus

Language parameters of the Normalization and Filtering preprocessors are
set from the language of the input corpus whenever the corpus language is
among the languages the preprocessor supports. Languages stored in a
workflow are kept as pending values and take precedence over the corpus
language. PreprocessSettingsHandler removes every "language" key from the
stored preprocessors when settings are not loaded from a schema.

The UDPipe language combo box keeps showing language names but reports and
accepts ISO codes; UDPipeModels.language_to_iso becomes a staticmethod.

Review fixes folded into this patch:
- __set_languages_single_item skips the corpus lookup while self.data is
  None, so inserting a preprocessor without an input corpus no longer
  raises AttributeError.
- __on_item_inserted processes the whole inserted row range instead of
  asserting first == last.
- The combo box's __text_changed calls the static
  UDPipeModels.language_to_iso instead of constructing a UDPipeLemmatizer
  on every selection change.
- Typos fixed in a comment and in a test docstring.

---
 orangecontrib/text/preprocess/normalize.py    |  4 +-
 orangecontrib/text/widgets/owpreprocess.py    | 92 ++++++++++++++++++-
 .../text/widgets/tests/test_owpreprocess.py   | 74 ++++++++++++++-
 3 files changed, 160 insertions(+), 10 deletions(-)

diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py
index cb7666794..48183390b 100644
--- a/orangecontrib/text/preprocess/normalize.py
+++ b/orangecontrib/text/preprocess/normalize.py
@@ -140,10 +140,12 @@ def online(self):
         except ConnectionError:
             return False
 
+    # todo: cleanup
     # use _ since - is already used in iso standard
     VARIATION_DELIMITER = "_"
 
-    def language_to_iso(self, language):
+    @staticmethod
+    def language_to_iso(language):
         if "(" in language:
             language, model = language.split("(")
             language = LANG2ISO[language.strip()]
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index 7be2eed77..54831371a 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -12,6 +12,8 @@
 from AnyQt.QtGui import QBrush, QValidator
 
 from Orange.util import wrap_callback
+from orangecanvas.gui.utils import disconnected
+from orangewidget.settings import SettingsHandler
 from orangewidget.utils.filedialogs import RecentPath
 
 import Orange.widgets.data.owpreprocess
@@ -27,7 +29,7 @@
 from orangecontrib.text.language import ISO2LANG, LANG2ISO
 from orangecontrib.text.misc import nltk_data_dir
 from orangecontrib.text.preprocess import *
-from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
+from orangecontrib.text.preprocess.normalize import UDPipeStopIteration, UDPipeModels
 from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \
     POSTagger
 
@@ -121,8 +123,9 @@ def __init__(self, master: BaseEditor, value: str, default: str,
         super().__init__(master)
         self.__items = []  # type: List
         self.__default_lang = default
-        self.add_items(value)
-        self.currentTextChanged.connect(callback)
+        self.add_items(UDPipeModels.iso_to_language(value))
+        self.currentTextChanged.connect(self.__text_changed)
+        self.callback = callback
         self.setMinimumWidth(80)
 
     @property
@@ -145,6 +148,12 @@ def showPopup(self):
             self.add_items(self.currentText())
         super().showPopup()
 
+    def __text_changed(self, language):
+        self.callback(UDPipeModels.language_to_iso(language))
+
+    def set_current_language(self, iso_language: str):
+        self.setCurrentText(UDPipeModels.iso_to_language(iso_language))
+
 
 class RangeSpins(QHBoxLayout):
     SpinBox = QSpinBox
@@ -560,7 +569,7 @@ def __set_snowball_lang(self, language: str):
     def __set_udpipe_lang(self, language: str):
         if self.__udpipe_lang != language:
             self.__udpipe_lang = language
-            self.__combo_udl.setCurrentText(language)
+            self.__combo_udl.set_current_language(language)
             self.changed.emit()
             if self.method == self.UDPipe:
                 self.edited.emit()
@@ -1045,6 +1054,21 @@ def createinstance(params: Dict) -> POSTagger:
         return POSTaggingModule.Methods[method]()
 
 
+class PreprocessSettingsHandler(SettingsHandler):
+    """
+    A bit modified settings handler, that makes all language settings, which are
+    a part of common preprocess settings, schema_only. It removes them when
+    settings are not loaded from schema but from common settings.
+    """
+    def _remove_schema_only(self, settings_dict):
+        super()._remove_schema_only(settings_dict)
+        for setting, data, _ in self.provider.traverse_settings(data=settings_dict):
+            for pp_name, settings in data["storedsettings"]["preprocessors"]:
+                for key in list(settings):
+                    if "language" in key:
+                        settings.pop(key)
+
+
 PREPROCESS_ACTIONS = [
     PreprocessAction(
         "Transformation", "preprocess.transform", "",
@@ -1128,12 +1152,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
                                     ("preprocess.tokenize", {}),
                                     ("preprocess.filter", {})]
                   }  # type: Dict[str, List[Tuple[str, Dict]]]
+    settingsHandler = PreprocessSettingsHandler()
     storedsettings = Setting(DEFAULT_PP)
     buttons_area_orientation = Qt.Vertical
 
     def __init__(self):
         ConcurrentWidgetMixin.__init__(self)
         Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
+        self.__store_pending_languages()
 
         box = gui.vBox(self.controlArea, "Preview")
         self.preview = ""
@@ -1151,6 +1177,12 @@ def load(self, saved: Dict) -> StandardItemModel:
                 saved["preprocessors"][i] = (name, params)
         return super().load(saved)
 
+    def set_model(self, pmodel):
+        if pmodel:
+            pmodel.rowsInserted.connect(self.__on_item_inserted)
+        super().set_model(pmodel)
+
+
     def __update_filtering_params(self, params: Dict):
         params["sw_path"] = self.__relocate_file(params.get("sw_path"))
         params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
@@ -1176,10 +1208,56 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
                                          search_paths, **kwargs)
         return path
 
+    def __on_item_inserted(self, _, first: int, last: int):
+        for row in range(first, last + 1):
+            self.__set_languages_single_item(row)
+        self.storedsettings = self.save(self.preprocessormodel)
+
     @Inputs.corpus
     def set_data(self, data: Corpus):
         self.cancel()
         self.data = data
+        self.__set_languages()
+
+    LANG_PARAMS = {
+        "preprocess.normalize": [
+            ("snowball_language", NormalizationModule.SNOWBALL_LANGUAGES),
+            ("udpipe_language", UDPipeLemmatizer().models.supported_languages_iso),
+            ("lemmagen_language", NormalizationModule.LEMMAGEN_LANGUAGES),
+        ],
+        "preprocess.filter": [("language", FilteringModule.STOP_WORDS_LANGUAGES)],
+    }
+
+    def __store_pending_languages(self):
+        self.__pending_languages = defaultdict(dict)
+        for pp_name, params in self.storedsettings["preprocessors"]:
+            for p, _ in self.LANG_PARAMS.get(pp_name, []):
+                if p in params:
+                    self.__pending_languages[pp_name][p] = params[p]
+
+    def __set_languages(self):
+        if self.data is not None:
+            for i in range(self.preprocessormodel.rowCount()):
+                self.__set_languages_single_item(i)
+            self.__pending_languages = {}
+            self.storedsettings = self.save(self.preprocessormodel)
+
+    def __set_languages_single_item(self, item_index: int):
+        item = self.preprocessormodel.item(item_index)
+        pp_name = item.data(DescriptionRole).qualname
+        params = item.data(ParametersRole)
+        pending = self.__pending_languages.get(pp_name, {})
+        for param, sup_lang in self.LANG_PARAMS.get(pp_name, []):
+            if param in pending:
+                params[param] = pending[param]
+            elif self.data is not None:
+                sup_lang = sup_lang() if callable(sup_lang) else sup_lang
+                if self.data.language and self.data.language in sup_lang:
+                    params[param] = self.data.language
+        with disconnected(self.preprocessormodel.dataChanged, self.__on_modelchanged):
+            # dataChange must be disconnected to prevent double apply call
+            # both calls of this method call apply after
+            item.setData(params, ParametersRole)
 
     def buildpreproc(self) -> PreprocessorList:
         plist = []
@@ -1387,6 +1465,12 @@ def str_into_paths(label):
             preprocessors = settings["storedsettings"]["preprocessors"]
             for pp_name, pp in preprocessors:
                 if pp_name == "preprocess.filter" and "language" in pp:
+                    for k in ("snowball_language", "lemmagen_language"):
+                        if k in pp:
+                            pp[k] = LANG2ISO[pp[k]]
+                    up_lang = "udpipe_language"
+                    if up_lang in pp:
+                        pp[up_lang] = UDPipeModels.language_to_iso(pp[up_lang])
                     if pp["language"] == _DEFAULT_NONE:
                         pp["language"] = None
                     else:
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index 713d67b43..6c250f6e1 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -179,6 +179,70 @@ def test_no_tokens_left(self):
         self.wait_until_finished()
         self.assertFalse(self.widget.Warning.no_token_left.is_shown())
 
+    def test_language_from_corpus(self):
+        """Language from corpus is set correctly"""
+        initial = {
+            "name": "",
+            "preprocessors": [
+                ("preprocess.transform", {}),
+                ("preprocess.tokenize", {}),
+                ("preprocess.normalize", {}),
+                ("preprocess.filter", {}),
+            ],
+        }
+        self.widget.storedsettings = initial
+        self.widget._initialize()
+        self.assertDictEqual(initial, self.widget.storedsettings)
+
+        self.corpus.attributes["language"] = None
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        # nothing should change since language is missing in corpus
+        self.assertDictEqual(initial, self.widget.storedsettings)
+
+        self.corpus.attributes["language"] = "en"
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        normalize_settings = self.widget.storedsettings["preprocessors"][2][1]
+        filter_settings = self.widget.storedsettings["preprocessors"][3][1]
+        self.assertEqual("en", normalize_settings["lemmagen_language"])
+        self.assertEqual("en", normalize_settings["snowball_language"])
+        self.assertEqual("en", normalize_settings["udpipe_language"])
+        self.assertEqual("en", filter_settings["language"])
+
+        # language not supported by all preprocessors
+        self.corpus.attributes["language"] = "nl"
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        normalize_settings = self.widget.storedsettings["preprocessors"][2][1]
+        filter_settings = self.widget.storedsettings["preprocessors"][3][1]
+        self.assertEqual("en", normalize_settings["lemmagen_language"])
+        self.assertEqual("nl", normalize_settings["snowball_language"])
+        self.assertEqual("en", normalize_settings["udpipe_language"])
+        self.assertEqual("nl", filter_settings["language"])
+
+    def test_language_from_schema(self):
+        """Test language from schema/workflow is retained"""
+        initial = {
+            "name": "",
+            "preprocessors": [
+                ("preprocess.transform", {}),
+                ("preprocess.tokenize", {}),
+                (
+                    "preprocess.normalize",
+                    {
+                        "lemmagen_language": "sl",
+                        "snowball_language": "nl",
+                        "udpipe_language": "lt",
+                    },
+                ),
+                ("preprocess.filter", {"language": "nl"}),
+            ],
+        }
+        self.widget.storedsettings = initial
+
+        settings = self.widget.settingsHandler.pack_data(self.widget)
+        widget = self.create_widget(OWPreprocess, stored_settings=settings)
+        self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
+        self.assertDictEqual(initial, widget.storedsettings)
+
 
 @patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
 class TestOWPreprocessMigrateSettings(WidgetTest):
@@ -206,7 +270,7 @@ def test_migrate_settings_normalize(self):
         widget = self.create_widget(OWPreprocess, stored_settings=settings)
         params = [("preprocess.normalize",
                    {"method": 2, "snowball_language": "fr",
-                    "udpipe_language": "German", "udpipe_tokenizer": True})]
+                    "udpipe_language": "de", "udpipe_tokenizer": True})]
         self.assertEqual(widget.storedsettings["preprocessors"], params)
 
     def test_migrate_settings_filter(self):
@@ -528,19 +592,19 @@ def test_createinstance(self):
         params = {"method": NormalizationModule.Snowball}
         pp = self.editor.createinstance(params)
         self.assertIsInstance(pp, SnowballStemmer)
-        self.assertIn("", str(pp.normalizer))
+        self.assertEqual("en", pp._language)
 
         params = {"method": NormalizationModule.Snowball, "snowball_language": "nl"}
         pp = self.editor.createinstance(params)
         self.assertIsInstance(pp, SnowballStemmer)
-        self.assertIn("", str(pp.normalizer))
+        self.assertEqual("nl", pp._language)
 
         params = {"method": NormalizationModule.UDPipe,
-                  "udpipe_language": "Finnish",
+                  "udpipe_language": "fi",
                   "udpipe_tokenizer": True}
         pp = self.editor.createinstance(params)
         self.assertIsInstance(pp, UDPipeLemmatizer)
-        self.assertEqual(pp._UDPipeLemmatizer__language, "Finnish")
+        self.assertEqual(pp._language, "fi")
         self.assertEqual(pp._UDPipeLemmatizer__use_tokenizer, True)
 
     def test_repr(self):