Skip to content

Commit

Permalink
Preprocess widget - Language from corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Mar 8, 2024
1 parent e36fdea commit 392232c
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 0 deletions.
74 changes: 74 additions & 0 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from AnyQt.QtGui import QBrush, QValidator

from Orange.util import wrap_callback
from orangecanvas.gui.utils import disconnected
from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath

import Orange.widgets.data.owpreprocess
Expand Down Expand Up @@ -147,6 +149,9 @@ def showPopup(self):
self.add_items(None, False, self.itemData(self.currentIndex()))
super().showPopup()

def set_current_language(self, iso_language: str):
    """
    Make the language given by its ISO code the current entry.

    The code is converted with ``UDPipeModels.iso_to_language`` and the
    result is set as the current text.
    """
    language_name = UDPipeModels.iso_to_language(iso_language)
    self.setCurrentText(language_name)


class RangeSpins(QHBoxLayout):
SpinBox = QSpinBox
Expand Down Expand Up @@ -1045,6 +1050,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()


class PreprocessSettingsHandler(SettingsHandler):
    """
    Settings handler that treats the language parameters stored inside the
    preprocessors' settings as if they were schema-only.

    Whenever schema-only settings are removed from a settings dictionary,
    every preprocessor parameter whose key contains ``"language"`` is
    removed as well.
    """

    def _remove_schema_only(self, settings_dict):
        """
        Remove schema-only settings and all preprocessor language parameters.

        Parameters
        ----------
        settings_dict
            Settings dictionary to clean; it is modified in place.
        """
        super()._remove_schema_only(settings_dict)
        for _setting, data, _instance in self.provider.traverse_settings(
            data=settings_dict
        ):
            # A settings dictionary may not contain stored preprocessors
            # at all; there is nothing to strip in that case.
            stored = data.get("storedsettings") or {}
            for _pp_name, params in stored.get("preprocessors", ()):
                # collect the keys first: the dict is mutated while cleaning
                for key in [k for k in params if "language" in k]:
                    del params[key]


PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
Expand Down Expand Up @@ -1128,12 +1148,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical

def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
self.__store_pending_languages()

box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
Expand All @@ -1151,6 +1173,12 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)

def set_model(self, pmodel):
    """
    Set the preprocessor model and start watching it for inserted rows.

    When a model is given, its ``rowsInserted`` signal is connected to
    ``__on_item_inserted`` before the call is passed on to the parent
    implementation.
    """
    if pmodel:
        rows_inserted = pmodel.rowsInserted
        rows_inserted.connect(self.__on_item_inserted)
    super().set_model(pmodel)


def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
Expand All @@ -1176,10 +1204,56 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
search_paths, **kwargs)
return path

def __on_item_inserted(self, _, first: int, last: int):
    """
    Set languages on newly inserted preprocessors and store the settings.

    Parameters
    ----------
    _
        Parent index reported by the ``rowsInserted`` signal (unused).
    first
        First inserted row.
    last
        Last inserted row (inclusive).
    """
    # The signal reports an inclusive row range; handle every row in it
    # rather than assuming that exactly one row was inserted.
    for row in range(first, last + 1):
        self.__set_languages_single_item(row)
    self.storedsettings = self.save(self.preprocessormodel)

@Inputs.corpus
def set_data(self, data: Corpus):
    """
    Handle a corpus arriving on the input.

    Calls ``self.cancel()``, stores the corpus in ``self.data`` and then
    updates the language parameters of the preprocessors through
    ``__set_languages``.
    """
    self.cancel()
    self.data = data
    self.__set_languages()

# Language parameters per preprocessor qualname. Each entry pairs the name
# of a language parameter with the languages supported by that option; the
# code consuming this table accepts either a collection or a callable
# returning one.
# NOTE(review): UDPipeLemmatizer() is instantiated when this assignment is
# executed - confirm that this is cheap enough to do at that point.
LANG_PARAMS = {
    "preprocess.normalize": [
        ("snowball_language", NormalizationModule.SNOWBALL_LANGUAGES),
        ("udpipe_language", UDPipeLemmatizer().models.supported_languages_iso),
        ("lemmagen_language", NormalizationModule.LEMMAGEN_LANGUAGES),
    ],
    "preprocess.filter": [("language", FilteringModule.STOP_WORDS_LANGUAGES)],
}

def __store_pending_languages(self):
    """
    Remember the language parameters present in the stored settings.

    The values are kept in ``self.__pending_languages``, keyed first by
    preprocessor name and then by parameter name. Only parameters listed
    in ``LANG_PARAMS`` that actually appear in the stored settings are
    recorded.
    """
    pending = self.__pending_languages = defaultdict(dict)
    for pp_name, params in self.storedsettings["preprocessors"]:
        for lang_key, _ in self.LANG_PARAMS.get(pp_name, []):
            if lang_key not in params:
                continue
            pending[pp_name][lang_key] = params[lang_key]

def __set_languages(self):
    """
    Set language parameters on all preprocessors in the model.

    Does nothing while there is no data. Otherwise every item is updated,
    the pending languages are cleared and the settings are stored again.
    """
    if self.data is None:
        return
    model = self.preprocessormodel
    for row in range(model.rowCount()):
        self.__set_languages_single_item(row)
    # pending languages are consumed by the first data that arrives
    self.__pending_languages = {}
    self.storedsettings = self.save(self.preprocessormodel)

def __set_languages_single_item(self, item_index: int):
    """
    Fill in the language parameters of one preprocessor in the model.

    For every language parameter listed in ``LANG_PARAMS`` for the item's
    preprocessor, a value remembered in the pending languages wins.
    Otherwise the corpus language is used, provided that the corpus has a
    language and the option supports it. Parameters matching neither
    condition are left untouched.

    Parameters
    ----------
    item_index
        Row of the preprocessor item in ``self.preprocessormodel``.
    """
    item = self.preprocessormodel.item(item_index)
    pp_name = item.data(DescriptionRole).qualname
    params = item.data(ParametersRole)
    pending = self.__pending_languages.get(pp_name, {})
    # Items may be inserted while there is no corpus on the input; then
    # there is no corpus language to apply.
    corpus_language = self.data.language if self.data is not None else None
    for param, sup_lang in self.LANG_PARAMS.get(pp_name, []):
        if param in pending:
            params[param] = pending[param]
        elif corpus_language:
            # supported languages are either a collection or a callable
            # that returns one
            supported = sup_lang() if callable(sup_lang) else sup_lang
            if corpus_language in supported:
                params[param] = corpus_language
    with disconnected(self.preprocessormodel.dataChanged, self.__on_modelchanged):
        # dataChanged must be disconnected to prevent a double apply call;
        # both callers of this method call apply afterwards
        item.setData(params, ParametersRole)

def buildpreproc(self) -> PreprocessorList:
plist = []
Expand Down
64 changes: 64 additions & 0 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,70 @@ def test_no_tokens_left(self):
self.wait_until_finished()
self.assertFalse(self.widget.Warning.no_token_left.is_shown())

def test_language_from_corpus(self):
    """Language from corpus is set correctly"""
    corpus_input = self.widget.Inputs.corpus

    def language_settings():
        # parameters of the normalize (index 2) and filter (index 3) steps
        preprocessors = self.widget.storedsettings["preprocessors"]
        return preprocessors[2][1], preprocessors[3][1]

    initial = {
        "name": "",
        "preprocessors": [
            ("preprocess.transform", {}),
            ("preprocess.tokenize", {}),
            ("preprocess.normalize", {}),
            ("preprocess.filter", {}),
        ],
    }
    self.widget.storedsettings = initial
    self.widget._initialize()
    self.assertDictEqual(initial, self.widget.storedsettings)

    # nothing should change since language is missing in corpus
    self.corpus.attributes["language"] = None
    self.send_signal(corpus_input, self.corpus)
    self.assertDictEqual(initial, self.widget.storedsettings)

    # language supported by every language-aware option
    self.corpus.attributes["language"] = "en"
    self.send_signal(corpus_input, self.corpus)
    normalize_settings, filter_settings = language_settings()
    self.assertEqual("en", normalize_settings["lemmagen_language"])
    self.assertEqual("en", normalize_settings["snowball_language"])
    self.assertEqual("en", normalize_settings["udpipe_language"])
    self.assertEqual("en", filter_settings["language"])

    # language not supported by all preprocessors
    self.corpus.attributes["language"] = "nl"
    self.send_signal(corpus_input, self.corpus)
    normalize_settings, filter_settings = language_settings()
    self.assertEqual("en", normalize_settings["lemmagen_language"])
    self.assertEqual("nl", normalize_settings["snowball_language"])
    self.assertEqual("en", normalize_settings["udpipe_language"])
    self.assertEqual("nl", filter_settings["language"])

def test_language_from_schema(self):
    """Test language from schema/workflow is retained"""
    normalize_params = {
        "lemmagen_language": "sl",
        "snowball_language": "nl",
        "udpipe_language": "lt",
    }
    initial = {
        "name": "",
        "preprocessors": [
            ("preprocess.transform", {}),
            ("preprocess.tokenize", {}),
            ("preprocess.normalize", normalize_params),
            ("preprocess.filter", {"language": "nl"}),
        ],
    }
    self.widget.storedsettings = initial

    # recreate the widget from packed settings, as when opening a workflow
    packed = self.widget.settingsHandler.pack_data(self.widget)
    restored = self.create_widget(OWPreprocess, stored_settings=packed)
    self.send_signal(restored.Inputs.corpus, self.corpus, widget=restored)
    # languages coming from the workflow win over the corpus language
    self.assertDictEqual(initial, restored.storedsettings)


@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):
Expand Down

0 comments on commit 392232c

Please sign in to comment.