Skip to content

Commit

Permalink
Preprocess widget - Language from corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Mar 29, 2024
1 parent 64eee89 commit dbd7af3
Show file tree
Hide file tree
Showing 2 changed files with 208 additions and 20 deletions.
95 changes: 79 additions & 16 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from AnyQt.QtGui import QBrush, QValidator

from Orange.util import wrap_callback
from orangecanvas.gui.utils import disconnected
from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath

import Orange.widgets.data.owpreprocess
Expand Down Expand Up @@ -113,7 +115,8 @@ def set_current_language(self, iso_language: Optional[str]):
The ISO language code of element to be selected.
"""
index = self.findData(iso_language)
self.setCurrentIndex(index)
if index >= 0:
self.setCurrentIndex(index)


class UDPipeComboBox(LanguageComboBox):
Expand All @@ -131,15 +134,9 @@ def items(self) -> List:
def add_items(self, _, include_none: bool, language: str):
self.__items = self.items
super().add_items(self.__items, include_none, language)

def set_current_language(self, iso_language: Optional[str]):
iso_items = {iso for _, iso in self.__items}
if iso_language in iso_items:
super().set_current_language(iso_language)
elif self.__default_lang in iso_items:
if language not in iso_items and self.__default_lang in iso_items:
super().set_current_language(self.__default_lang)
elif self.__items:
self.setCurrentIndex(0)

def showPopup(self):
if self.__items != self.items:
Expand Down Expand Up @@ -538,13 +535,13 @@ def __enable_udpipe(self):
def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
self.__set_snowball_lang(snowball_lang)
self.__combo_sbl.set_current_language(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
self.__set_udpipe_lang(udpipe_lang)
self.__combo_udl.set_current_language(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
lemmagen_lang = params.get("lemmagen_language", self.DEFAULT_LANGUAGE)
self.__set_lemmagen_lang(lemmagen_lang)
self.__combo_lemm.set_current_language(lemmagen_lang)

def _set_method(self, method: int):
super()._set_method(method)
Expand All @@ -553,23 +550,20 @@ def _set_method(self, method: int):
def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
self.__combo_sbl.set_current_language(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()

def __set_udpipe_lang(self, language: str):
if self.__udpipe_lang != language:
self.__udpipe_lang = language
self.__combo_udl.set_current_language(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()

def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
self.__combo_lemm.set_current_language(language)
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()
Expand Down Expand Up @@ -795,7 +789,7 @@ def __spin_n_edited(self):

def setParameters(self, params: Dict):
super().setParameters(params)
self.__set_language(params.get("language", self.DEFAULT_LANG))
self.__combo.set_current_language(params.get("language", self.DEFAULT_LANG))
self.__set_sw_path(params.get("sw_path", self.DEFAULT_NONE),
params.get("sw_list", []))
self.__set_lx_path(params.get("lx_path", self.DEFAULT_NONE),
Expand All @@ -820,7 +814,6 @@ def setParameters(self, params: Dict):
def __set_language(self, language: Optional[str]):
if self.__sw_lang != language:
self.__sw_lang = language
self.__combo.set_current_language(language)
self.changed.emit()
if self.Stopwords in self.methods:
self.edited.emit()
Expand Down Expand Up @@ -1045,6 +1038,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()


class PreprocessSettingsHandler(SettingsHandler):
    """
    Settings handler that makes all language settings, which are a part of
    the common preprocess settings, schema-only.

    The language parameters are removed when settings are not loaded from
    a schema but from the common settings.
    """

    def _remove_schema_only(self, settings_dict):
        """
        Remove schema-only settings, then strip the language parameters.

        Every key containing ``"language"`` is removed in place from the
        parameters of each preprocessor stored under
        ``data["storedsettings"]["preprocessors"]``.
        """
        super()._remove_schema_only(settings_dict)
        for _, data, _ in self.provider.traverse_settings(data=settings_dict):
            # tolerate data without stored preprocessors instead of
            # failing with a KeyError
            stored = data.get("storedsettings") or {}
            for _, params in stored.get("preprocessors", ()):
                # collect the keys first - the dict is modified in the loop
                for key in [k for k in params if "language" in k]:
                    del params[key]


PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
Expand Down Expand Up @@ -1128,12 +1136,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical

def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
self.__store_pending_languages()

box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
Expand All @@ -1151,6 +1161,16 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)

def set_model(self, pmodel):
    """Set the model and connect the handler of inserted rows to it"""
    super().set_model(pmodel)
    if not pmodel:
        return
    # newly inserted editors get the language from the corpus
    pmodel.rowsInserted.connect(self.__on_item_inserted)

def __on_item_inserted(self, _, first: int, last: int):
    """
    Set the language from corpus for every inserted editor.

    ``first`` and ``last`` are handled as the inclusive bounds of the
    inserted rows.
    """
    # handle the whole range instead of asserting a single-row insert;
    # an assert is stripped under -O and must not guard runtime behaviour
    for row in range(first, last + 1):
        self.__set_languages_single_editor(row)

def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
Expand Down Expand Up @@ -1180,6 +1200,49 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
def set_data(self, data: Corpus):
    """Handle new input data: cancel, store the corpus, set languages"""
    self.cancel()
    self.data = data
    # propagate the language of the new corpus to the editors
    self.__set_languages()

# Names of the language parameters, keyed by the qualified name of the
# preprocessor, that are set to the language of the corpus.
LANG_PARAMS = {
    "preprocess.normalize": [
        "snowball_language",
        "udpipe_language",
        "lemmagen_language"
    ],
    "preprocess.filter": ["language"],
}

def __store_pending_languages(self):
    """
    Remember which language parameters are present in the stored settings.

    The result maps the name of each stored preprocessor to the set of its
    parameter names that contain ``"language"``.
    """
    pending = {}
    for pp_name, params in self.storedsettings["preprocessors"]:
        pending[pp_name] = {name for name in params if "language" in name}
    self.__pending_languages = pending

def __set_languages(self):
    """
    Set the language from corpus on every editor when data is present.

    The pending languages are cleared afterwards.
    """
    if self.data is not None:
        for i in range(self.preprocessormodel.rowCount()):
            self.__set_languages_single_editor(i)
        # NOTE(review): the reset is assumed to belong inside the data
        # check - confirm the nesting against the original file
        self.__pending_languages = {}

def __set_languages_single_editor(self, item_index: int):
    """
    Apply the language of the corpus to a single editor/module.

    Parameters that are pending for the module keep their value, since
    they come from the schema. Nothing is done when there is no data or
    the corpus has no language.
    """
    if not (self.data and self.data.language):
        return

    pmodel = self.preprocessormodel
    editor_item = pmodel.item(item_index)
    qualname = editor_item.data(DescriptionRole).qualname
    parameters = editor_item.data(ParametersRole)
    from_schema = self.__pending_languages.get(qualname, set())
    lang_params = self.LANG_PARAMS.get(qualname, [])
    # pending parameters came from the schema and must not be changed
    for name in (p for p in lang_params if p not in from_schema):
        parameters[name] = self.data.language
    # the disconnection prevents a double apply call - apply is already
    # called on new data and on inserted row, the two callers of this method
    with disconnected(pmodel.dataChanged, self.__on_modelchanged):
        editor_item.setData(parameters, ParametersRole)

def buildpreproc(self) -> PreprocessorList:
plist = []
Expand Down
133 changes: 129 additions & 4 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import numpy as np
from Orange.data import Domain, StringVariable
from Orange.widgets.data.utils.preprocess import DescriptionRole, ParametersRole
from PyQt6.QtGui import QStandardItem, QIcon
from orangewidget.utils.filedialogs import RecentPath
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate
Expand Down Expand Up @@ -180,6 +182,123 @@ def test_no_tokens_left(self):
self.wait_until_finished()
self.assertFalse(self.widget.Warning.no_token_left.is_shown())

def test_language_from_corpus(self):
    """Test that editors take the language of the input corpus"""
    initial = {
        "name": "",
        "preprocessors": [("preprocess.normalize", {}), ("preprocess.filter", {})],
    }
    self.widget.storedsettings = initial
    self.widget._initialize()
    self.assertDictEqual(initial, self.widget.storedsettings)
    combos = self.widget.mainArea.findChildren(LanguageComboBox)

    def selected():
        return [combo.currentText() for combo in combos]

    self.assertEqual(["English"] * 4, selected())

    # the scenarios build on each other - the order matters
    scenarios = [
        # Slovenian - set for all preprocessors except Snowball, which
        # doesn't support Slovenian
        ("sl", ["English", "Slovenian", "Slovenian", "Slovenian"]),
        # Dutch - supported by two preprocessors
        ("nl", ["Dutch", "Slovenian", "Slovenian", "Dutch"]),
        # language not supported by any preprocessor - nothing changes
        ("bo", ["Dutch", "Slovenian", "Slovenian", "Dutch"]),
        # missing language - nothing changes
        (None, ["Dutch", "Slovenian", "Slovenian", "Dutch"]),
    ]
    for language, expected in scenarios:
        self.corpus.attributes["language"] = language
        self.send_signal(self.widget.Inputs.corpus, self.corpus)
        self.assertEqual(expected, selected())

def test_language_from_schema(self):
    """Test language from schema/workflow is retained"""
    initial = {
        "name": "",
        "preprocessors": [
            (
                "preprocess.normalize",
                {
                    "lemmagen_language": "sl",
                    "snowball_language": "nl",
                    "udpipe_language": "lt",
                },
            ),
            ("preprocess.filter", {"language": "nl"}),
        ],
    }
    self.widget.storedsettings = initial

    # pack the settings of the existing widget and create a fresh widget
    # from them
    settings = self.widget.settingsHandler.pack_data(self.widget)
    widget = self.create_widget(OWPreprocess, stored_settings=settings)
    self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
    # after the corpus is sent, the stored settings and the selected
    # languages must still match `initial`
    self.assertDictEqual(initial, widget.storedsettings)
    combos = widget.mainArea.findChildren(LanguageComboBox)
    self.assertEqual(
        ["Dutch", "Lithuanian", "Slovenian", "Dutch"],
        [c.currentText() for c in combos]
    )

def test_language_from_corpus_editor_inserted(self):
    """Test language from corpus is set to new editor too"""
    initial = {
        "name": "",
        "preprocessors": [("preprocess.filter", {})],
    }
    self.widget.storedsettings = initial
    self.widget._initialize()
    self.assertDictEqual(initial, self.widget.storedsettings)
    combos = self.widget.mainArea.findChildren(LanguageComboBox)
    self.assertEqual(
        ["English"],
        [c.currentText() for c in combos]
    )

    # insert data - language of stopwords combo should change to Slovenian
    self.corpus.attributes["language"] = "sl"
    self.send_signal(self.widget.Inputs.corpus, self.corpus)
    self.assertEqual(
        ["Slovenian"],
        [c.currentText() for c in combos]
    )

    # insert new editor - all languages except snowball should be set to Slovenian
    pp_def = self.widget._qname2ppdef["preprocess.normalize"]
    description = pp_def.description
    item = QStandardItem(description.title)
    icon = QIcon(description.icon)
    item.setIcon(icon)
    item.setToolTip(description.summary)
    item.setData(pp_def, DescriptionRole)
    item.setData({}, ParametersRole)
    self.widget.preprocessormodel.insertRow(0, [item])
    self.wait_until_finished()

    # collect the combos again - the inserted editor added new ones
    combos = self.widget.mainArea.findChildren(LanguageComboBox)
    self.assertEqual(
        ['Slovenian', 'English', 'Slovenian', 'Slovenian'],
        [c.currentText() for c in combos]
    )


@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):
Expand Down Expand Up @@ -983,14 +1102,20 @@ def test_set_current_language(self):
self.assertEqual("Portuguese", cb.currentText())
cb.set_current_language("sl")
self.assertEqual("Slovenian", cb.currentText())
cb.set_current_language("abc") # should set to default
self.assertEqual("English", cb.currentText())
cb.set_current_language("abc") # language not in list - keep current selection
self.assertEqual("Slovenian", cb.currentText())

def test_set_language_to_default(self):
"""In case current item not in dropdown anymore set language to default"""
mock = Mock()
cb = UDPipeComboBox(None, "pt", "en", mock)
self.assertEqual("Portuguese", cb.currentText())
# when no default language in the dropdown set to first
cb.removeItem(0)
x = cb._UDPipeComboBox__items
cb._UDPipeComboBox__items = x[:3] + x[4:]
cb.set_current_language("abc")
self.assertEqual("English (lines)", cb.currentText())
cb.showPopup()
self.assertEqual("English", cb.currentText())

def test_change_item(self):
mock = Mock()
Expand Down

0 comments on commit dbd7af3

Please sign in to comment.