Skip to content

Commit

Permalink
Preprocess widget - Language from corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Nov 24, 2023
1 parent 7a46c80 commit 723d077
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 32 deletions.
4 changes: 3 additions & 1 deletion orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,12 @@ def online(self):
except ConnectionError:
return False

# todo: cleanup
# use _ since - is already used in iso standard
VARIATION_DELIMITER = "_"

def language_to_iso(self, language):
@staticmethod
def language_to_iso(language):
if "(" in language:
language, model = language.split("(")
language = LANG2ISO[language.strip()]
Expand Down
117 changes: 104 additions & 13 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from AnyQt.QtGui import QBrush, QValidator

from Orange.util import wrap_callback
from orangecanvas.gui.utils import disconnected
from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath

import Orange.widgets.data.owpreprocess
Expand All @@ -27,7 +29,7 @@
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration, UDPipeModels
from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \
POSTagger

Expand Down Expand Up @@ -121,8 +123,9 @@ def __init__(self, master: BaseEditor, value: str, default: str,
super().__init__(master)
self.__items = [] # type: List
self.__default_lang = default
self.add_items(value)
self.currentTextChanged.connect(callback)
self.add_items(UDPipeModels.iso_to_language(value))
self.currentTextChanged.connect(self.__text_changed)
self.callback = callback
self.setMinimumWidth(80)

@property
Expand All @@ -145,6 +148,12 @@ def showPopup(self):
self.add_items(self.currentText())
super().showPopup()

def __text_changed(self, language):
    """Forward a changed combo-box text to the stored callback as an ISO code.

    ``language`` is the text delivered by ``currentTextChanged``. It is
    converted with ``UDPipeModels.language_to_iso`` and the result is passed
    to the callback that was given to ``__init__``.
    """
    # language_to_iso is a static method of UDPipeModels, so there is no need
    # to construct a UDPipeLemmatizer on every selection change.
    self.callback(UDPipeModels.language_to_iso(language))

def set_current_language(self, iso_language: str):
    """Make the entry corresponding to ``iso_language`` the current text.

    The ISO code is translated with ``UDPipeModels.iso_to_language`` and the
    translated value is handed to ``setCurrentText``.
    """
    language_name = UDPipeModels.iso_to_language(iso_language)
    self.setCurrentText(language_name)


class RangeSpins(QHBoxLayout):
SpinBox = QSpinBox
Expand Down Expand Up @@ -475,19 +484,24 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_LANGUAGE = "English"
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

SNOWBALL_LANGUAGES = SnowballStemmer.supported_languages
LEMMAGEN_LANGUAGES = LemmagenLemmatizer.supported_languages

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_LANGUAGE
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

self.__combo_sbl = ComboBox(
self, SnowballStemmer.supported_languages,
self.__snowball_lang, self.__set_snowball_lang
self.__combo_sbl = LanguageComboBox(
self,
self.SNOWBALL_LANGUAGES,
self.__snowball_lang,
self.__set_snowball_lang,
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_LANGUAGE,
Expand All @@ -496,9 +510,11 @@ def __init__(self, parent=None, **kwargs):
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
self, LemmagenLemmatizer.lemmagen_languages,
self.__lemmagen_lang, self.__set_lemmagen_lang
self.__combo_lemm = LanguageComboBox(
self,
self.LEMMAGEN_LANGUAGES,
self.__lemmagen_lang,
self.__set_lemmagen_lang,
)

label = QLabel("Language:")
Expand Down Expand Up @@ -546,23 +562,23 @@ def _set_method(self, method: int):
def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
self.__combo_sbl.setCurrentText(language)
self.__combo_sbl.set_current_language(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()

def __set_udpipe_lang(self, language: str):
if self.__udpipe_lang != language:
self.__udpipe_lang = language
self.__combo_udl.setCurrentText(language)
self.__combo_udl.set_current_language(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()

def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
self.__combo_lemm.setCurrentText(language)
self.__combo_lemm.set_current_language(language)
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()
Expand Down Expand Up @@ -1038,6 +1054,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()


class PreprocessSettingsHandler(SettingsHandler):
    """
    Settings handler that treats the language settings stored inside the
    common preprocess settings as schema-only: after the regular schema-only
    removal, every preprocessor parameter whose key contains "language" is
    dropped from ``storedsettings["preprocessors"]``.
    """

    def _remove_schema_only(self, settings_dict):
        """Run the parent removal, then strip language parameters in place.

        ``settings_dict`` is modified in place; nothing is returned.
        """
        super()._remove_schema_only(settings_dict)
        for _, data, _ in self.provider.traverse_settings(data=settings_dict):
            # Tolerate settings dicts without stored preprocessors instead of
            # failing with a KeyError while the settings are being cleaned.
            stored = data.get("storedsettings") or {}
            for _, params in stored.get("preprocessors", ()):
                # collect the keys first: params is mutated while cleaning
                for key in [k for k in params if "language" in k]:
                    del params[key]


PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
Expand Down Expand Up @@ -1121,12 +1152,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical

def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
self.__store_pending_languages()

box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
Expand All @@ -1144,6 +1177,12 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)

def set_model(self, pmodel):
    """Set the preprocessor model, hooking up row-insertion handling first.

    When ``pmodel`` is truthy, its ``rowsInserted`` signal is connected to
    ``__on_item_inserted`` before the model is passed on to the parent
    implementation of ``set_model``.
    """
    if pmodel:
        pmodel.rowsInserted.connect(self.__on_item_inserted)
    super().set_model(pmodel)


def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
Expand All @@ -1169,10 +1208,56 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
search_paths, **kwargs)
return path

def __on_item_inserted(self, _, first: int, last: int):
    """Set language parameters on rows that were just inserted in the model.

    Slot for the model's ``rowsInserted`` signal (connected in
    ``set_model``). ``first`` and ``last`` delimit the inserted rows,
    inclusive. Each inserted row gets its language parameters set, then
    ``storedsettings`` is refreshed from the model.
    """
    # Iterate instead of asserting ``first == last``: asserts are removed
    # under ``python -O``, which would silently leave every row after the
    # first one without language settings.
    for row in range(first, last + 1):
        self.__set_languages_single_item(row)
    self.storedsettings = self.save(self.preprocessormodel)

@Inputs.corpus
def set_data(self, data: Corpus):
    """Handle a new value on the corpus input.

    Calls ``self.cancel()``, stores ``data`` on the widget and then lets
    ``__set_languages`` update the language parameters of the preprocessors
    (which does nothing when ``data`` is ``None``).
    """
    self.cancel()
    self.data = data
    self.__set_languages()

# Language parameters per preprocessor qualname. Each entry pairs the name of
# a language parameter with the languages supported for it; the consumer
# (__set_languages_single_item) also accepts a callable returning them.
# NOTE(review): UDPipeLemmatizer() is instantiated when this class body is
# executed - confirm that this is cheap and needs no network access.
LANG_PARAMS = {
    "preprocess.normalize": [
        ("snowball_language", NormalizationModule.SNOWBALL_LANGUAGES),
        ("udpipe_language", UDPipeLemmatizer().models.supported_languages_iso),
        ("lemmagen_language", NormalizationModule.LEMMAGEN_LANGUAGES),
    ],
    "preprocess.filter": [("language", FilteringModule.STOP_WORDS_LANGUAGES)],
}

def __store_pending_languages(self):
    """Remember the language parameters present in the stored settings.

    Fills ``__pending_languages`` with ``{preprocessor name: {parameter:
    value}}`` for every language parameter (as listed in ``LANG_PARAMS``)
    that appears in ``storedsettings["preprocessors"]``.
    """
    self.__pending_languages = pending = defaultdict(dict)
    for pp_name, params in self.storedsettings["preprocessors"]:
        lang_params = (name for name, _ in self.LANG_PARAMS.get(pp_name, []))
        found = {name: params[name] for name in lang_params if name in params}
        if found:
            # only create an entry for preprocessors that carry a language
            pending[pp_name].update(found)

def __set_languages(self):
    """Set language parameters on every preprocessor in the model.

    Does nothing while no data is set. Otherwise each row of the model is
    processed, the pending languages are cleared and ``storedsettings`` is
    refreshed from the model.
    """
    if self.data is None:
        return
    model = self.preprocessormodel
    for row in range(model.rowCount()):
        self.__set_languages_single_item(row)
    self.__pending_languages = {}
    self.storedsettings = self.save(model)

def __set_languages_single_item(self, item_index: int):
    """Set the language parameters of the preprocessor at ``item_index``.

    For each language parameter listed in ``LANG_PARAMS`` for that
    preprocessor: a pending value (see ``__store_pending_languages``) takes
    precedence; otherwise the language of the current corpus is used when it
    is among the languages supported for that parameter. Parameters matching
    neither case are left unchanged.
    """
    item = self.preprocessormodel.item(item_index)
    pp_name = item.data(DescriptionRole).qualname
    params = item.data(ParametersRole)
    pending = self.__pending_languages.get(pp_name, {})
    # This method is also reached from the rowsInserted handler, which does
    # not check for data, so self.data may be None here.
    corpus_language = self.data.language if self.data is not None else None
    for param, sup_lang in self.LANG_PARAMS.get(pp_name, []):
        if param in pending:
            params[param] = pending[param]
        else:
            # supported languages may be given as a callable producing them
            sup_lang = sup_lang() if callable(sup_lang) else sup_lang
            if corpus_language and corpus_language in sup_lang:
                params[param] = corpus_language
    with disconnected(self.preprocessormodel.dataChanged, self.__on_modelchanged):
        # dataChanged is kept disconnected while the parameters are written
        # to prevent a double apply call; per the original note, both callers
        # of this method trigger apply afterwards.
        item.setData(params, ParametersRole)

def buildpreproc(self) -> PreprocessorList:
plist = []
Expand Down Expand Up @@ -1380,6 +1465,12 @@ def str_into_paths(label):
preprocessors = settings["storedsettings"]["preprocessors"]
for pp_name, pp in preprocessors:
if pp_name == "preprocess.filter" and "language" in pp:
for k in ("snowball_language", "lemmagen_language"):
if k in pp:
pp[k] = LANG2ISO[pp[k]]
up_lang = "udpipe_language"
if up_lang in pp:
pp[up_lang] = UDPipeModels.language_to_iso(pp[up_lang])
if pp["language"] == _DEFAULT_NONE:
pp["language"] = None
else:
Expand Down
95 changes: 77 additions & 18 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,70 @@ def test_no_tokens_left(self):
self.wait_until_finished()
self.assertFalse(self.widget.Warning.no_token_left.is_shown())

def test_language_from_corpus(self):
    """Language from corpus is set correctly"""
    pp_names = (
        "preprocess.transform",
        "preprocess.tokenize",
        "preprocess.normalize",
        "preprocess.filter",
    )
    initial = {
        "name": "",
        "preprocessors": [(pp_name, {}) for pp_name in pp_names],
    }

    def send_with_language(language):
        # set the corpus language and push the corpus to the widget
        self.corpus.attributes["language"] = language
        self.send_signal(self.widget.Inputs.corpus, self.corpus)

    def assert_languages(lemmagen, snowball, udpipe, stop_words):
        # normalize is the third and filter the fourth stored preprocessor
        preprocessors = self.widget.storedsettings["preprocessors"]
        normalize_settings = preprocessors[2][1]
        filter_settings = preprocessors[3][1]
        self.assertEqual(lemmagen, normalize_settings["lemmagen_language"])
        self.assertEqual(snowball, normalize_settings["snowball_language"])
        self.assertEqual(udpipe, normalize_settings["udpipe_language"])
        self.assertEqual(stop_words, filter_settings["language"])

    self.widget.storedsettings = initial
    self.widget._initialize()
    self.assertDictEqual(initial, self.widget.storedsettings)

    # nothing should change since language is missing in corpus
    send_with_language(None)
    self.assertDictEqual(initial, self.widget.storedsettings)

    send_with_language("en")
    assert_languages("en", "en", "en", "en")

    # language not supported by all preprocessors
    send_with_language("nl")
    assert_languages("en", "nl", "en", "nl")

def test_language_from_schema(self):
    """Test language from schema/workflow is retained"""
    normalize_params = {
        "lemmagen_language": "sl",
        "snowball_language": "nl",
        "udpipe_language": "lt",
    }
    filter_params = {"language": "nl"}
    initial = {
        "name": "",
        "preprocessors": [
            ("preprocess.transform", {}),
            ("preprocess.tokenize", {}),
            ("preprocess.normalize", normalize_params),
            ("preprocess.filter", filter_params),
        ],
    }
    self.widget.storedsettings = initial

    # pack the settings as a workflow would and load them in a new widget
    packed = self.widget.settingsHandler.pack_data(self.widget)
    restored = self.create_widget(OWPreprocess, stored_settings=packed)
    self.send_signal(restored.Inputs.corpus, self.corpus, widget=restored)
    self.assertDictEqual(initial, restored.storedsettings)


@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestOWPreprocessMigrateSettings(WidgetTest):
Expand All @@ -205,8 +269,8 @@ def test_migrate_settings_normalize(self):
"udpipe_tokenizer": True}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.normalize",
{"method": 2, "snowball_language": "French",
"udpipe_language": "German", "udpipe_tokenizer": True})]
{"method": 2, "snowball_language": "fr",
"udpipe_language": "de", "udpipe_tokenizer": True})]
self.assertEqual(widget.storedsettings["preprocessors"], params)

def test_migrate_settings_filter(self):
Expand Down Expand Up @@ -460,17 +524,17 @@ def test_init(self):

def test_parameters(self):
params = {"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "English",
"snowball_language": "en",
"udpipe_language": "en",
"lemmagen_language": "en",
"udpipe_tokenizer": False}
self.assertDictEqual(self.editor.parameters(), params)

def test_set_parameters(self):
params = {"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"udpipe_language": "Slovenian",
"lemmagen_language": "Bulgarian",
"snowball_language": "nl",
"udpipe_language": "sl",
"lemmagen_language": "bg",
"udpipe_tokenizer": True}
self.editor.setParameters(params)
self.assertDictEqual(self.editor.parameters(), params)
Expand All @@ -486,20 +550,19 @@ def test_createinstance(self):
params = {"method": NormalizationModule.Snowball}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertIn("<EnglishStemmer>", str(pp.normalizer))
self.assertEqual("en", pp._language)

params = {"method": NormalizationModule.Snowball,
"snowball_language": "Dutch"}
params = {"method": NormalizationModule.Snowball, "snowball_language": "nl"}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertIn("<DutchStemmer>", str(pp.normalizer))
self.assertEqual("nl", pp._language)

params = {"method": NormalizationModule.UDPipe,
"udpipe_language": "Finnish",
"udpipe_language": "fi",
"udpipe_tokenizer": True}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, UDPipeLemmatizer)
self.assertEqual(pp._UDPipeLemmatizer__language, "Finnish")
self.assertEqual(pp._language, "fi")
self.assertEqual(pp._UDPipeLemmatizer__use_tokenizer, True)

def test_repr(self):
Expand Down Expand Up @@ -738,10 +801,6 @@ def test_createinstance(self):
pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt})
self.assertIsInstance(pp, MaxEntTagger)

# TODO - implement StanfordPOSTagger
# pp = self.editor.createinstance({"method": POSTaggingModule.Stanford})
# self.assertIsInstance(pp, StanfordPOSTagger)

def test_repr(self):
self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")

Expand Down

0 comments on commit 723d077

Please sign in to comment.