Skip to content

Commit

Permalink
Preprocess widget - Language from corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 27, 2024
1 parent 64eee89 commit 9a47c0c
Show file tree
Hide file tree
Showing 6 changed files with 254 additions and 30 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/keywords/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

# todo: refactor when refactoring language for keywords module
# this is a temporary solution since supported_languages now returns lang ISO codes
RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()]
RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages]
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
"Arabic": "ar",
Expand Down
5 changes: 3 additions & 2 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@ def lang_to_iso(language: str) -> str:
"""
return LANG2ISO[StopwordsFilter.NLTK2LANG.get(language, language)]

@staticmethod
@classmethod
@property
@wait_nltk_data
def supported_languages() -> Set[str]:
def supported_languages(_) -> Set[str]:
"""
List all languages supported by NLTK
Expand Down
4 changes: 4 additions & 0 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ def __files_to_dict(self, files: List[Tuple[str]]) -> Dict[str, Tuple[str, str]]
def supported_languages(self) -> List[Tuple[str, str]]:
    """Return a ``(name, iso)`` pair for every entry of ``self.model_files``.

    ``self.model_files`` is iterated as ``iso -> (name, <second item>)``;
    the second item of each value is ignored here.
    """
    languages = []
    for iso, model_entry in self.model_files.items():
        name, _ = model_entry
        languages.append((name, iso))
    return languages

@property
def supported_languages_iso(self) -> set:
    """Return the set of ISO codes found in ``self.supported_languages``.

    ``self.supported_languages`` yields ``(name, iso)`` pairs; only the
    second item of every pair is kept, and duplicates collapse because the
    result is a set of strings rather than a list of tuples.
    """
    return {iso for _, iso in self.supported_languages}

@property
def online(self) -> bool:
try:
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ def test_stopwords_slovene(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_supported_languages(self):
langs = preprocess.StopwordsFilter.supported_languages()
langs = preprocess.StopwordsFilter.supported_languages
self.assertIsInstance(langs, set)
# only a few of the most important languages are tested, so that the test
# stays robust against any languages NLTK may introduce in the future
Expand Down
108 changes: 86 additions & 22 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from AnyQt.QtGui import QBrush, QValidator

from Orange.util import wrap_callback
from orangecanvas.gui.utils import disconnected
from orangewidget.settings import SettingsHandler
from orangewidget.utils.filedialogs import RecentPath

import Orange.widgets.data.owpreprocess
Expand Down Expand Up @@ -113,7 +115,10 @@ def set_current_language(self, iso_language: Optional[str]):
The ISO language code of element to be selected.
"""
index = self.findData(iso_language)
self.setCurrentIndex(index)
if index >= 0:
self.setCurrentIndex(index)
else:
self.index_changed(self.currentIndex())


class UDPipeComboBox(LanguageComboBox):
Expand All @@ -131,15 +136,9 @@ def items(self) -> List:
def add_items(self, _, include_none: bool, language: str):
self.__items = self.items
super().add_items(self.__items, include_none, language)

def set_current_language(self, iso_language: Optional[str]):
iso_items = {iso for _, iso in self.__items}
if iso_language in iso_items:
super().set_current_language(iso_language)
elif self.__default_lang in iso_items:
if language not in iso_items and self.__default_lang in iso_items:
super().set_current_language(self.__default_lang)
elif self.__items:
self.setCurrentIndex(0)

def showPopup(self):
if self.__items != self.items:
Expand Down Expand Up @@ -538,13 +537,13 @@ def __enable_udpipe(self):
def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
self.__set_snowball_lang(snowball_lang)
self.__combo_sbl.set_current_language(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
self.__set_udpipe_lang(udpipe_lang)
self.__combo_udl.set_current_language(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
lemmagen_lang = params.get("lemmagen_language", self.DEFAULT_LANGUAGE)
self.__set_lemmagen_lang(lemmagen_lang)
self.__combo_lemm.set_current_language(lemmagen_lang)

def _set_method(self, method: int):
super()._set_method(method)
Expand All @@ -553,26 +552,22 @@ def _set_method(self, method: int):
def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
self.__combo_sbl.set_current_language(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()

def __set_udpipe_lang(self, language: str):
if self.__udpipe_lang != language:
self.__udpipe_lang = language
self.__combo_udl.set_current_language(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()

def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
self.__combo_lemm.set_current_language(language)
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()
self.__lemmagen_lang = language
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()

def __set_use_tokenizer(self, use: bool):
if self.__use_tokenizer != use:
Expand Down Expand Up @@ -658,7 +653,7 @@ def __init__(self, parent=None, **kwargs):

self.__combo = LanguageComboBox(
self,
StopwordsFilter.supported_languages(),
StopwordsFilter.supported_languages,
self.__sw_lang,
True,
self.__set_language,
Expand Down Expand Up @@ -795,7 +790,7 @@ def __spin_n_edited(self):

def setParameters(self, params: Dict):
super().setParameters(params)
self.__set_language(params.get("language", self.DEFAULT_LANG))
self.__combo.set_current_language(params.get("language", self.DEFAULT_LANG))
self.__set_sw_path(params.get("sw_path", self.DEFAULT_NONE),
params.get("sw_list", []))
self.__set_lx_path(params.get("lx_path", self.DEFAULT_NONE),
Expand All @@ -820,7 +815,6 @@ def setParameters(self, params: Dict):
def __set_language(self, language: Optional[str]):
if self.__sw_lang != language:
self.__sw_lang = language
self.__combo.set_current_language(language)
self.changed.emit()
if self.Stopwords in self.methods:
self.edited.emit()
Expand Down Expand Up @@ -1045,6 +1039,21 @@ def createinstance(params: Dict) -> POSTagger:
return POSTaggingModule.Methods[method]()


class PreprocessSettingsHandler(SettingsHandler):
    """
    Settings handler that treats every language setting kept inside the
    preprocessors' parameters as schema_only.

    After the regular schema-only clean-up, each parameter whose name
    contains "language" is dropped from all stored preprocessors, so these
    values survive only when settings are loaded from a schema and not when
    they come from common settings.
    """

    def _remove_schema_only(self, settings_dict):
        """Remove schema-only values, language parameters included, in place."""
        super()._remove_schema_only(settings_dict)
        traversal = self.provider.traverse_settings(data=settings_dict)
        for _setting, data, _unused in traversal:
            for _pp_name, pp_params in data["storedsettings"]["preprocessors"]:
                # collect the keys first: the dict must not change size
                # while it is being iterated
                language_keys = [key for key in pp_params if "language" in key]
                for key in language_keys:
                    pp_params.pop(key)


PREPROCESS_ACTIONS = [
PreprocessAction(
"Transformation", "preprocess.transform", "",
Expand Down Expand Up @@ -1128,12 +1137,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
("preprocess.tokenize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
settingsHandler = PreprocessSettingsHandler()
storedsettings = Setting(DEFAULT_PP)
buttons_area_orientation = Qt.Vertical

def __init__(self):
ConcurrentWidgetMixin.__init__(self)
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
self.__store_pending_languages()

box = gui.vBox(self.controlArea, "Preview")
self.preview = ""
Expand All @@ -1151,6 +1162,16 @@ def load(self, saved: Dict) -> StandardItemModel:
saved["preprocessors"][i] = (name, params)
return super().load(saved)

def set_model(self, pmodel):
    """Install the model and hook language handling onto row insertion.

    The model is first passed to the parent implementation; when it is
    truthy, its ``rowsInserted`` signal is connected to the handler that
    sets the language from the corpus for newly inserted items.
    """
    super().set_model(pmodel)
    if not pmodel:
        return
    pmodel.rowsInserted.connect(self.__on_item_inserted)

def __on_item_inserted(self, _, first: int, last: int):
    """Set the language from the corpus for every row reported as inserted.

    Slot for the model's ``rowsInserted`` signal, which reports an inclusive
    ``first``..``last`` row range. Every row of the range is handled, so an
    insertion of several rows is processed completely instead of failing an
    assertion (or, with assertions disabled, handling only the first row).

    Parameters
    ----------
    _
        Parent index sent by the signal; unused.
    first
        First inserted row.
    last
        Last inserted row (inclusive).
    """
    for row in range(first, last + 1):
        self.__set_languages_single_editor(row)

def __update_filtering_params(self, params: Dict):
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
Expand Down Expand Up @@ -1180,6 +1201,49 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
def set_data(self, data: Corpus):
    """
    Handle a new input corpus: call ``self.cancel()``, store the corpus in
    ``self.data`` and then update the editors' language parameters through
    ``__set_languages``.
    """
    self.cancel()
    self.data = data
    # must run after self.data is assigned - __set_languages reads it
    self.__set_languages()

# Language parameters of each preprocessor: maps the preprocessor's qualname
# to (parameter name, supported languages) pairs. A parameter is set to the
# corpus language only when that language is in the paired collection.
# NOTE(review): UDPipeModels() is instantiated when this table is evaluated;
# confirm that this is acceptable at import time.
LANG_PARAMS = {
    "preprocess.normalize": [
        ("snowball_language", SnowballStemmer.supported_languages),
        ("udpipe_language", UDPipeModels().supported_languages_iso),
        ("lemmagen_language", LemmagenLemmatizer.supported_languages),
    ],
    "preprocess.filter": [("language", StopwordsFilter.supported_languages)],
}

def __store_pending_languages(self):
    """Remember which language parameters the stored settings already hold.

    For every ``(name, parameters)`` pair in
    ``self.storedsettings["preprocessors"]``, the parameter names that
    contain "language" are collected into a set keyed by the preprocessor
    name.
    """
    pending = {}
    for pp_name, pp_params in self.storedsettings["preprocessors"]:
        pending[pp_name] = {name for name in pp_params if "language" in name}
    self.__pending_languages = pending

def __set_languages(self):
    """Set the language from the corpus for every item of the model.

    Does nothing while ``self.data`` is None. Otherwise each row of
    ``self.preprocessormodel`` is processed and the pending languages are
    cleared afterwards.
    """
    if self.data is None:
        return
    for row in range(self.preprocessormodel.rowCount()):
        self.__set_languages_single_editor(row)
    self.__pending_languages = {}

def __set_languages_single_editor(self, item_index: int):
    """
    Set language from corpus for single editor/module,
    keep language unchanged if it comes from schema (pending).

    Parameters
    ----------
    item_index
        Row of the item in ``self.preprocessormodel`` to update.
    """
    # nothing to do without data or when the data has no language
    if self.data and self.data.language:
        model = self.preprocessormodel
        item = model.item(item_index)
        pp_name = item.data(DescriptionRole).qualname
        params = item.data(ParametersRole)
        # language parameters of this preprocessor that must stay untouched
        pending = self.__pending_languages.get(pp_name, set())
        for param, available_langs in self.LANG_PARAMS.get(pp_name, []):
            if param not in pending and self.data.language in available_langs:
                # set language if not pending from schema - should not be changed
                # and if available for the method
                params[param] = self.data.language
        with disconnected(model.dataChanged, self.__on_modelchanged):
            # disconnecting prevents a double apply call; apply is already
            # called on new data and when a row is inserted, which are both
            # callers of this method
            item.setData(params, ParametersRole)

def buildpreproc(self) -> PreprocessorList:
plist = []
Expand Down
Loading

0 comments on commit 9a47c0c

Please sign in to comment.