From 4206b70d61dd52b8ae0755b86b970db36f37ed67 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 21 Apr 2023 16:20:44 +0200
Subject: [PATCH] preproc
---
orangecontrib/text/widgets/owpreprocess.py | 50 ++++++++++++++++------
1 file changed, 37 insertions(+), 13 deletions(-)
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index ede04d90c..36b97b13d 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -40,6 +40,7 @@
from Orange.widgets.widget import Input, Output, Msg, Message
from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
@@ -78,6 +79,8 @@ def __init__(self, master: BaseEditor, items: List[str], value: str,
self.setCurrentText(value)
self.currentTextChanged.connect(callback)
+ # todo: idea - store the ISO code under userRole so that the callback returns the ISO code
+
class UDPipeComboBox(QComboBox):
def __init__(self, master: BaseEditor, value: str, default: str,
@@ -439,7 +442,7 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
- DEFAULT_LANGUAGE = "English"
+ DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False
def __init__(self, parent=None, **kwargs):
@@ -450,7 +453,7 @@ def __init__(self, parent=None, **kwargs):
self.__use_tokenizer = self.DEFAULT_USE_TOKE
self.__combo_sbl = ComboBox(
- self, SnowballStemmer.supported_languages,
+ self, [ISO2LANG[lang] for lang in SnowballStemmer.supported_languages],
self.__snowball_lang, self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
@@ -461,7 +464,7 @@ def __init__(self, parent=None, **kwargs):
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
- self, LemmagenLemmatizer.supported_languages,
+ self, [ISO2LANG[lang] for lang in LemmagenLemmatizer.supported_languages],
self.__lemmagen_lang, self.__set_lemmagen_lang
)
@@ -508,24 +511,27 @@ def _set_method(self, method: int):
self.__enable_udpipe()
def __set_snowball_lang(self, language: str):
- if self.__snowball_lang != language:
- self.__snowball_lang = language
+ iso_language = LANG2ISO[language]
+ if self.__snowball_lang != iso_language:
+ self.__snowball_lang = iso_language
self.__combo_sbl.setCurrentText(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()
def __set_udpipe_lang(self, language: str):
- if self.__udpipe_lang != language:
- self.__udpipe_lang = language
+ iso_language = LANG2ISO[language]
+ if self.__udpipe_lang != iso_language:
+ self.__udpipe_lang = iso_language
self.__combo_udl.setCurrentText(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()
def __set_lemmagen_lang(self, language: str):
- if self.__lemmagen_lang != language:
- self.__lemmagen_lang = language
+ iso_language = LANG2ISO[language]
+ if self.__lemmagen_lang != iso_language:
+ self.__lemmagen_lang = iso_language
self.__combo_lemm.setCurrentText(language)
self.changed.emit()
if self.method == self.Lemmagen:
@@ -586,7 +592,7 @@ class FilteringModule(MultipleMethodModule):
MostFreq: MostFrequentTokensFilter,
PosTag: PosTagFilter}
DEFAULT_METHODS = [Stopwords]
- DEFAULT_LANG = "English"
+ DEFAULT_LANG = "en"
DEFAULT_NONE = None
DEFAULT_INCL_NUM = False
DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \
@@ -772,8 +778,9 @@ def setParameters(self, params: Dict):
self.__invalidated = False
def __set_language(self, language: str):
- if self.__sw_lang != language:
- self.__sw_lang = language
+ iso_language = LANG2ISO[language]
+ if self.__sw_lang != iso_language:
+ self.__sw_lang = iso_language
self.__combo.setCurrentText(language)
self.changed.emit()
if self.Stopwords in self.methods:
@@ -1042,7 +1049,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess,
priority = 200
keywords = []
- settings_version = 3
+ settings_version = 4
class Inputs:
corpus = Input("Corpus", Corpus)
@@ -1081,6 +1088,7 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
PREPROCESSORS = PREPROCESS_ACTIONS
DEFAULT_PP = {"preprocessors": [("preprocess.transform", {}),
("preprocess.tokenize", {}),
+ ("preprocess.normalize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
storedsettings = Setting(DEFAULT_PP)
@@ -1133,6 +1141,9 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
@Inputs.corpus
def set_data(self, data: Corpus):
+ # todo: set the language from the corpus
+ # since they (the language settings) cannot be schema-only, reset them every
+ # time according to the corpus language, except on the first run
self.cancel()
self.data = data
@@ -1338,6 +1349,19 @@ def str_into_paths(label):
del pp_settings["start"]
del pp_settings["end"]
+ # before version 4 languages were saved as full-word language strings
+ if version < 4:
+ preprocessors = settings["storedsettings"]["preprocessors"]
+ if "preprocess.normalize" in preprocessors:
+ pp_settings = preprocessors["preprocess.normalize"]
+ for k in ("snowball_language", "udpipe_language", "lemmagen_language"):
+ pp_settings[k] = LANG2ISO[pp_settings[k]]
+ pp_settings[k] = LANG2ISO[pp_settings[k]]
+ pp_settings[k] = LANG2ISO[pp_settings[k]]
+ if "preprocess.filter" in preprocessors:
+ pp_settings = preprocessors["preprocess.filter"]
+ pp_settings["language"] = LANG2ISO[pp_settings["language"]]
+
if __name__ == "__main__":
from Orange.widgets.utils.widgetpreview import WidgetPreview