diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index ede04d90c..36b97b13d 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -40,6 +40,7 @@ from Orange.widgets.widget import Input, Output, Msg, Message from orangecontrib.text import Corpus +from orangecontrib.text.language import ISO2LANG, LANG2ISO from orangecontrib.text.misc import nltk_data_dir from orangecontrib.text.preprocess import * from orangecontrib.text.preprocess.normalize import UDPipeStopIteration @@ -78,6 +79,8 @@ def __init__(self, master: BaseEditor, items: List[str], value: str, self.setCurrentText(value) self.currentTextChanged.connect(callback) + # todo: idea: use userRole to store the ISO code, then have the callback return the ISO code + class UDPipeComboBox(QComboBox): def __init__(self, master: BaseEditor, value: str, default: str, @@ -439,7 +442,7 @@ class NormalizationModule(SingleMethodModule): UDPipe: UDPipeLemmatizer, Lemmagen: LemmagenLemmatizer} DEFAULT_METHOD = Porter - DEFAULT_LANGUAGE = "English" + DEFAULT_LANGUAGE = "en" DEFAULT_USE_TOKE = False def __init__(self, parent=None, **kwargs): @@ -450,7 +453,7 @@ def __init__(self, parent=None, **kwargs): self.__use_tokenizer = self.DEFAULT_USE_TOKE self.__combo_sbl = ComboBox( - self, SnowballStemmer.supported_languages, + self, [ISO2LANG[lang] for lang in SnowballStemmer.supported_languages], self.__snowball_lang, self.__set_snowball_lang ) self.__combo_udl = UDPipeComboBox( @@ -461,7 +464,7 @@ def __init__(self, parent=None, **kwargs): checked=self.DEFAULT_USE_TOKE) self.__check_use.clicked.connect(self.__set_use_tokenizer) self.__combo_lemm = ComboBox( - self, LemmagenLemmatizer.supported_languages, + self, [ISO2LANG[lang] for lang in LemmagenLemmatizer.supported_languages], self.__lemmagen_lang, self.__set_lemmagen_lang ) @@ -508,24 +511,27 @@ def _set_method(self, method: int): self.__enable_udpipe() def __set_snowball_lang(self, 
language: str): - if self.__snowball_lang != language: - self.__snowball_lang = language + iso_language = LANG2ISO[language] + if self.__snowball_lang != iso_language: + self.__snowball_lang = iso_language self.__combo_sbl.setCurrentText(language) self.changed.emit() if self.method == self.Snowball: self.edited.emit() def __set_udpipe_lang(self, language: str): - if self.__udpipe_lang != language: - self.__udpipe_lang = language + iso_language = LANG2ISO[language] + if self.__udpipe_lang != iso_language: + self.__udpipe_lang = iso_language self.__combo_udl.setCurrentText(language) self.changed.emit() if self.method == self.UDPipe: self.edited.emit() def __set_lemmagen_lang(self, language: str): - if self.__lemmagen_lang != language: - self.__lemmagen_lang = language + iso_language = LANG2ISO[language] + if self.__lemmagen_lang != iso_language: + self.__lemmagen_lang = iso_language self.__combo_lemm.setCurrentText(language) self.changed.emit() if self.method == self.Lemmagen: @@ -586,7 +592,7 @@ class FilteringModule(MultipleMethodModule): MostFreq: MostFrequentTokensFilter, PosTag: PosTagFilter} DEFAULT_METHODS = [Stopwords] - DEFAULT_LANG = "English" + DEFAULT_LANG = "en" DEFAULT_NONE = None DEFAULT_INCL_NUM = False DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \ @@ -772,8 +778,9 @@ def setParameters(self, params: Dict): self.__invalidated = False def __set_language(self, language: str): - if self.__sw_lang != language: - self.__sw_lang = language + iso_language = LANG2ISO[language] + if self.__sw_lang != iso_language: + self.__sw_lang = iso_language self.__combo.setCurrentText(language) self.changed.emit() if self.Stopwords in self.methods: @@ -1042,7 +1049,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess, priority = 200 keywords = [] - settings_version = 3 + settings_version = 4 class Inputs: corpus = Input("Corpus", Corpus) @@ -1081,6 +1088,7 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning): 
PREPROCESSORS = PREPROCESS_ACTIONS DEFAULT_PP = {"preprocessors": [("preprocess.transform", {}), ("preprocess.tokenize", {}), + ("preprocess.normalize", {}), ("preprocess.filter", {})] } # type: Dict[str, List[Tuple[str, Dict]]] storedsettings = Setting(DEFAULT_PP) @@ -1133,6 +1141,9 @@ def __relocate_file(self, path: RecentPath) -> RecentPath: @Inputs.corpus def set_data(self, data: Corpus): + # todo: language set from corpus + # since they cannot be schema only reset them every time according to language + # corpus except on first run self.cancel() self.data = data @@ -1338,6 +1349,19 @@ def str_into_paths(label): del pp_settings["start"] del pp_settings["end"] + # before version 4 languages were saved as full-word language strings + if version < 4: + preprocessors = settings["storedsettings"]["preprocessors"] + if "preprocess.normalize" in preprocessors: + pp_settings = preprocessors["preprocess.normalize"] + for k in ("snowball_language", "udpipe_language", "lemmagen_language"): + pp_settings[k] = LANG2ISO[pp_settings[k]] + pp_settings[k] = LANG2ISO[pp_settings[k]] + pp_settings[k] = LANG2ISO[pp_settings[k]] + if "preprocess.filter" in preprocessors: + pp_settings = preprocessors["preprocess.filter"] + pp_settings["language"] = LANG2ISO[pp_settings["language"]] + if __name__ == "__main__": from Orange.widgets.utils.widgetpreview import WidgetPreview