Skip to content

Commit

Permalink
preproc
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Apr 21, 2023
1 parent 6459369 commit 4206b70
Showing 1 changed file with 37 additions and 13 deletions.
50 changes: 37 additions & 13 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from Orange.widgets.widget import Input, Output, Msg, Message

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
Expand Down Expand Up @@ -78,6 +79,8 @@ def __init__(self, master: BaseEditor, items: List[str], value: str,
self.setCurrentText(value)
self.currentTextChanged.connect(callback)

# TODO: idea - store the ISO code in the item's userRole data so that the callback returns the ISO code


class UDPipeComboBox(QComboBox):
def __init__(self, master: BaseEditor, value: str, default: str,
Expand Down Expand Up @@ -439,7 +442,7 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_LANGUAGE = "English"
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

def __init__(self, parent=None, **kwargs):
Expand All @@ -450,7 +453,7 @@ def __init__(self, parent=None, **kwargs):
self.__use_tokenizer = self.DEFAULT_USE_TOKE

self.__combo_sbl = ComboBox(
self, SnowballStemmer.supported_languages,
self, [ISO2LANG[lang] for lang in SnowballStemmer.supported_languages],
self.__snowball_lang, self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
Expand All @@ -461,7 +464,7 @@ def __init__(self, parent=None, **kwargs):
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
self, LemmagenLemmatizer.supported_languages,
self, [ISO2LANG[lang] for lang in LemmagenLemmatizer.supported_languages],
self.__lemmagen_lang, self.__set_lemmagen_lang
)

Expand Down Expand Up @@ -508,24 +511,27 @@ def _set_method(self, method: int):
self.__enable_udpipe()

def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
iso_language = LANG2ISO[language]
if self.__snowball_lang != iso_language:
self.__snowball_lang = iso_language
self.__combo_sbl.setCurrentText(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()

def __set_udpipe_lang(self, language: str):
if self.__udpipe_lang != language:
self.__udpipe_lang = language
iso_language = LANG2ISO[language]
if self.__udpipe_lang != iso_language:
self.__udpipe_lang = iso_language
self.__combo_udl.setCurrentText(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()

def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
iso_language = LANG2ISO[language]
if self.__lemmagen_lang != iso_language:
self.__lemmagen_lang = iso_language
self.__combo_lemm.setCurrentText(language)
self.changed.emit()
if self.method == self.Lemmagen:
Expand Down Expand Up @@ -586,7 +592,7 @@ class FilteringModule(MultipleMethodModule):
MostFreq: MostFrequentTokensFilter,
PosTag: PosTagFilter}
DEFAULT_METHODS = [Stopwords]
DEFAULT_LANG = "English"
DEFAULT_LANG = "en"
DEFAULT_NONE = None
DEFAULT_INCL_NUM = False
DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \
Expand Down Expand Up @@ -772,8 +778,9 @@ def setParameters(self, params: Dict):
self.__invalidated = False

def __set_language(self, language: str):
if self.__sw_lang != language:
self.__sw_lang = language
iso_language = LANG2ISO[language]
if self.__sw_lang != iso_language:
self.__sw_lang = iso_language
self.__combo.setCurrentText(language)
self.changed.emit()
if self.Stopwords in self.methods:
Expand Down Expand Up @@ -1042,7 +1049,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess,
priority = 200
keywords = []

settings_version = 3
settings_version = 4

class Inputs:
corpus = Input("Corpus", Corpus)
Expand Down Expand Up @@ -1081,6 +1088,7 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
PREPROCESSORS = PREPROCESS_ACTIONS
DEFAULT_PP = {"preprocessors": [("preprocess.transform", {}),
("preprocess.tokenize", {}),
("preprocess.normalize", {}),
("preprocess.filter", {})]
} # type: Dict[str, List[Tuple[str, Dict]]]
storedsettings = Setting(DEFAULT_PP)
Expand Down Expand Up @@ -1133,6 +1141,9 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:

@Inputs.corpus
def set_data(self, data: Corpus):
# TODO: set the languages from the corpus.
# Since they cannot be schema-only settings, reset them every time according
# to the corpus language, except on the first run.
self.cancel()
self.data = data

Expand Down Expand Up @@ -1338,6 +1349,19 @@ def str_into_paths(label):
del pp_settings["start"]
del pp_settings["end"]

# Before version 4, languages were stored as full language names
# (e.g. "English"); from version 4 on they are stored as ISO codes.
if version < 4:
    # Settings keys that hold a language, per preprocessor that has one.
    language_keys = {
        "preprocess.normalize": (
            "snowball_language", "udpipe_language", "lemmagen_language",
        ),
        "preprocess.filter": ("language",),
    }
    # "preprocessors" is a list of (name, params) pairs, not a dict, so it
    # must be iterated rather than indexed by preprocessor name.
    for pp_name, pp_settings in settings["storedsettings"]["preprocessors"]:
        for key in language_keys.get(pp_name, ()):
            # Params may be partial (even empty), so a key can be missing.
            # Convert each value exactly once: a second lookup would be
            # done with an ISO code instead of a language name.
            # NOTE(review): values without a LANG2ISO entry are left
            # unchanged - confirm this is the desired fallback.
            language = pp_settings.get(key)
            if language in LANG2ISO:
                pp_settings[key] = LANG2ISO[language]


if __name__ == "__main__":
from Orange.widgets.utils.widgetpreview import WidgetPreview
Expand Down

0 comments on commit 4206b70

Please sign in to comment.