Skip to content

Commit

Permalink
Preprocess widget - Language from corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Sep 21, 2023
1 parent 4cb3494 commit a2a8466
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 65 deletions.
1 change: 1 addition & 0 deletions orangecontrib/text/keywords/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# all available languages for RAKE
from orangecontrib.text.vectorization import BowVectorizer

# todo
RAKE_LANGUAGES = StopwordsFilter.supported_languages()
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
Expand Down
3 changes: 3 additions & 0 deletions orangecontrib/text/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
"he": "Hebrew",
"hi": "Hindi",
"hi-Latn": "Hindi (latin)",
# https://en.wikipedia.org/wiki/Hinglish - since it doesn't really have an ISO
# code, we made one up to be able to use it for stopwords (supported in NLTK)
"hi_eng": "Hinglish",
"hr": "Croatian",
"ht": "Haitian",
"hu": "Hungarian",
Expand Down
29 changes: 16 additions & 13 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor

Expand Down Expand Up @@ -87,7 +87,8 @@ class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
name = 'Stopwords'

# nltk uses different language names for some languages
nltk_mapping = {"Slovenian": "Slovene"}
nltk_mapping = {"Slovene": "Slovenian"}
nltk_mapping_inv = {v: k for k, v in nltk_mapping.items()}

def __init__(
self,
Expand All @@ -103,12 +104,13 @@ def __init__(

@wait_nltk_data
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
# use language set in init if not None and Corpus's language otherwise
la = ISO2LANG[self.__language or corpus.language]
la = self.nltk_mapping.get(la, la)
if self.__use_default_stopwords:
if la in self.supported_languages():
self.__stopwords = set(x.strip() for x in stopwords.words(la.lower()))
# use language from attr if not None and Corpus's language otherwise
lang = self.__language or corpus.language
if lang in self.supported_languages():
lang = ISO2LANG[lang]
lang = self.nltk_mapping_inv.get(lang, lang).lower()
self.__stopwords = set(x.strip() for x in stopwords.words(lang))
else:
raise ValueError(
"The stopwords filter does not support the Corpus's or "
Expand All @@ -120,15 +122,16 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
@wait_nltk_data
def supported_languages():
# get NLTK list of stopwords
stopwords_listdir = []
try:
stopwords_listdir = [file for file in
os.listdir(stopwords._get_root())
if file.islower()]
stopwords_listdir = [
file for file in os.listdir(stopwords._get_root()) if file.islower()
]
except LookupError: # when no NLTK data is available
pass
stopwords_listdir = []

return sorted(file.capitalize() for file in stopwords_listdir)
def to_iso(lang):
return LANG2ISO[StopwordsFilter.nltk_mapping.get(lang, lang)]
return {to_iso(file.capitalize()) for file in stopwords_listdir}

def _check(self, token):
return token not in self.__stopwords and token not in self._lexicon
Expand Down
4 changes: 3 additions & 1 deletion orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,12 @@ def online(self):
except ConnectionError:
return False

# todo: cleanup
# use _ since - is already used in iso standard
VARIATION_DELIMITER = "_"

def language_to_iso(self, language):
@staticmethod
def language_to_iso(language):
if "(" in language:
language, model = language.split("(")
language = LANG2ISO[language.strip()]
Expand Down
Loading

0 comments on commit a2a8466

Please sign in to comment.