From bab6863e29866e2475e9e566e5b80eebf37d85c3 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 09:24:17 +0200
Subject: [PATCH] Filter - language from corpus in StopwordsFilter

---
 orangecontrib/text/annotate_documents.py    |  2 +-
 orangecontrib/text/preprocess/filter.py     | 34 +++++++++++++++----
 orangecontrib/text/tests/test_preprocess.py | 17 +++++-----
 orangecontrib/text/widgets/owannotator.py   |  2 +-
 .../text/widgets/tests/test_owannotator.py  |  2 +-
 5 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/orangecontrib/text/annotate_documents.py b/orangecontrib/text/annotate_documents.py
index c975e2023..9f166101e 100644
--- a/orangecontrib/text/annotate_documents.py
+++ b/orangecontrib/text/annotate_documents.py
@@ -289,7 +289,7 @@ def _hypergeom_clusters(
 
     corpus_ = Corpus.from_file("book-excerpts")
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.1)):
+               StopwordsFilter("en"), FrequencyFilter(0.1)):
         corpus_ = pp(corpus_)
 
     transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 50f748c31..f8e0d843a 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -1,5 +1,5 @@
 from itertools import compress
-from typing import List, Callable
+from typing import List, Callable, Optional
 import os
 import re
 
@@ -11,6 +11,7 @@ from Orange.util import wrap_callback, dummy_callback
 
 from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG
 from orangecontrib.text.misc import wait_nltk_data
 from orangecontrib.text.preprocess import TokenizedPreprocessor
 
 
@@ -68,16 +69,35 @@ def from_file(path):
 
 
 class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
-    """ Remove tokens present in NLTK's language specific lists or a file. """
+    """Remove tokens present in NLTK's language-specific lists or a file."""
     name = 'Stopwords'
 
-    @wait_nltk_data
-    def __init__(self, language='English', path: str = None):
+    # NLTK uses different language names for some languages
+    nltk_mapping = {"Slovenian": "Slovene"}
+
+    def __init__(
+        self,
+        language: Optional[str] = "en",
+        path: Optional[str] = None,
+    ):
+        """
+        Parameters
+        ----------
+        language
+            The language code in ISO format for NLTK stopword selection.
+            If None, only words from the file are used (NLTK stopwords are not used).
+        path
+            Path to a file with stopwords; if provided, words from the file are also removed.
+            The file must contain a newline-separated list of words.
+        """
         super().__init__()
         FileWordListMixin.__init__(self, path)
-        self.__stopwords = set(x.strip() for x in
-                               stopwords.words(language.lower())) \
-            if language else []
+        self.__stopwords = set()
+        if language:
+            # transform the ISO code into NLTK's language name
+            language = ISO2LANG[language]
+            language = self.nltk_mapping.get(language, language).lower()
+            self.__stopwords = set(x.strip() for x in stopwords.words(language))
 
     @staticmethod
     @wait_nltk_data
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index f94ba5f81..8d385052a 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -430,23 +430,24 @@ def _check(self, token):
         self.assertEqual(filtered, ['a'])
 
     def test_stopwords(self):
-        f = preprocess.StopwordsFilter('english')
-        self.assertFalse(f._check('a'))
-        self.assertTrue(f._check('filter'))
+        f = preprocess.StopwordsFilter("en")
         with self.corpus.unlocked():
-            self.corpus.metas[0, 0] = 'a snake is in a house'
+            self.corpus.metas[0, 0] = "a snake is in a house"
+            self.corpus.metas[1, 0] = "a filter"
         corpus = f(self.corpus)
         self.assertListEqual(["snake", "house"], corpus.tokens[0])
+        self.assertListEqual(["filter"], corpus.tokens[1])
         self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
 
     def test_stopwords_slovene(self):
-        f = preprocess.StopwordsFilter('slovene')
-        self.assertFalse(f._check('in'))
-        self.assertTrue(f._check('abeceda'))
+        f = preprocess.StopwordsFilter("sl")
         with self.corpus.unlocked():
-            self.corpus.metas[0, 0] = 'kača je v hiši'
+            self.corpus.metas[0, 0] = "kača je v hiši"
+            self.corpus.metas[1, 0] = "in abeceda"
+            self.corpus.attributes["language"] = "sl"
         corpus = f(self.corpus)
         self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
+        self.assertListEqual(["abeceda"], corpus.tokens[1])
         self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
 
     def test_lexicon(self):
diff --git a/orangecontrib/text/widgets/owannotator.py b/orangecontrib/text/widgets/owannotator.py
index 8bffc5294..e3cd5f25f 100644
--- a/orangecontrib/text/widgets/owannotator.py
+++ b/orangecontrib/text/widgets/owannotator.py
@@ -618,7 +618,7 @@ def onDeleteWidget(self):
 
     corpus_ = Corpus.from_file("book-excerpts")
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.1)):
+               StopwordsFilter("en"), FrequencyFilter(0.1)):
         corpus_ = pp(corpus_)
 
     transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/widgets/tests/test_owannotator.py b/orangecontrib/text/widgets/tests/test_owannotator.py
index 5ee0b52d9..19e4951d2 100644
--- a/orangecontrib/text/widgets/tests/test_owannotator.py
+++ b/orangecontrib/text/widgets/tests/test_owannotator.py
@@ -21,7 +21,7 @@ def preprocess(corpus: Corpus) -> Corpus:
 
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
+               StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
         corpus = pp(corpus)
     corpus = BowVectorizer().transform(corpus)
     return add_embedding(corpus, 4)
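
Reviewer note: a minimal usage sketch of the constructor after this patch, not part of the
diff itself. It assumes NLTK stopword data has been downloaded and that the book-excerpts
corpus ships with the add-on, as the patch's own __main__ blocks suggest; "my_stopwords.txt"
is a hypothetical file name.

    # Minimal sketch of the API changed above -- not part of the patch.
    from orangecontrib.text import Corpus
    from orangecontrib.text.preprocess import StopwordsFilter

    corpus = Corpus.from_file("book-excerpts")

    # Language is now an ISO code ("en"), not an NLTK name ("English").
    filtered = StopwordsFilter("en")(corpus)

    # "sl" maps through ISO2LANG to "Slovenian", which nltk_mapping
    # rewrites to NLTK's "slovene" wordlist.
    slovene = StopwordsFilter("sl")(corpus)

    # With language=None the NLTK lists are skipped entirely; only words
    # from the (hypothetical) file, one per line, are removed.
    file_only = StopwordsFilter(language=None, path="my_stopwords.txt")

Taking ISO codes also lines the filter up with the corpus's own `language` attribute (note
the `test_stopwords_slovene` change setting `attributes["language"] = "sl"`), which is
presumably what the subject line "language from corpus" refers to.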