diff --git a/orangecontrib/text/annotate_documents.py b/orangecontrib/text/annotate_documents.py
index 5cdb338f1..28e612273 100644
--- a/orangecontrib/text/annotate_documents.py
+++ b/orangecontrib/text/annotate_documents.py
@@ -289,7 +289,7 @@ def _hypergeom_clusters(
 
     corpus_ = Corpus.from_file("book-excerpts")
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.1)):
+               StopwordsFilter("en"), FrequencyFilter(0.1)):
         corpus_ = pp(corpus_)
 
     transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 851c5b7ef..4c24043bb 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -1,5 +1,5 @@
 from itertools import compress
-from typing import List, Callable
+from typing import List, Callable, Optional
 import os
 import re
 
@@ -11,6 +11,7 @@ from Orange.util import wrap_callback, dummy_callback
 
 from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG
 from orangecontrib.text.misc import wait_nltk_data
 from orangecontrib.text.preprocess import TokenizedPreprocessor
@@ -72,16 +73,52 @@ def from_file(path):
 
 
 class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
-    """ Remove tokens present in NLTK's language specific lists or a file. """
+    """
+    Remove tokens present in NLTK's language-specific lists or a file.
+
+    Attributes
+    ----------
+    language
+        The language for NLTK stopword selection. If None, the Corpus's
+        language is used.
+    use_default_stopwords
+        Whether to use NLTK's stopword lists; setting language to None
+        alone does not disable them.
+    path
+        The path to a file whose stopwords will be used, if provided.
+        The file must contain a newline-separated list of words.
+    """
     name = 'Stopwords'
 
-    @wait_nltk_data
-    def __init__(self, language='English', path: str = None):
+    # NLTK uses different language names for some languages
+    nltk_mapping = {"Slovenian": "Slovene"}
+
+    def __init__(
+        self,
+        language: Optional[str] = None,
+        use_default_stopwords: bool = True,
+        path: str = None,
+    ):
         super().__init__()
         FileWordListMixin.__init__(self, path)
-        self.__stopwords = set(x.strip() for x in
-                               stopwords.words(language.lower())) \
-            if language else []
+        self.__language = language
+        self.__use_default_stopwords = use_default_stopwords
+        self.__stopwords = set()
+
+    @wait_nltk_data
+    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
+        # use the language set in __init__ if not None, the Corpus's otherwise
+        la = ISO2LANG[self.__language or corpus.language]
+        la = self.nltk_mapping.get(la, la)
+        if self.__use_default_stopwords:
+            if la in self.supported_languages():
+                self.__stopwords = set(x.strip() for x in stopwords.words(la.lower()))
+            else:
+                raise ValueError(
+                    "The stopwords filter does not support the Corpus's or "
+                    "the selected language."
+                )
+        return super().__call__(corpus, callback)
 
     @staticmethod
     @wait_nltk_data
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index f94ba5f81..2fe6209d3 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -430,25 +430,45 @@ def _check(self, token):
         self.assertEqual(filtered, ['a'])
 
     def test_stopwords(self):
-        f = preprocess.StopwordsFilter('english')
-        self.assertFalse(f._check('a'))
-        self.assertTrue(f._check('filter'))
+        f = preprocess.StopwordsFilter("en")
         with self.corpus.unlocked():
-            self.corpus.metas[0, 0] = 'a snake is in a house'
+            self.corpus.metas[0, 0] = "a snake is in a house"
+            self.corpus.metas[1, 0] = "a filter"
         corpus = f(self.corpus)
         self.assertListEqual(["snake", "house"], corpus.tokens[0])
+        self.assertListEqual(["filter"], corpus.tokens[1])
         self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
 
     def test_stopwords_slovene(self):
-        f = preprocess.StopwordsFilter('slovene')
-        self.assertFalse(f._check('in'))
-        self.assertTrue(f._check('abeceda'))
+        f = preprocess.StopwordsFilter("sl")
         with self.corpus.unlocked():
-            self.corpus.metas[0, 0] = 'kača je v hiši'
+            self.corpus.metas[0, 0] = "kača je v hiši"
+            self.corpus.metas[1, 0] = "in abeceda"
+        self.corpus.attributes["language"] = "sl"
         corpus = f(self.corpus)
         self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
+        self.assertListEqual(["abeceda"], corpus.tokens[1])
         self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
 
+    def test_stopwords_language_from_corpus(self):
+        f = preprocess.StopwordsFilter()
+        self.corpus.attributes["language"] = "en"
+        corpus = f(self.corpus)
+        self.assertListEqual(
+            ["Human", "machine", "interface", "lab", "abc", "computer", "applications"],
+            corpus.tokens[0],
+        )
+
+        # fail when use_default_stopwords and language not supported
+        f = preprocess.StopwordsFilter(use_default_stopwords=True)
+        self.corpus.attributes["language"] = "am"
+        with self.assertRaises(ValueError):
+            f(self.corpus)
+        # success when not use_default_stopwords and language not supported
+        f = preprocess.StopwordsFilter(use_default_stopwords=False)
+        corpus = f(self.corpus)
+        self.assertEqual(len(corpus.tokens), 9)
+
     def test_lexicon(self):
         f = tempfile.NamedTemporaryFile(delete=False)
         f.write(b'filter\n')
diff --git a/orangecontrib/text/widgets/owannotator.py b/orangecontrib/text/widgets/owannotator.py
index 68b1da9db..cdb3db200 100644
--- a/orangecontrib/text/widgets/owannotator.py
+++ b/orangecontrib/text/widgets/owannotator.py
@@ -611,7 +611,7 @@ def onDeleteWidget(self):
 
     corpus_ = Corpus.from_file("book-excerpts")
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.1)):
+               StopwordsFilter("en"), FrequencyFilter(0.1)):
         corpus_ = pp(corpus_)
 
     transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/widgets/tests/test_owannotator.py b/orangecontrib/text/widgets/tests/test_owannotator.py
index cd3a6098b..06237aa27 100644
--- a/orangecontrib/text/widgets/tests/test_owannotator.py
+++ b/orangecontrib/text/widgets/tests/test_owannotator.py
@@ -22,7 +22,7 @@ def preprocess(corpus: Corpus) -> Corpus:
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
+               StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
         corpus = pp(corpus)
 
     transformed_corpus = BowVectorizer().transform(corpus)
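
Usage sketch of the API after this patch (illustrative only; it assumes the patch is applied, that "book-excerpts" ships with the add-on as in the snippets above, that the corpus's language is stored as an ISO 639-1 code such as "en" in corpus.attributes["language"], and that "my_stopwords.txt" is a hypothetical newline-separated word list):

    from orangecontrib.text import Corpus
    from orangecontrib.text.preprocess import StopwordsFilter

    corpus = Corpus.from_file("book-excerpts")

    # Explicit ISO code: remove English stopwords regardless of the
    # language recorded on the corpus.
    filtered = StopwordsFilter("en")(corpus)

    # No language given: the filter falls back to corpus.language. With
    # use_default_stopwords=True (the default) it raises ValueError when
    # NLTK has no stopword list for that language.
    filtered = StopwordsFilter()(corpus)

    # Skip NLTK's lists entirely and rely only on a custom word-list
    # file ("my_stopwords.txt" is a hypothetical path).
    filtered = StopwordsFilter(
        use_default_stopwords=False, path="my_stopwords.txt"
    )(corpus)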