From 43422121d1dee343f78bb16376238942b663861c Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 09:24:17 +0200
Subject: [PATCH] Filter - language from corpus in StopwordsFilter
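
StopwordsFilter now resolves its stopword language at call time: pass an
ISO code such as "en" to fix it explicitly, or leave it None to fall back
to the language stored on the Corpus. A minimal usage sketch of the new
API (the corpus file and its language attribute are illustrative):

    from orangecontrib.text import Corpus
    from orangecontrib.text.preprocess import StopwordsFilter

    corpus = Corpus.from_file("book-excerpts")  # assuming language == "en"

    # explicit language, NLTK's default stopword list enabled
    filtered = StopwordsFilter(language="en")(corpus)

    # language taken from the corpus; with NLTK defaults disabled, only
    # words from a custom file (path=...) would be filtered out
    filtered = StopwordsFilter(use_default_stopwords=False)(corpus)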
---
orangecontrib/text/preprocess/filter.py | 51 ++++++++++++++++---
orangecontrib/text/tests/test_preprocess.py | 36 ++++++++++---
orangecontrib/text/widgets/owannotator.py | 2 +-
.../text/widgets/tests/test_owannotator.py | 2 +-
4 files changed, 74 insertions(+), 17 deletions(-)
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 851c5b7ef..4c24043bb 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -1,5 +1,5 @@
from itertools import compress
-from typing import List, Callable
+from typing import List, Callable, Optional
import os
import re
@@ -11,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback
from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor
@@ -72,16 +73,52 @@ def from_file(path):
class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
- """ Remove tokens present in NLTK's language specific lists or a file. """
+ """
+ Remove tokens present in NLTK's language-specific lists or a file.
+
+ Attributes
+ ----------
+ language
+        The language for NLTK stopwords selection. If None, the
+        Corpus's language is used.
+    use_default_stopwords
+        Whether to use NLTK's stopword lists. This flag is needed because
+        setting language to None does not disable the lists; it only
+        defers to the Corpus's language.
+    path
+        Path to a file whose stopwords are used if present.
+        The file must contain a newline-separated list of words.
+ """
name = 'Stopwords'
- @wait_nltk_data
- def __init__(self, language='English', path: str = None):
+    # NLTK uses different names for some languages
+ nltk_mapping = {"Slovenian": "Slovene"}
+
+ def __init__(
+ self,
+ language: Optional[str] = None,
+ use_default_stopwords: bool = True,
+        path: Optional[str] = None,
+ ):
super().__init__()
FileWordListMixin.__init__(self, path)
- self.__stopwords = set(x.strip() for x in
- stopwords.words(language.lower())) \
- if language else []
+ self.__language = language
+ self.__use_default_stopwords = use_default_stopwords
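+        # stopwords are resolved in __call__, once the effective language is known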
+ self.__stopwords = set()
+
+ @wait_nltk_data
+ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
+        # use the language set in __init__ if not None, otherwise the Corpus's language
+ la = ISO2LANG[self.__language or corpus.language]
+ la = self.nltk_mapping.get(la, la)
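+        # with use_default_stopwords=False, only words from the optional file are removed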
+ if self.__use_default_stopwords:
+ if la in self.supported_languages():
+ self.__stopwords = set(x.strip() for x in stopwords.words(la.lower()))
+ else:
+                raise ValueError(
+                    f"The stopwords filter does not support the {la} language."
+                )
+ return super().__call__(corpus, callback)
@staticmethod
@wait_nltk_data
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index f94ba5f81..2fe6209d3 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -430,25 +430,45 @@ def _check(self, token):
self.assertEqual(filtered, ['a'])
def test_stopwords(self):
- f = preprocess.StopwordsFilter('english')
- self.assertFalse(f._check('a'))
- self.assertTrue(f._check('filter'))
+ f = preprocess.StopwordsFilter("en")
with self.corpus.unlocked():
- self.corpus.metas[0, 0] = 'a snake is in a house'
+ self.corpus.metas[0, 0] = "a snake is in a house"
+ self.corpus.metas[1, 0] = "a filter"
corpus = f(self.corpus)
self.assertListEqual(["snake", "house"], corpus.tokens[0])
+ self.assertListEqual(["filter"], corpus.tokens[1])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
def test_stopwords_slovene(self):
- f = preprocess.StopwordsFilter('slovene')
- self.assertFalse(f._check('in'))
- self.assertTrue(f._check('abeceda'))
+ f = preprocess.StopwordsFilter("sl")
with self.corpus.unlocked():
- self.corpus.metas[0, 0] = 'kača je v hiši'
+ self.corpus.metas[0, 0] = "kača je v hiši"
+ self.corpus.metas[1, 0] = "in abeceda"
+ self.corpus.attributes["language"] = "sl"
corpus = f(self.corpus)
self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
+ self.assertListEqual(["abeceda"], corpus.tokens[1])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
+ def test_stopwords_language_from_corpus(self):
+ f = preprocess.StopwordsFilter()
+ self.corpus.attributes["language"] = "en"
+ corpus = f(self.corpus)
+ self.assertListEqual(
+ ["Human", "machine", "interface", "lab", "abc", "computer", "applications"],
+ corpus.tokens[0],
+ )
+
+        # fails when use_default_stopwords is True and the language is unsupported
+ f = preprocess.StopwordsFilter(use_default_stopwords=True)
+ self.corpus.attributes["language"] = "am"
+ with self.assertRaises(ValueError):
+ f(self.corpus)
+        # succeeds when use_default_stopwords is False even if the language is unsupported
+ f = preprocess.StopwordsFilter(use_default_stopwords=False)
+ corpus = f(self.corpus)
+ self.assertEqual(len(corpus.tokens), 9)
+
def test_lexicon(self):
f = tempfile.NamedTemporaryFile(delete=False)
f.write(b'filter\n')
diff --git a/orangecontrib/text/widgets/owannotator.py b/orangecontrib/text/widgets/owannotator.py
index 68b1da9db..cdb3db200 100644
--- a/orangecontrib/text/widgets/owannotator.py
+++ b/orangecontrib/text/widgets/owannotator.py
@@ -611,7 +611,7 @@ def onDeleteWidget(self):
corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
- StopwordsFilter("English"), FrequencyFilter(0.1)):
+ StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)
transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/widgets/tests/test_owannotator.py b/orangecontrib/text/widgets/tests/test_owannotator.py
index cd3a6098b..06237aa27 100644
--- a/orangecontrib/text/widgets/tests/test_owannotator.py
+++ b/orangecontrib/text/widgets/tests/test_owannotator.py
@@ -22,7 +22,7 @@
def preprocess(corpus: Corpus) -> Corpus:
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
- StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
+ StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
corpus = pp(corpus)
transformed_corpus = BowVectorizer().transform(corpus)