From 1d676a7d0ff65dcacea3c31ddf77cfba422084af Mon Sep 17 00:00:00 2001
From: PrimozGodec <p.godec9@gmail.com>
Date: Fri, 14 Apr 2023 09:24:17 +0200
Subject: [PATCH] Filter - language from corpus in StopwordsFilter

---
 orangecontrib/text/annotate_documents.py      |  2 +-
 orangecontrib/text/preprocess/filter.py       | 51 ++++++++++++++++---
 orangecontrib/text/tests/test_preprocess.py   | 36 ++++++++++---
 orangecontrib/text/widgets/owannotator.py     |  2 +-
 .../text/widgets/tests/test_owannotator.py    |  2 +-
 5 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/orangecontrib/text/annotate_documents.py b/orangecontrib/text/annotate_documents.py
index 5cdb338f1..28e612273 100644
--- a/orangecontrib/text/annotate_documents.py
+++ b/orangecontrib/text/annotate_documents.py
@@ -289,7 +289,7 @@ def _hypergeom_clusters(
 
     corpus_ = Corpus.from_file("book-excerpts")
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.1)):
+               StopwordsFilter("en"), FrequencyFilter(0.1)):
         corpus_ = pp(corpus_)
 
     transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 851c5b7ef..4c24043bb 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -1,5 +1,5 @@
 from itertools import compress
-from typing import List, Callable
+from typing import List, Callable, Optional
 import os
 import re
 
@@ -11,6 +11,7 @@
 from Orange.util import wrap_callback, dummy_callback
 
 from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG
 from orangecontrib.text.misc import wait_nltk_data
 from orangecontrib.text.preprocess import TokenizedPreprocessor
 
@@ -72,16 +73,52 @@ def from_file(path):
 
 
 class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
-    """ Remove tokens present in NLTK's language specific lists or a file. """
+    """
+    Remove tokens present in NLTK's language-specific lists or a file.
+
+    Attributes
+    ----------
+    language
+        The language for NLTK stopwords selection. If None, language from the
+        Corpus will be used.
+    use_default_stopwords
+        Whether to use NLTK's stopwords. A separate flag is needed because
+        language=None selects the Corpus's language rather than disabling them.
+    path
+        Path to a stopwords file; its words are used if the path is given.
+        The file must contain a newline-separated list of words.
+    """
     name = 'Stopwords'
 
-    @wait_nltk_data
-    def __init__(self, language='English', path: str = None):
+    # NLTK uses different language names for some languages
+    nltk_mapping = {"Slovenian": "Slovene"}
+
+    def __init__(
+        self,
+        language: Optional[str] = None,
+        use_default_stopwords: bool = True,
+        path: str = None,
+    ):
         super().__init__()
         FileWordListMixin.__init__(self, path)
-        self.__stopwords = set(x.strip() for x in
-                               stopwords.words(language.lower())) \
-            if language else []
+        self.__language = language
+        self.__use_default_stopwords = use_default_stopwords
+        self.__stopwords = set()
+
+    @wait_nltk_data
+    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
+        # use language set in init if not None and Corpus's language otherwise
+        la = ISO2LANG[self.__language or corpus.language]
+        la = self.nltk_mapping.get(la, la)
+        if self.__use_default_stopwords:
+            if la in self.supported_languages():
+                self.__stopwords = set(x.strip() for x in stopwords.words(la.lower()))
+            else:
+                raise ValueError(
+                    "The stopwords filter does not support the Corpus's or "
+                    "selected language."
+                )
+        return super().__call__(corpus, callback)
 
     @staticmethod
     @wait_nltk_data
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index f94ba5f81..2fe6209d3 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -430,25 +430,45 @@ def _check(self, token):
         self.assertEqual(filtered, ['a'])
 
     def test_stopwords(self):
-        f = preprocess.StopwordsFilter('english')
-        self.assertFalse(f._check('a'))
-        self.assertTrue(f._check('filter'))
+        f = preprocess.StopwordsFilter("en")
         with self.corpus.unlocked():
-            self.corpus.metas[0, 0] = 'a snake is in a house'
+            self.corpus.metas[0, 0] = "a snake is in a house"
+            self.corpus.metas[1, 0] = "a filter"
         corpus = f(self.corpus)
         self.assertListEqual(["snake", "house"], corpus.tokens[0])
+        self.assertListEqual(["filter"], corpus.tokens[1])
         self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
 
     def test_stopwords_slovene(self):
-        f = preprocess.StopwordsFilter('slovene')
-        self.assertFalse(f._check('in'))
-        self.assertTrue(f._check('abeceda'))
+        f = preprocess.StopwordsFilter("sl")
         with self.corpus.unlocked():
-            self.corpus.metas[0, 0] = 'kača je v hiši'
+            self.corpus.metas[0, 0] = "kača je v hiši"
+            self.corpus.metas[1, 0] = "in abeceda"
+        self.corpus.attributes["language"] = "sl"
         corpus = f(self.corpus)
         self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
+        self.assertListEqual(["abeceda"], corpus.tokens[1])
         self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
 
+    def test_stopwords_language_from_corpus(self):
+        f = preprocess.StopwordsFilter()
+        self.corpus.attributes["language"] = "en"
+        corpus = f(self.corpus)
+        self.assertListEqual(
+            ["Human", "machine", "interface", "lab", "abc", "computer", "applications"],
+            corpus.tokens[0],
+        )
+
+        # fail when use_default_stopwords and language not supported
+        f = preprocess.StopwordsFilter(use_default_stopwords=True)
+        self.corpus.attributes["language"] = "am"
+        with self.assertRaises(ValueError):
+            f(self.corpus)
+        # success when not use_default_stopwords and language not supported
+        f = preprocess.StopwordsFilter(use_default_stopwords=False)
+        corpus = f(self.corpus)
+        self.assertEqual(len(corpus.tokens), 9)
+
     def test_lexicon(self):
         f = tempfile.NamedTemporaryFile(delete=False)
         f.write(b'filter\n')
diff --git a/orangecontrib/text/widgets/owannotator.py b/orangecontrib/text/widgets/owannotator.py
index 68b1da9db..cdb3db200 100644
--- a/orangecontrib/text/widgets/owannotator.py
+++ b/orangecontrib/text/widgets/owannotator.py
@@ -611,7 +611,7 @@ def onDeleteWidget(self):
 
     corpus_ = Corpus.from_file("book-excerpts")
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.1)):
+               StopwordsFilter("en"), FrequencyFilter(0.1)):
         corpus_ = pp(corpus_)
 
     transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/widgets/tests/test_owannotator.py b/orangecontrib/text/widgets/tests/test_owannotator.py
index cd3a6098b..06237aa27 100644
--- a/orangecontrib/text/widgets/tests/test_owannotator.py
+++ b/orangecontrib/text/widgets/tests/test_owannotator.py
@@ -22,7 +22,7 @@
 
 def preprocess(corpus: Corpus) -> Corpus:
     for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
-               StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
+               StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
         corpus = pp(corpus)
 
     transformed_corpus = BowVectorizer().transform(corpus)