From bab6863e29866e2475e9e566e5b80eebf37d85c3 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 09:24:17 +0200
Subject: [PATCH] Filter - language from corpus in StopwordsFilter
---
orangecontrib/text/annotate_documents.py | 2 +-
orangecontrib/text/preprocess/filter.py | 34 +++++++++++++++----
orangecontrib/text/tests/test_preprocess.py | 17 +++++-----
orangecontrib/text/widgets/owannotator.py | 2 +-
.../text/widgets/tests/test_owannotator.py | 2 +-
5 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/orangecontrib/text/annotate_documents.py b/orangecontrib/text/annotate_documents.py
index c975e2023..9f166101e 100644
--- a/orangecontrib/text/annotate_documents.py
+++ b/orangecontrib/text/annotate_documents.py
@@ -289,7 +289,7 @@ def _hypergeom_clusters(
corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
- StopwordsFilter("English"), FrequencyFilter(0.1)):
+ StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)
transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index 50f748c31..f8e0d843a 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -1,5 +1,5 @@
from itertools import compress
-from typing import List, Callable
+from typing import List, Callable, Optional
import os
import re
@@ -11,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback
from orangecontrib.text import Corpus
+from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor
@@ -68,16 +69,35 @@ def from_file(path):
class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
- """ Remove tokens present in NLTK's language specific lists or a file. """
+ """Remove tokens present in NLTK's language-specific lists or a file."""
name = 'Stopwords'
- @wait_nltk_data
- def __init__(self, language='English', path: str = None):
+    # NLTK uses different language names for some languages
+ nltk_mapping = {"Slovenian": "Slovene"}
+
+ def __init__(
+ self,
+ language: Optional[str] = "en",
+        path: Optional[str] = None,
+ ):
+ """
+ Parameters
+ ----------
+ language
+            The language's ISO code, used to select NLTK's stopword list.
+            If None, only words from the file are used (NLTK stopwords are not used).
+ path
+            The path to a file whose stopwords, if provided, are used as well.
+ The file must contain a newline-separated list of words.
+ """
super().__init__()
FileWordListMixin.__init__(self, path)
- self.__stopwords = set(x.strip() for x in
- stopwords.words(language.lower())) \
- if language else []
+ self.__stopwords = set()
+ if language:
+            # map the ISO code to NLTK's language name
+ language = ISO2LANG[language]
+ language = self.nltk_mapping.get(language, language).lower()
+ self.__stopwords = set(x.strip() for x in stopwords.words(language))
@staticmethod
@wait_nltk_data
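For context, a minimal usage sketch of the revised constructor (the `book-excerpts` corpus is taken from the examples in this patch; `my_stopwords.txt` is a hypothetical newline-separated word list):

```python
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import StopwordsFilter

corpus = Corpus.from_file("book-excerpts")

# the ISO code now selects NLTK's English stopword list
filtered = StopwordsFilter("en")(corpus)

# language=None skips NLTK entirely; only the file's words are removed
# (the file name here is hypothetical)
filtered = StopwordsFilter(None, path="my_stopwords.txt")(corpus)
```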
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index f94ba5f81..8d385052a 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -430,23 +430,24 @@ def _check(self, token):
self.assertEqual(filtered, ['a'])
def test_stopwords(self):
- f = preprocess.StopwordsFilter('english')
- self.assertFalse(f._check('a'))
- self.assertTrue(f._check('filter'))
+ f = preprocess.StopwordsFilter("en")
with self.corpus.unlocked():
- self.corpus.metas[0, 0] = 'a snake is in a house'
+ self.corpus.metas[0, 0] = "a snake is in a house"
+ self.corpus.metas[1, 0] = "a filter"
corpus = f(self.corpus)
self.assertListEqual(["snake", "house"], corpus.tokens[0])
+ self.assertListEqual(["filter"], corpus.tokens[1])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
def test_stopwords_slovene(self):
- f = preprocess.StopwordsFilter('slovene')
- self.assertFalse(f._check('in'))
- self.assertTrue(f._check('abeceda'))
+ f = preprocess.StopwordsFilter("sl")
with self.corpus.unlocked():
- self.corpus.metas[0, 0] = 'kača je v hiši'
+ self.corpus.metas[0, 0] = "kača je v hiši"
+ self.corpus.metas[1, 0] = "in abeceda"
+ self.corpus.attributes["language"] = "sl"
corpus = f(self.corpus)
self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
+ self.assertListEqual(["abeceda"], corpus.tokens[1])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
def test_lexicon(self):
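The Slovenian test exercises the two-step lookup added in `filter.py`: ISO code to language name via `ISO2LANG`, then the NLTK-specific rename. A standalone sketch of that resolution (the `ISO2LANG` entries below are an illustrative subset, not the module's full table):

```python
from nltk.corpus import stopwords

ISO2LANG = {"en": "English", "sl": "Slovenian"}  # illustrative subset
nltk_mapping = {"Slovenian": "Slovene"}          # NLTK calls Slovenian "Slovene"

language = ISO2LANG["sl"]                                # -> "Slovenian"
language = nltk_mapping.get(language, language).lower()  # -> "slovene"
assert "in" in stopwords.words(language)  # "in" ("and") is a Slovene stopword
```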
diff --git a/orangecontrib/text/widgets/owannotator.py b/orangecontrib/text/widgets/owannotator.py
index 8bffc5294..e3cd5f25f 100644
--- a/orangecontrib/text/widgets/owannotator.py
+++ b/orangecontrib/text/widgets/owannotator.py
@@ -618,7 +618,7 @@ def onDeleteWidget(self):
corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
- StopwordsFilter("English"), FrequencyFilter(0.1)):
+ StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)
transformed_corpus = BowVectorizer().transform(corpus_)
diff --git a/orangecontrib/text/widgets/tests/test_owannotator.py b/orangecontrib/text/widgets/tests/test_owannotator.py
index 5ee0b52d9..19e4951d2 100644
--- a/orangecontrib/text/widgets/tests/test_owannotator.py
+++ b/orangecontrib/text/widgets/tests/test_owannotator.py
@@ -21,7 +21,7 @@
def preprocess(corpus: Corpus) -> Corpus:
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
- StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
+ StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
corpus = pp(corpus)
corpus = BowVectorizer().transform(corpus)
return add_embedding(corpus, 4)