Skip to content

Commit

Permalink
Filter - language form corpus in StopwordsFilter
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Apr 14, 2023
1 parent 23da347 commit 4342212
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 17 deletions.
51 changes: 44 additions & 7 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from itertools import compress
from typing import List, Callable
from typing import List, Callable, Optional
import os
import re

Expand All @@ -11,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor

Expand Down Expand Up @@ -72,16 +73,52 @@ def from_file(path):


class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
""" Remove tokens present in NLTK's language specific lists or a file. """
"""
Remove tokens present in NLTK's language-specific lists or a file.
Attributes
----------
language
The language for NLTK stopwords selection. If None, language from the
Corpus will be used.
use_default_stopwords
Indication whether to use NLTK's stopwords since setting language to
None doesn't prevent the use of NLTK's stopwords.
path
The path to the file with its stopwords will be used if present.
The file must contain a newline-separated list of words.
"""
name = 'Stopwords'

@wait_nltk_data
def __init__(self, language='English', path: str = None):
# NLTK uses different language names for some languages
nltk_mapping = {"Slovenian": "Slovene"}

def __init__(
    self,
    language: Optional[str] = None,
    use_default_stopwords: bool = True,
    path: str = None,
):
    """
    Parameters
    ----------
    language
        Language for NLTK stopwords selection; when None, the language
        of the Corpus passed to __call__ is used instead.
    use_default_stopwords
        Whether NLTK's default stopword list is applied at all.
    path
        Optional path to a file with a newline-separated list of extra
        stopwords.
    """
    super().__init__()
    FileWordListMixin.__init__(self, path)
    self.__language = language
    self.__use_default_stopwords = use_default_stopwords
    # Filled lazily in __call__, once the effective language is known.
    self.__stopwords = set()

@wait_nltk_data
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    """
    Filter stopwords from the corpus tokens.

    Resolves the effective language (the one given at construction, or
    the corpus's own language otherwise), maps it to NLTK's naming, and
    loads the matching NLTK stopword list when default stopwords are
    enabled.

    Raises
    ------
    ValueError
        If default stopwords are requested but NLTK has no stopword list
        for the resolved language.
    """
    if self.__use_default_stopwords:
        # Resolve the language only when NLTK stopwords are actually
        # needed: with use_default_stopwords=False, a corpus whose
        # language is None or not in ISO2LANG must not raise KeyError.
        language = ISO2LANG[self.__language or corpus.language]
        language = self.nltk_mapping.get(language, language)
        if language not in self.supported_languages():
            raise ValueError(
                "The stopwords filter does not support the Corpus's or "
                "selected language."
            )
        self.__stopwords = set(
            word.strip() for word in stopwords.words(language.lower())
        )
    return super().__call__(corpus, callback)

@staticmethod
@wait_nltk_data
Expand Down
36 changes: 28 additions & 8 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,25 +430,45 @@ def _check(self, token):
self.assertEqual(filtered, ['a'])

def test_stopwords(self):
    """English stopwords (ISO code "en") are removed from the tokens."""
    f = preprocess.StopwordsFilter("en")
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = "a snake is in a house"
        self.corpus.metas[1, 0] = "a filter"
    corpus = f(self.corpus)
    self.assertListEqual(["snake", "house"], corpus.tokens[0])
    self.assertListEqual(["filter"], corpus.tokens[1])
    # base tokenizer + this filter
    self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_stopwords_slovene(self):
    """Slovene stopwords (ISO code "sl") are removed from the tokens."""
    f = preprocess.StopwordsFilter("sl")
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = "kača je v hiši"
        self.corpus.metas[1, 0] = "in abeceda"
    self.corpus.attributes["language"] = "sl"
    corpus = f(self.corpus)
    self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
    self.assertListEqual(["abeceda"], corpus.tokens[1])
    # base tokenizer + this filter
    self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_stopwords_language_from_corpus(self):
    """With no explicit language, the filter uses the corpus's language."""
    self.corpus.attributes["language"] = "en"
    filtered = preprocess.StopwordsFilter()(self.corpus)
    expected = [
        "Human", "machine", "interface", "lab", "abc", "computer",
        "applications",
    ]
    self.assertListEqual(expected, filtered.tokens[0])

    # unsupported corpus language + default stopwords -> ValueError
    self.corpus.attributes["language"] = "am"
    unsupported = preprocess.StopwordsFilter(use_default_stopwords=True)
    with self.assertRaises(ValueError):
        unsupported(self.corpus)

    # unsupported language is fine when default stopwords are disabled
    no_defaults = preprocess.StopwordsFilter(use_default_stopwords=False)
    filtered = no_defaults(self.corpus)
    self.assertEqual(9, len(filtered.tokens))

def test_lexicon(self):
f = tempfile.NamedTemporaryFile(delete=False)
f.write(b'filter\n')
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/owannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ def onDeleteWidget(self):

corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.1)):
StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)

transformed_corpus = BowVectorizer().transform(corpus_)
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/tests/test_owannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

def preprocess(corpus: Corpus) -> Corpus:
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
corpus = pp(corpus)

transformed_corpus = BowVectorizer().transform(corpus)
Expand Down

0 comments on commit 4342212

Please sign in to comment.