Skip to content

Commit

Permalink
Filter - language from corpus in StopwordsFilter
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Nov 17, 2023
1 parent 87a7580 commit 53c0467
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 11 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/annotate_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def _hypergeom_clusters(

corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.1)):
StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)

transformed_corpus = BowVectorizer().transform(corpus_)
Expand Down
32 changes: 26 additions & 6 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from itertools import compress
from typing import List, Callable
from typing import List, Callable, Optional
import os
import re

Expand All @@ -11,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor

Expand Down Expand Up @@ -71,13 +72,32 @@ class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
""" Remove tokens present in NLTK's language specific lists or a file. """
name = 'Stopwords'

@wait_nltk_data
def __init__(self, language='English', path: str = None):
# nltk uses different language names for some languages
nltk_mapping = {"Slovenian": "Slovene"}

def __init__(
    self,
    language: Optional[str] = "en",
    path: Optional[str] = None,
):
    """
    Remove tokens that appear in NLTK's stopword list for *language*
    and/or in a newline-separated word-list file at *path*.

    Parameters
    ----------
    language
        The language code in ISO format for NLTK stopwords selection.
        If None, only words from file are used (NLTK stopwords are not used).
    path
        The path to the file with its stopwords will be used if present.
        The file must contain a newline-separated list of words.
    """
    super().__init__()
    FileWordListMixin.__init__(self, path)
    # Start empty so that language=None means "file words only".
    self.__stopwords = set()
    if language:
        # transform iso code to NLTK's language name
        language = ISO2LANG[language]
        # NLTK names some languages differently (e.g. Slovenian -> Slovene);
        # remap via the class-level table, then lowercase as NLTK expects.
        language = self.nltk_mapping.get(language, language).lower()
        self.__stopwords = set(x.strip() for x in stopwords.words(language))

@staticmethod
@wait_nltk_data
Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def _check(self, token):
self.assertEqual(filtered, ['a'])

def test_stopwords(self):
f = preprocess.StopwordsFilter('english')
f = preprocess.StopwordsFilter("en")
self.assertFalse(f._check('a'))
self.assertTrue(f._check('filter'))
with self.corpus.unlocked():
Expand All @@ -440,7 +440,7 @@ def test_stopwords(self):
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_stopwords_slovene(self):
f = preprocess.StopwordsFilter('slovene')
f = preprocess.StopwordsFilter("sl")
self.assertFalse(f._check('in'))
self.assertTrue(f._check('abeceda'))
with self.corpus.unlocked():
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/owannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ def onDeleteWidget(self):

corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.1)):
StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)

transformed_corpus = BowVectorizer().transform(corpus_)
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/tests/test_owannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

def preprocess(corpus: Corpus) -> Corpus:
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.25, 0.5)):
StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
corpus = pp(corpus)
corpus = BowVectorizer().transform(corpus)
return add_embedding(corpus, 4)
Expand Down

0 comments on commit 53c0467

Please sign in to comment.