Skip to content

Commit

Permalink
Filter - language from corpus in StopwordsFilter
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Nov 17, 2023
1 parent 87a7580 commit bab6863
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 18 deletions.
2 changes: 1 addition & 1 deletion orangecontrib/text/annotate_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def _hypergeom_clusters(

corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.1)):
StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)

transformed_corpus = BowVectorizer().transform(corpus_)
Expand Down
34 changes: 27 additions & 7 deletions orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from itertools import compress
from typing import List, Callable
from typing import List, Callable, Optional
import os
import re

Expand All @@ -11,6 +11,7 @@
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import TokenizedPreprocessor

Expand Down Expand Up @@ -68,16 +69,35 @@ def from_file(path):


class StopwordsFilter(BaseTokenFilter, FileWordListMixin):
""" Remove tokens present in NLTK's language specific lists or a file. """
"""Remove tokens present in NLTK's language-specific lists or a file."""
name = 'Stopwords'

@wait_nltk_data
def __init__(self, language='English', path: str = None):
# nltk uses different language nams for some languages
nltk_mapping = {"Slovenian": "Slovene"}

def __init__(
self,
language: Optional[str] = "en",
path: str = None,
):
"""
Parameters
----------
language
The language code in ISO format for NLTK stopwords selection.
If None, only words from file are used (NLTK stopwords are not used).
path
The path to the file with its stopwords will be used if present.
The file must contain a newline-separated list of words.
"""
super().__init__()
FileWordListMixin.__init__(self, path)
self.__stopwords = set(x.strip() for x in
stopwords.words(language.lower())) \
if language else []
self.__stopwords = set()
if language:
# transform iso code to NLTK's language name
language = ISO2LANG[language]
language = self.nltk_mapping.get(language, language).lower()
self.__stopwords = set(x.strip() for x in stopwords.words(language))

@staticmethod
@wait_nltk_data
Expand Down
17 changes: 9 additions & 8 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,23 +430,24 @@ def _check(self, token):
self.assertEqual(filtered, ['a'])

def test_stopwords(self):
f = preprocess.StopwordsFilter('english')
self.assertFalse(f._check('a'))
self.assertTrue(f._check('filter'))
f = preprocess.StopwordsFilter("en")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = 'a snake is in a house'
self.corpus.metas[0, 0] = "a snake is in a house"
self.corpus.metas[1, 0] = "a filter"
corpus = f(self.corpus)
self.assertListEqual(["snake", "house"], corpus.tokens[0])
self.assertListEqual(["filter"], corpus.tokens[1])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_stopwords_slovene(self):
f = preprocess.StopwordsFilter('slovene')
self.assertFalse(f._check('in'))
self.assertTrue(f._check('abeceda'))
f = preprocess.StopwordsFilter("sl")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = 'kača je v hiši'
self.corpus.metas[0, 0] = "kača je v hiši"
self.corpus.metas[1, 0] = "in abeceda"
self.corpus.attributes["language"] = "sl"
corpus = f(self.corpus)
self.assertListEqual(["kača", "hiši"], corpus.tokens[0])
self.assertListEqual(["abeceda"], corpus.tokens[1])
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_lexicon(self):
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/owannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ def onDeleteWidget(self):

corpus_ = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
StopwordsFilter("English"), FrequencyFilter(0.1)):
StopwordsFilter("en"), FrequencyFilter(0.1)):
corpus_ = pp(corpus_)

transformed_corpus = BowVectorizer().transform(corpus_)
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/tests/test_owannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

def preprocess(corpus: Corpus) -> Corpus:
    """Lowercase, tokenize, filter (stopwords + frequency), vectorize, and
    attach a 4-dimensional embedding to *corpus*."""
    for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
               StopwordsFilter("en"), FrequencyFilter(0.25, 0.5)):
        corpus = pp(corpus)
    corpus = BowVectorizer().transform(corpus)
    return add_embedding(corpus, 4)
Expand Down

0 comments on commit bab6863

Please sign in to comment.