From 8c15381d53fcb5ad94f876f8cdef4379e299c425 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Thu, 1 Jun 2023 08:40:27 +0200 Subject: [PATCH] Keywords - temporary solution for list of stopwords --- orangecontrib/text/keywords/__init__.py | 6 ++++-- orangecontrib/text/widgets/tests/test_owkeywords.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py index bdfb44bdd..8dc84f6c4 100644 --- a/orangecontrib/text/keywords/__init__.py +++ b/orangecontrib/text/keywords/__init__.py @@ -15,13 +15,15 @@ from orangecontrib.text import Corpus from orangecontrib.text.keywords.mbert import mbert_keywords from orangecontrib.text.keywords.rake import Rake +from orangecontrib.text.language import ISO2LANG from orangecontrib.text.preprocess import StopwordsFilter # all available languages for RAKE from orangecontrib.text.vectorization import BowVectorizer -# todo -RAKE_LANGUAGES = StopwordsFilter.supported_languages() +# todo: this is a temporary solution since supported_languages now returns +# languages as ISO codes - refactor with keywords language refactoring +RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()] # all available languages for YAKE! YAKE_LANGUAGE_MAPPING = { "Arabic": "ar", diff --git a/orangecontrib/text/widgets/tests/test_owkeywords.py b/orangecontrib/text/widgets/tests/test_owkeywords.py index 200e77246..18e4deb63 100644 --- a/orangecontrib/text/widgets/tests/test_owkeywords.py +++ b/orangecontrib/text/widgets/tests/test_owkeywords.py @@ -83,7 +83,7 @@ def test_run_with_words(self): self.assertEqual(len(results.scores), 42) def test_run_normalize_words(self): - normalizer = LemmagenLemmatizer() + normalizer = LemmagenLemmatizer(language="en") corpus = normalizer(self.corpus) words = ["minor", "tree"]