From 56268bc1710b62d3bb97b2bde90ff457cd7642a2 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Thu, 1 Jun 2023 08:40:27 +0200 Subject: [PATCH] Keywords - temporary solution for list of stopwords --- orangecontrib/text/keywords/__init__.py | 6 ++++-- orangecontrib/text/widgets/tests/test_owkeywords.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/orangecontrib/text/keywords/__init__.py b/orangecontrib/text/keywords/__init__.py index bdfb44bdd..8dc84f6c4 100644 --- a/orangecontrib/text/keywords/__init__.py +++ b/orangecontrib/text/keywords/__init__.py @@ -15,13 +15,15 @@ from orangecontrib.text import Corpus from orangecontrib.text.keywords.mbert import mbert_keywords from orangecontrib.text.keywords.rake import Rake +from orangecontrib.text.language import ISO2LANG from orangecontrib.text.preprocess import StopwordsFilter # all available languages for RAKE from orangecontrib.text.vectorization import BowVectorizer -# todo -RAKE_LANGUAGES = StopwordsFilter.supported_languages() +# todo: this is a temporary solution since supported_languages now returns +# languages as ISO codes - refactor with keywords language refactoring +RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()] # all available languages for YAKE! YAKE_LANGUAGE_MAPPING = { "Arabic": "ar", diff --git a/orangecontrib/text/widgets/tests/test_owkeywords.py b/orangecontrib/text/widgets/tests/test_owkeywords.py index ae301fbce..f34abbd5b 100644 --- a/orangecontrib/text/widgets/tests/test_owkeywords.py +++ b/orangecontrib/text/widgets/tests/test_owkeywords.py @@ -77,7 +77,7 @@ def test_run_with_words(self): self.assertEqual(len(results.scores), 42) def test_run_normalize_words(self): - normalizer = LemmagenLemmatizer() + normalizer = LemmagenLemmatizer(language="en") corpus = normalizer(self.corpus) words = ["minor", "tree"]