From 00d9e5411d48d18baa12fdd3838df2444edab1b5 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 14 Apr 2023 15:33:34 +0200
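Subject: [PATCH] Preprocess widget - temp: drop Stanford tagger error handling, use LemmagenLemmatizer.supported_languages, update tests for parameters without language keys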
---
orangecontrib/text/widgets/owpreprocess.py | 31 ++++--
.../text/widgets/tests/test_owpreprocess.py | 102 ++++++++----------
2 files changed, 64 insertions(+), 69 deletions(-)
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index 6b6efde82..ede04d90c 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -5,9 +5,21 @@
import pkg_resources
from AnyQt.QtCore import Qt, pyqtSignal
-from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \
- QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \
- QDoubleSpinBox, QFileDialog, QAbstractSpinBox
+from AnyQt.QtWidgets import (
+ QComboBox,
+ QButtonGroup,
+ QLabel,
+ QCheckBox,
+ QRadioButton,
+ QGridLayout,
+ QLineEdit,
+ QSpinBox,
+ QFormLayout,
+ QHBoxLayout,
+ QDoubleSpinBox,
+ QFileDialog,
+ QAbstractSpinBox,
+)
from AnyQt.QtWidgets import QWidget, QPushButton, QSizePolicy, QStyle
from AnyQt.QtGui import QBrush, QValidator
@@ -17,8 +29,12 @@
import Orange.widgets.data.owpreprocess
from Orange.widgets import gui
from Orange.widgets.data.owpreprocess import PreprocessAction, Description
-from Orange.widgets.data.utils.preprocess import ParametersRole, \
- DescriptionRole, BaseEditor, StandardItemModel
+from Orange.widgets.data.utils.preprocess import (
+ ParametersRole,
+ DescriptionRole,
+ BaseEditor,
+ StandardItemModel,
+)
from Orange.widgets.settings import Setting
from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin
from Orange.widgets.widget import Input, Output, Msg, Message
@@ -29,7 +45,6 @@
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \
POSTagger
-from orangecontrib.text.tag.pos import StanfordPOSTaggerError
_DEFAULT_NONE = "(none)"
@@ -446,7 +461,7 @@ def __init__(self, parent=None, **kwargs):
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
- self, LemmagenLemmatizer.lemmagen_languages,
+ self, LemmagenLemmatizer.supported_languages,
self.__lemmagen_lang, self.__set_lemmagen_lang
)
@@ -1162,8 +1177,6 @@ def apply(self):
self.Error.file_not_found()
except UnicodeError as e:
self.Error.invalid_encoding(e)
- except StanfordPOSTaggerError as e:
- self.Error.stanford_tagger(e)
except Exception as e:
self.Error.unknown_error(str(e))
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index c4f1e47ab..0ccdb73f8 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -165,8 +165,7 @@ def test_migrate_settings_normalize(self):
"udpipe_tokenizer": True}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.normalize",
- {"method": 2, "snowball_language": "French",
- "udpipe_language": "German", "udpipe_tokenizer": True})]
+ {"method": 2, "udpipe_tokenizer": True})]
self.assertEqual(widget.storedsettings["preprocessors"], params)
def test_migrate_settings_filter(self):
@@ -180,7 +179,7 @@ def test_migrate_settings_filter(self):
"use_df": False, "use_keep_n": False}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.filter",
- {"methods": [0, 2, 4], "language": "Finnish",
+ {"methods": [0, 2, 4],
"sw_path": None, "sw_list": [],
"lx_path": None, "lx_list": [],
"pattern": "foo", "rel_start": 0.3,
@@ -365,29 +364,18 @@ def test_init(self):
self.assertTrue(self.buttons[0].isChecked())
for i in range(1, 4):
self.assertFalse(self.buttons[i].isChecked())
- self.assertEqual(self.combo_sbl.currentText(), "English")
- self.assertEqual(self.combo_udl.currentText(), "English")
self.assertFalse(self.check_use.isChecked())
def test_parameters(self):
params = {"method": NormalizationModule.Porter,
- "snowball_language": "English",
- "udpipe_language": "English",
- "lemmagen_language": "English",
"udpipe_tokenizer": False}
self.assertDictEqual(self.editor.parameters(), params)
def test_set_parameters(self):
params = {"method": NormalizationModule.UDPipe,
- "snowball_language": "Dutch",
- "udpipe_language": "Slovenian",
- "lemmagen_language": "Bulgarian",
"udpipe_tokenizer": True}
self.editor.setParameters(params)
self.assertDictEqual(self.editor.parameters(), params)
- self.assertEqual(self.combo_sbl.currentText(), "Dutch")
- self.assertEqual(self.combo_udl.currentText(), "Slovenian")
- self.assertEqual(self.combo_lemm.currentText(), "Bulgarian")
self.assertTrue(self.check_use.isChecked())
def test_createinstance(self):
@@ -397,20 +385,11 @@ def test_createinstance(self):
params = {"method": NormalizationModule.Snowball}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
- self.assertIn("", str(pp.normalizer))
-
- params = {"method": NormalizationModule.Snowball,
- "snowball_language": "Dutch"}
- pp = self.editor.createinstance(params)
- self.assertIsInstance(pp, SnowballStemmer)
- self.assertIn("", str(pp.normalizer))
params = {"method": NormalizationModule.UDPipe,
- "udpipe_language": "Finnish",
"udpipe_tokenizer": True}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, UDPipeLemmatizer)
- self.assertEqual(pp._UDPipeLemmatizer__language, "Finnish")
self.assertEqual(pp._UDPipeLemmatizer__use_tokenizer, True)
def test_repr(self):
@@ -424,8 +403,6 @@ def test_udpipe_no_models(self):
editor = NormalizationModule()
button = editor._SingleMethodModule__group.button(editor.UDPipe)
self.assertFalse(button.isEnabled())
- combo = editor._NormalizationModule__combo_udl
- self.assertFalse(combo.isEnabled())
check = editor._NormalizationModule__check_use
self.assertFalse(check.isEnabled())
@@ -438,10 +415,6 @@ def setUp(self):
def check_boxes(self):
return [cb for i, cb in self.editor._MultipleMethodModule__cbs]
- @property
- def combo(self):
- return self.editor._FilteringModule__combo
-
@property
def sw_combo(self):
return self.editor._FilteringModule__sw_loader.file_combo
@@ -477,7 +450,6 @@ def test_init(self):
for i in range(1, len(check_boxes)):
self.assertFalse(check_boxes[i].isChecked())
self.assertGreater(len(check_boxes[i].toolTip()), 0)
- self.assertEqual(self.combo.currentText(), "English")
self.assertEqual(self.sw_combo.currentText(), "(none)")
self.assertEqual(self.lx_combo.currentText(), "(none)")
self.assertEqual(self.line_edit.text(), FilteringModule.DEFAULT_PATTERN)
@@ -490,32 +462,47 @@ def test_init(self):
self.assertEqual(self.spin.value(), 100)
def test_parameters(self):
- params = {"methods": [FilteringModule.Stopwords],
- "language": "English", "sw_path": None, "lx_path": None,
- "sw_list": [], "lx_list": [],
- "incl_num": False,
- "pattern": FilteringModule.DEFAULT_PATTERN,
- "freq_type": 0,
- "rel_start": 0.1, "rel_end": 0.9,
- "abs_start": 1, "abs_end": 10,
- "n_tokens": 100, "pos_tags": "NOUN,VERB",
- "invalidated": False}
+ params = {
+ "methods": [FilteringModule.Stopwords],
+ "sw_path": None,
+ "lx_path": None,
+ "use_default_sw": True,
+ "sw_list": [],
+ "lx_list": [],
+ "incl_num": False,
+ "pattern": FilteringModule.DEFAULT_PATTERN,
+ "freq_type": 0,
+ "rel_start": 0.1,
+ "rel_end": 0.9,
+ "abs_start": 1,
+ "abs_end": 10,
+ "n_tokens": 100,
+ "pos_tags": "NOUN,VERB",
+ "invalidated": False,
+ }
self.assertDictEqual(self.editor.parameters(), params)
def test_set_parameters(self):
sw_path = RecentPath.create("Foo", [])
lx_path = RecentPath.create("Bar", [])
- params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp],
- "language": "Finnish",
- "sw_path": sw_path, "lx_path": lx_path,
- "sw_list": [sw_path], "lx_list": [lx_path],
- "incl_num": False,
- "pattern": "foo",
- "freq_type": 1,
- "rel_start": 0.2, "rel_end": 0.7,
- "abs_start": 2, "abs_end": 15,
- "n_tokens": 10, "pos_tags": "JJ",
- "invalidated": False}
+ params = {
+ "methods": [FilteringModule.Lexicon, FilteringModule.Regexp],
+ "use_default_sw": True,
+ "sw_path": sw_path,
+ "lx_path": lx_path,
+ "sw_list": [sw_path],
+ "lx_list": [lx_path],
+ "incl_num": False,
+ "pattern": "foo",
+ "freq_type": 1,
+ "rel_start": 0.2,
+ "rel_end": 0.7,
+ "abs_start": 2,
+ "abs_end": 15,
+ "n_tokens": 10,
+ "pos_tags": "JJ",
+ "invalidated": False,
+ }
self.editor.setParameters(params)
self.assertDictEqual(self.editor.parameters(), params)
@@ -527,7 +514,6 @@ def test_set_parameters(self):
self.assertFalse(check_boxes[4].isChecked())
self.assertFalse(check_boxes[5].isChecked())
- self.assertEqual(self.combo.currentText(), "Finnish")
self.assertEqual(self.sw_combo.currentText(), "Foo")
self.assertEqual(self.lx_combo.currentText(), "Bar")
self.assertEqual(self.line_edit.text(), "foo")
@@ -550,10 +536,10 @@ def test_createinstance(self):
self.assertIsInstance(pp[1], MostFrequentTokensFilter)
def test_repr(self):
- self.assertEqual(str(self.editor),
- "Stopwords (Language: English, File: None)")
- params = {"methods": [FilteringModule.Lexicon,
- FilteringModule.Regexp]}
+ self.assertEqual(
+ str(self.editor), "Stopwords (Use default stopwords: Yes, File: None)"
+ )
+ params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp]}
self.editor.setParameters(params)
self.assertEqual(
str(self.editor),
@@ -646,10 +632,6 @@ def test_createinstance(self):
pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt})
self.assertIsInstance(pp, MaxEntTagger)
- # TODO - implement StanfordPOSTagger
- # pp = self.editor.createinstance({"method": POSTaggingModule.Stanford})
- # self.assertIsInstance(pp, StanfordPOSTagger)
-
def test_repr(self):
self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")