From 00d9e5411d48d18baa12fdd3838df2444edab1b5 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 14 Apr 2023 15:33:34 +0200 Subject: [PATCH] Preprocess widget - temp --- orangecontrib/text/widgets/owpreprocess.py | 31 ++++-- .../text/widgets/tests/test_owpreprocess.py | 102 ++++++++---------- 2 files changed, 64 insertions(+), 69 deletions(-) diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index 6b6efde82..ede04d90c 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -5,9 +5,21 @@ import pkg_resources from AnyQt.QtCore import Qt, pyqtSignal -from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \ - QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \ - QDoubleSpinBox, QFileDialog, QAbstractSpinBox +from AnyQt.QtWidgets import ( + QComboBox, + QButtonGroup, + QLabel, + QCheckBox, + QRadioButton, + QGridLayout, + QLineEdit, + QSpinBox, + QFormLayout, + QHBoxLayout, + QDoubleSpinBox, + QFileDialog, + QAbstractSpinBox, +) from AnyQt.QtWidgets import QWidget, QPushButton, QSizePolicy, QStyle from AnyQt.QtGui import QBrush, QValidator @@ -17,8 +29,12 @@ import Orange.widgets.data.owpreprocess from Orange.widgets import gui from Orange.widgets.data.owpreprocess import PreprocessAction, Description -from Orange.widgets.data.utils.preprocess import ParametersRole, \ - DescriptionRole, BaseEditor, StandardItemModel +from Orange.widgets.data.utils.preprocess import ( + ParametersRole, + DescriptionRole, + BaseEditor, + StandardItemModel, +) from Orange.widgets.settings import Setting from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin from Orange.widgets.widget import Input, Output, Msg, Message @@ -29,7 +45,6 @@ from orangecontrib.text.preprocess.normalize import UDPipeStopIteration from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \ POSTagger -from orangecontrib.text.tag.pos import StanfordPOSTaggerError _DEFAULT_NONE = "(none)" @@ -446,7 +461,7 @@ def __init__(self, parent=None, **kwargs): checked=self.DEFAULT_USE_TOKE) self.__check_use.clicked.connect(self.__set_use_tokenizer) self.__combo_lemm = ComboBox( - self, LemmagenLemmatizer.lemmagen_languages, + self, LemmagenLemmatizer.supported_languages, self.__lemmagen_lang, self.__set_lemmagen_lang ) @@ -1162,8 +1177,6 @@ def apply(self): self.Error.file_not_found() except UnicodeError as e: self.Error.invalid_encoding(e) - except StanfordPOSTaggerError as e: - self.Error.stanford_tagger(e) except Exception as e: self.Error.unknown_error(str(e)) diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py index c4f1e47ab..0ccdb73f8 100644 --- a/orangecontrib/text/widgets/tests/test_owpreprocess.py +++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py @@ -165,8 +165,7 @@ def test_migrate_settings_normalize(self): "udpipe_tokenizer": True}} widget = self.create_widget(OWPreprocess, stored_settings=settings) params = [("preprocess.normalize", - {"method": 2, "snowball_language": "French", - "udpipe_language": "German", "udpipe_tokenizer": True})] + {"method": 2, "udpipe_tokenizer": True})] self.assertEqual(widget.storedsettings["preprocessors"], params) def test_migrate_settings_filter(self): @@ -180,7 +179,7 @@ def test_migrate_settings_filter(self): "use_df": False, "use_keep_n": False}} widget = self.create_widget(OWPreprocess, stored_settings=settings) params = [("preprocess.filter", - {"methods": [0, 2, 4], "language": "Finnish", + {"methods": [0, 2, 4], "sw_path": None, "sw_list": [], "lx_path": None, "lx_list": [], "pattern": "foo", "rel_start": 0.3, @@ -365,29 +364,18 @@ def test_init(self): self.assertTrue(self.buttons[0].isChecked()) for i in range(1, 4): self.assertFalse(self.buttons[i].isChecked()) - self.assertEqual(self.combo_sbl.currentText(), "English") - self.assertEqual(self.combo_udl.currentText(), "English") self.assertFalse(self.check_use.isChecked()) def test_parameters(self): params = {"method": NormalizationModule.Porter, - "snowball_language": "English", - "udpipe_language": "English", - "lemmagen_language": "English", "udpipe_tokenizer": False} self.assertDictEqual(self.editor.parameters(), params) def test_set_parameters(self): params = {"method": NormalizationModule.UDPipe, - "snowball_language": "Dutch", - "udpipe_language": "Slovenian", - "lemmagen_language": "Bulgarian", "udpipe_tokenizer": True} self.editor.setParameters(params) self.assertDictEqual(self.editor.parameters(), params) - self.assertEqual(self.combo_sbl.currentText(), "Dutch") - self.assertEqual(self.combo_udl.currentText(), "Slovenian") - self.assertEqual(self.combo_lemm.currentText(), "Bulgarian") self.assertTrue(self.check_use.isChecked()) def test_createinstance(self): @@ -397,20 +385,11 @@ def test_createinstance(self): params = {"method": NormalizationModule.Snowball} pp = self.editor.createinstance(params) self.assertIsInstance(pp, SnowballStemmer) - self.assertIn("", str(pp.normalizer)) - - params = {"method": NormalizationModule.Snowball, - "snowball_language": "Dutch"} - pp = self.editor.createinstance(params) - self.assertIsInstance(pp, SnowballStemmer) - self.assertIn("", str(pp.normalizer)) params = {"method": NormalizationModule.UDPipe, - "udpipe_language": "Finnish", "udpipe_tokenizer": True} pp = self.editor.createinstance(params) self.assertIsInstance(pp, UDPipeLemmatizer) - self.assertEqual(pp._UDPipeLemmatizer__language, "Finnish") self.assertEqual(pp._UDPipeLemmatizer__use_tokenizer, True) def test_repr(self): @@ -424,8 +403,6 @@ def test_udpipe_no_models(self): editor = NormalizationModule() button = editor._SingleMethodModule__group.button(editor.UDPipe) self.assertFalse(button.isEnabled()) - combo = editor._NormalizationModule__combo_udl - self.assertFalse(combo.isEnabled()) check = editor._NormalizationModule__check_use self.assertFalse(check.isEnabled()) @@ -438,10 +415,6 @@ def setUp(self): def check_boxes(self): return [cb for i, cb in self.editor._MultipleMethodModule__cbs] - @property - def combo(self): - return self.editor._FilteringModule__combo - @property def sw_combo(self): return self.editor._FilteringModule__sw_loader.file_combo @@ -477,7 +450,6 @@ def test_init(self): for i in range(1, len(check_boxes)): self.assertFalse(check_boxes[i].isChecked()) self.assertGreater(len(check_boxes[i].toolTip()), 0) - self.assertEqual(self.combo.currentText(), "English") self.assertEqual(self.sw_combo.currentText(), "(none)") self.assertEqual(self.lx_combo.currentText(), "(none)") self.assertEqual(self.line_edit.text(), FilteringModule.DEFAULT_PATTERN) @@ -490,32 +462,47 @@ def test_init(self): self.assertEqual(self.spin.value(), 100) def test_parameters(self): - params = {"methods": [FilteringModule.Stopwords], - "language": "English", "sw_path": None, "lx_path": None, - "sw_list": [], "lx_list": [], - "incl_num": False, - "pattern": FilteringModule.DEFAULT_PATTERN, - "freq_type": 0, - "rel_start": 0.1, "rel_end": 0.9, - "abs_start": 1, "abs_end": 10, - "n_tokens": 100, "pos_tags": "NOUN,VERB", - "invalidated": False} + params = { + "methods": [FilteringModule.Stopwords], + "sw_path": None, + "lx_path": None, + "use_default_sw": True, + "sw_list": [], + "lx_list": [], + "incl_num": False, + "pattern": FilteringModule.DEFAULT_PATTERN, + "freq_type": 0, + "rel_start": 0.1, + "rel_end": 0.9, + "abs_start": 1, + "abs_end": 10, + "n_tokens": 100, + "pos_tags": "NOUN,VERB", + "invalidated": False, + } self.assertDictEqual(self.editor.parameters(), params) def test_set_parameters(self): sw_path = RecentPath.create("Foo", []) lx_path = RecentPath.create("Bar", []) - params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp], - "language": "Finnish", - "sw_path": sw_path, "lx_path": lx_path, - "sw_list": [sw_path], "lx_list": [lx_path], - "incl_num": False, - "pattern": "foo", - "freq_type": 1, - "rel_start": 0.2, "rel_end": 0.7, - "abs_start": 2, "abs_end": 15, - "n_tokens": 10, "pos_tags": "JJ", - "invalidated": False} + params = { + "methods": [FilteringModule.Lexicon, FilteringModule.Regexp], + "use_default_sw": True, + "sw_path": sw_path, + "lx_path": lx_path, + "sw_list": [sw_path], + "lx_list": [lx_path], + "incl_num": False, + "pattern": "foo", + "freq_type": 1, + "rel_start": 0.2, + "rel_end": 0.7, + "abs_start": 2, + "abs_end": 15, + "n_tokens": 10, + "pos_tags": "JJ", + "invalidated": False, + } self.editor.setParameters(params) self.assertDictEqual(self.editor.parameters(), params) @@ -527,7 +514,6 @@ def test_set_parameters(self): self.assertFalse(check_boxes[4].isChecked()) self.assertFalse(check_boxes[5].isChecked()) - self.assertEqual(self.combo.currentText(), "Finnish") self.assertEqual(self.sw_combo.currentText(), "Foo") self.assertEqual(self.lx_combo.currentText(), "Bar") self.assertEqual(self.line_edit.text(), "foo") @@ -550,10 +536,10 @@ def test_createinstance(self): self.assertIsInstance(pp[1], MostFrequentTokensFilter) def test_repr(self): - self.assertEqual(str(self.editor), - "Stopwords (Language: English, File: None)") - params = {"methods": [FilteringModule.Lexicon, - FilteringModule.Regexp]} + self.assertEqual( + str(self.editor), "Stopwords (Use default stopwords: Yes, File: None)" + ) + params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp]} self.editor.setParameters(params) self.assertEqual( str(self.editor), @@ -646,10 +632,6 @@ def test_createinstance(self): pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt}) self.assertIsInstance(pp, MaxEntTagger) - # TODO - implement StanfordPOSTagger - # pp = self.editor.createinstance({"method": POSTaggingModule.Stanford}) - # self.assertIsInstance(pp, StanfordPOSTagger) - def test_repr(self): self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")