Skip to content

Commit

Permalink
Preprocess Widget - Use ISO language format for stop words settings
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Nov 17, 2023
1 parent 56cfc5a commit d4588b3
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 21 deletions.
84 changes: 73 additions & 11 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from typing import Dict, Optional, List, Callable, Tuple, Type, Union
from typing import Dict, Optional, List, Callable, Tuple, Type, Union, Iterable
from types import SimpleNamespace
import os
import random
import pkg_resources

from AnyQt.QtCore import Qt, pyqtSignal
from AnyQt.QtCore import Qt, pyqtSignal, QModelIndex
from AnyQt.QtWidgets import QComboBox, QButtonGroup, QLabel, QCheckBox, \
QRadioButton, QGridLayout, QLineEdit, QSpinBox, QFormLayout, QHBoxLayout, \
QDoubleSpinBox, QFileDialog, QAbstractSpinBox
Expand All @@ -24,6 +24,7 @@
from Orange.widgets.widget import Input, Output, Msg, Message

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
Expand Down Expand Up @@ -63,6 +64,57 @@ def __init__(self, master: BaseEditor, items: List[str], value: str,
self.currentTextChanged.connect(callback)


class LanguageComboBox(QComboBox):
    """A combo box for selecting a language.

    Every item displays the language name looked up in ``ISO2LANG`` and
    carries the matching ISO code as its item data. An optional leading
    item labelled ``_DEFAULT_NONE`` carries ``None`` as its data.
    """

    def __init__(
        self,
        parent: Optional[BaseEditor],
        items: Iterable[str],
        value: Optional[str],
        include_none: bool,
        callback: Callable,
    ):
        """
        Parameters
        ----------
        parent
            Combo box's parent widget.
        items
            Combo box's languages (items) as ISO codes.
        value
            Box's initial value (as an ISO code).
        include_none
            Boolean indicating whether to include the none option at the
            start of the list.
        callback
            Called with the item data (ISO code, or None for the none
            option) of the current item whenever the current index changes.
        """
        super().__init__(parent)
        self.setMinimumWidth(80)
        self.__add_items(items, include_none)
        # The initial value is selected before the signal is connected, so
        # constructing the box does not invoke the callback.
        self.set_current_language(value)
        self.callback = callback
        self.currentIndexChanged.connect(self.__index_changed)

    def __add_items(self, items: Iterable[str], include_none: bool):
        """Fill the box: optional none item first, then languages by name."""
        if include_none:
            self.addItem(_DEFAULT_NONE, None)
        # Order by the displayed language name, not by the ISO code.
        for iso_code in sorted(items, key=ISO2LANG.get):
            self.addItem(ISO2LANG[iso_code], iso_code)

    def __index_changed(self, index: int):
        """Forward the item data of the newly selected row to the callback.

        ``currentIndexChanged`` delivers the row as a plain ``int``.
        """
        self.callback(self.itemData(index))

    def set_current_language(self, iso_language: Optional[str]):
        """
        Set current element of dropdown from ISO language code.

        Parameters
        ----------
        iso_language
            The ISO language code of element to be selected.
        """
        # NOTE(review): per Qt docs findData returns -1 when no item holds
        # the code, which leaves the box without a current item - confirm
        # that callers never pass an unsupported code.
        index = self.findData(iso_language)
        self.setCurrentIndex(index)


class UDPipeComboBox(QComboBox):
def __init__(self, master: BaseEditor, value: str, default: str,
callback: Callable):
Expand Down Expand Up @@ -570,7 +622,7 @@ class FilteringModule(MultipleMethodModule):
MostFreq: MostFrequentTokensFilter,
PosTag: PosTagFilter}
DEFAULT_METHODS = [Stopwords]
DEFAULT_LANG = "English"
DEFAULT_LANG = "en"
DEFAULT_NONE = None
DEFAULT_INCL_NUM = False
DEFAULT_PATTERN = r"\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|" \
Expand All @@ -597,9 +649,12 @@ def __init__(self, parent=None, **kwargs):
self.__pos_tag = self.DEFAULT_POS_TAGS
self.__invalidated = False

self.__combo = ComboBox(
self, [_DEFAULT_NONE] + StopwordsFilter.supported_languages(),
self.__sw_lang, self.__set_language
self.__combo = LanguageComboBox(
self,
StopwordsFilter.supported_languages(),
self.__sw_lang,
True,
self.__set_language,
)
self.__sw_loader = FileLoader()
self.__sw_loader.set_file_list()
Expand Down Expand Up @@ -755,10 +810,10 @@ def setParameters(self, params: Dict):
self.__set_tags(params.get("pos_tags", self.DEFAULT_POS_TAGS))
self.__invalidated = False

def __set_language(self, language: str):
def __set_language(self, language: Optional[str]):
if self.__sw_lang != language:
self.__sw_lang = language
self.__combo.setCurrentText(language)
self.__combo.set_current_language(language)
self.changed.emit()
if self.Stopwords in self.methods:
self.edited.emit()
Expand Down Expand Up @@ -899,8 +954,8 @@ def __repr__(self):
texts = []
for method in self.methods:
if method == self.Stopwords:
append = f"Language: {self.__sw_lang}, " \
f"File: {_to_abspath(self.__sw_file)}"
language = ISO2LANG[self.__sw_lang]
append = f"Language: {language}, File: {_to_abspath(self.__sw_file)}"
elif method == self.Lexicon:
append = f"File: {_to_abspath(self.__lx_file)}"
elif method == self.Numbers:
Expand Down Expand Up @@ -1026,7 +1081,7 @@ class OWPreprocess(Orange.widgets.data.owpreprocess.OWPreprocess,
priority = 200
keywords = "preprocess text, text"

settings_version = 3
settings_version = 4

class Inputs:
corpus = Input("Corpus", Corpus)
Expand Down Expand Up @@ -1320,6 +1375,13 @@ def str_into_paths(label):
del pp_settings["start"]
del pp_settings["end"]

# before version 4 languages were saved as full-word language strings
if version < 4:
preprocessors = settings["storedsettings"]["preprocessors"]
for pp_name, pp in preprocessors:
if pp_name == "preprocess.filter" and "language" in pp:
pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])


if __name__ == "__main__":
from Orange.widgets.utils.widgetpreview import WidgetPreview
Expand Down
125 changes: 115 additions & 10 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,25 @@
from Orange.data import Domain, StringVariable
from orangewidget.utils.filedialogs import RecentPath
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.preprocess import RegexpTokenizer, WhitespaceTokenizer, \
LowercaseTransformer, HtmlTransformer, PorterStemmer, SnowballStemmer, \
UDPipeLemmatizer, StopwordsFilter, MostFrequentTokensFilter, NGrams
from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger
from orangecontrib.text.tests.test_preprocess import SF_LIST, SERVER_FILES
from orangecontrib.text.widgets.owpreprocess import OWPreprocess, \
TransformationModule, TokenizerModule, NormalizationModule, \
FilteringModule, NgramsModule, POSTaggingModule
from orangecontrib.text.widgets.owpreprocess import (
OWPreprocess,
TransformationModule,
TokenizerModule,
NormalizationModule,
FilteringModule,
NgramsModule,
POSTaggingModule,
LanguageComboBox,
_DEFAULT_NONE,
)


@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
Expand Down Expand Up @@ -211,7 +220,7 @@ def test_migrate_settings_filter(self):
"use_df": False, "use_keep_n": False}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.filter",
{"methods": [0, 2, 4], "language": "Finnish",
{"methods": [0, 2, 4], "language": "fi",
"sw_path": None, "sw_list": [],
"lx_path": None, "lx_list": [],
"pattern": "foo", "rel_start": 0.3,
Expand Down Expand Up @@ -262,6 +271,45 @@ def test_migrate_settings(self):
}
self.create_widget(OWPreprocess, stored_settings=settings)

def test_migrate_language_settings(self):
    """Test migration to ISO language codes."""
    normalize_params = {
        "snowball_language": "French",
        "udpipe_language": "German",
        "lemmagen_language": "Slovenian",
    }
    preprocessors = [
        ("preprocess.normalize", normalize_params),
        ("preprocess.filter", {"language": "Finnish"}),
    ]
    settings = {
        "__version__": 3,
        "storedsettings": {"preprocessors": preprocessors},
    }
    widget = self.create_widget(OWPreprocess, stored_settings=settings)
    migrated = widget.storedsettings["preprocessors"]
    normalize_settings = migrated[0][1]
    filter_settings = migrated[1][1]

    # the normalizer languages are expected to stay full-word names
    unchanged = (
        ("lemmagen_language", "Slovenian"),
        ("snowball_language", "French"),
        ("udpipe_language", "German"),
    )
    for key, expected in unchanged:
        self.assertEqual(expected, normalize_settings[key])
    # the stop words language is expected as an ISO code
    self.assertEqual("fi", filter_settings["language"])

    # NLTK uses Slovene instead of Slovenian, which is why the preprocess
    # widget used to store the language as Slovene;
    # check that it is mapped correctly
    settings = {
        "__version__": 3,
        "storedsettings": {
            "preprocessors": [("preprocess.filter", {"language": "Slovene"})]
        },
    }
    widget = self.create_widget(OWPreprocess, stored_settings=settings)
    filter_settings = widget.storedsettings["preprocessors"][0][1]
    self.assertEqual("sl", filter_settings["language"])


class TestTransformationModule(WidgetTest):
def setUp(self):
Expand Down Expand Up @@ -522,7 +570,7 @@ def test_init(self):

def test_parameters(self):
params = {"methods": [FilteringModule.Stopwords],
"language": "English", "sw_path": None, "lx_path": None,
"language": "en", "sw_path": None, "lx_path": None,
"sw_list": [], "lx_list": [],
"incl_num": False,
"pattern": FilteringModule.DEFAULT_PATTERN,
Expand All @@ -537,7 +585,7 @@ def test_set_parameters(self):
sw_path = RecentPath.create("Foo", [])
lx_path = RecentPath.create("Bar", [])
params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp],
"language": "Finnish",
"language": "fi",
"sw_path": sw_path, "lx_path": lx_path,
"sw_list": [sw_path], "lx_list": [lx_path],
"incl_num": False,
Expand Down Expand Up @@ -581,10 +629,13 @@ def test_createinstance(self):
self.assertIsInstance(pp[1], MostFrequentTokensFilter)

def test_repr(self):
self.assertEqual(str(self.editor),
"Stopwords (Language: English, File: None)")
params = {"methods": [FilteringModule.Lexicon,
FilteringModule.Regexp]}
self.assertEqual(str(self.editor), "Stopwords (Language: English, File: None)")
params = self.editor.parameters()
params["language"] = None
self.editor.setParameters(params)
self.assertEqual(str(self.editor), "Stopwords (Language: None, File: None)")

params = {"methods": [FilteringModule.Lexicon, FilteringModule.Regexp]}
self.editor.setParameters(params)
self.assertEqual(
str(self.editor),
Expand Down Expand Up @@ -685,5 +736,59 @@ def test_repr(self):
self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")


class TestLanguageComboBox(WidgetTest):
    """Tests of LanguageComboBox items, selection and change callback."""

    # ISO codes handed to the box, deliberately not in alphabetical order
    ISO_CODES = ["sl", "en", "sv", "fi"]
    # item texts expected in the box, without and with the none option
    NAMES = ["English", "Finnish", "Slovenian", "Swedish"]
    NAMES_WITH_NONE = [_DEFAULT_NONE, "English", "Finnish", "Slovenian", "Swedish"]

    @staticmethod
    def _item_texts(combo):
        """Return the texts of all items of the combo box in row order."""
        return [combo.itemText(row) for row in range(combo.count())]

    def test_basic_setup(self):
        callback = Mock()
        combo = LanguageComboBox(None, self.ISO_CODES, "fi", False, callback)
        self.assertEqual(4, combo.count())
        self.assertEqual(self.NAMES, self._item_texts(combo))
        self.assertEqual("Finnish", combo.currentText())

    def test_include_none(self):
        callback = Mock()
        combo = LanguageComboBox(None, self.ISO_CODES, "fi", True, callback)
        self.assertEqual(5, combo.count())
        self.assertEqual(self.NAMES_WITH_NONE, self._item_texts(combo))
        self.assertEqual("Finnish", combo.currentText())

        # test with current item None
        combo = LanguageComboBox(None, self.ISO_CODES, None, True, callback)
        self.assertEqual(5, combo.count())
        self.assertEqual(self.NAMES_WITH_NONE, self._item_texts(combo))
        self.assertEqual(_DEFAULT_NONE, combo.currentText())

    def test_set_current_language(self):
        callback = Mock()
        combo = LanguageComboBox(None, self.ISO_CODES, "fi", True, callback)
        self.assertEqual("Finnish", combo.currentText())
        combo.set_current_language("sl")
        self.assertEqual("Slovenian", combo.currentText())
        combo.set_current_language(None)
        self.assertEqual(_DEFAULT_NONE, combo.currentText())

    def test_change_item(self):
        callback = Mock()
        combo = LanguageComboBox(None, self.ISO_CODES, "fi", True, callback)
        self.assertEqual(self.NAMES_WITH_NONE, self._item_texts(combo))
        # building the box must not trigger the callback
        callback.assert_not_called()
        simulate.combobox_activate_item(combo, "Slovenian")
        callback.assert_called_once_with("sl")
        callback.reset_mock()
        simulate.combobox_activate_item(combo, _DEFAULT_NONE)
        callback.assert_called_once_with(None)


if __name__ == "__main__":
unittest.main()

0 comments on commit d4588b3

Please sign in to comment.