Skip to content

Commit

Permalink
Merge pull request #1034 from PrimozGodec/iso-lang-settings
Browse files Browse the repository at this point in the history
[ENH] Use ISO language setting in widgets
  • Loading branch information
VesnaT authored Feb 2, 2024
2 parents e7c360d + c6a7082 commit 2ca7d7c
Show file tree
Hide file tree
Showing 11 changed files with 253 additions and 111 deletions.
51 changes: 41 additions & 10 deletions orangecontrib/text/language.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import Counter
from typing import Optional
from typing import Optional, Sequence

from AnyQt.QtCore import Qt
from langdetect import DetectorFactory, detect
Expand Down Expand Up @@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
"grc": "Ancient greek",
"grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
Expand Down Expand Up @@ -104,21 +104,38 @@
None: None,
}
LANG2ISO = {lang: code for code, lang in ISO2LANG.items()}
DEFAULT_LANGUAGE = "English"
DEFAULT_LANGUAGE = "en"


class LanguageModel(PyListModel):
    """
    Model for language selection dropdowns in the widgets.

    The items stored in the model are ISO language codes (optionally preceded
    by ``None``); the display role shows the corresponding language name.
    """

    def __init__(
        self, include_none: bool = False, languages: Optional[Sequence[str]] = None
    ):
        """
        Parameters
        ----------
        include_none
            Indicates if "(no language)" value is available on the top of the list
        languages
            ISO codes of the languages available in the dropdown.
            If None all add-on supported languages are available.
        """
        if languages is None:
            # No explicit list given: offer every ISO code known to ISO2LANG
            # (its None key is filtered out), ordered by the language name
            # that is displayed rather than by the code itself.
            languages = sorted(filter(None, ISO2LANG), key=ISO2LANG.get)
        # Copy into a list so that any sequence type (e.g. a tuple) can be
        # passed; `[None] + languages` would fail for non-list sequences.
        items = list(languages)
        if include_none:
            items.insert(0, None)
        super().__init__(iterable=items)

    def data(self, index, role=Qt.DisplayRole):
        """
        Return the item data for `index`.

        For the display role the stored ISO code is translated to its language
        name and a stored None is shown as "(no language)"; every other role
        is delegated unchanged to the parent model.
        """
        if role != Qt.DisplayRole:
            return super().data(index, role)
        value = super().data(index, role)
        if value is None:
            return "(no language)"
        return ISO2LANG[value]


DetectorFactory.seed = 0
Expand Down Expand Up @@ -167,3 +184,17 @@ def infer_language_from_variable(variable: DiscreteVariable) -> Optional[str]:
Language ISO code if all documents have the same language, None otherwise
"""
return variable.values[0] if len(variable.values) == 1 else None


# this dictionary holds all changes in language names
LANGUAGE_MIGRATIONS = {
"Ancient greek": "Ancient Greek"
}


def migrate_language_name(language: str) -> str:
    """
    Translate a language name to the name currently used by the add-on.

    Some language names were changed after they were introduced. A name
    listed in LANGUAGE_MIGRATIONS is replaced by its new name; any other
    value is returned unchanged.
    """
    if language in LANGUAGE_MIGRATIONS:
        return LANGUAGE_MIGRATIONS[language]
    return language
21 changes: 15 additions & 6 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@

from orangecontrib.text.corpus import Corpus, get_sample_corpora_dir
from orangecontrib.text.language import (
LANG2ISO,
detect_language,
ISO2LANG,
LanguageModel,
LANG2ISO,
migrate_language_name,
)
from orangecontrib.text.widgets.utils import widgets, QSize

Expand Down Expand Up @@ -106,6 +106,7 @@ class Outputs:
key=list(FileFormat.readers.values()).index)))

settingsHandler = CorpusContextHandler()
settings_version = 2

recent_files = Setting([
"book-excerpts.tab",
Expand All @@ -116,7 +117,7 @@ class Outputs:
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")
language: str = ContextSetting("English")
language: str = ContextSetting("en")

class Error(OWWidget.Error):
read_file = Msg("Can't read file ({})")
Expand Down Expand Up @@ -163,7 +164,7 @@ def __init__(self):
self,
"language",
label="Language",
model=LanguageModel(),
model=LanguageModel(include_none=True),
sendSelectedValue=True,
**common_settings
)
Expand Down Expand Up @@ -253,7 +254,7 @@ def on_done(self, corpus: Corpus) -> None:
return
# set language on Corpus's language (when corpus with already defined
# language opened) or guess language
self.language = ISO2LANG[corpus.language or detect_language(corpus)]
self.language = corpus.language or detect_language(corpus)
self.openContext(self.corpus)
self.used_attrs_model.extend(self.used_attrs)
self.unused_attrs_model.extend(
Expand Down Expand Up @@ -341,7 +342,7 @@ def remove_duplicates(l):
self.Error.no_text_features_used()

corpus.set_title_variable(self.title_variable)
corpus.attributes["language"] = LANG2ISO[self.language]
corpus.attributes["language"] = self.language
# prevent sending "empty" corpora
dom = corpus.domain
empty = (
Expand Down Expand Up @@ -369,6 +370,14 @@ def describe(features):
('Target', describe(domain.class_vars)),
))

@classmethod
def migrate_context(cls, context, version):
    """
    Convert the language stored in a context saved before version 2.

    The stored entry is a (value, type) pair whose value is a language name;
    the name is first mapped to its current spelling and then replaced by
    the matching ISO code. The type part of the pair is kept as it was.
    """
    if version < 2 and "language" in context.values:
        old_name, type_ = context.values["language"]
        iso_code = LANG2ISO[migrate_language_name(old_name)]
        context.values["language"] = (iso_code, type_)


if __name__ == '__main__':
from orangewidget.utils.widgetpreview import WidgetPreview
Expand Down
16 changes: 13 additions & 3 deletions orangecontrib/text/widgets/owcreatecorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
from orangewidget.settings import Setting

from orangecontrib.text import Corpus
from orangecontrib.text.language import LANG2ISO, DEFAULT_LANGUAGE, LanguageModel
from orangecontrib.text.language import (
DEFAULT_LANGUAGE, LanguageModel, LANG2ISO, migrate_language_name
)


class EditorsVerticalScrollArea(gui.VerticalScrollArea):
Expand Down Expand Up @@ -78,6 +80,7 @@ class Outputs:

want_main_area = False

settings_version = 2
language: str = Setting(DEFAULT_LANGUAGE)
texts: List[Tuple[str, str]] = Setting([("", "")] * 3)
auto_commit: bool = Setting(True)
Expand All @@ -90,7 +93,7 @@ def __init__(self):
self.controlArea,
self,
"language",
model=LanguageModel(),
model=LanguageModel(include_none=True),
box="Language",
orientation=Qt.Horizontal,
callback=self.commit.deferred,
Expand Down Expand Up @@ -157,14 +160,21 @@ def commit(self):
np.empty((len(self.texts), 0)),
metas=np.array(self.texts),
text_features=[doc_var],
language=LANG2ISO[self.language],
language=self.language,
)
corpus.set_title_variable(title_var)
self.Outputs.corpus.send(corpus)

def sizeHint(self) -> QSize:
    """Return a fixed size hint of 600 x 650 for the widget."""
    width, height = 600, 650
    return QSize(width, height)

@classmethod
def migrate_settings(cls, settings, version):
    """
    Convert the language stored in settings saved before version 2.

    Unversioned settings and settings older than version 2 hold a language
    name; it is mapped to its current spelling and replaced by the matching
    ISO code. Settings without a "language" entry are left untouched.
    """
    is_outdated = version is None or version < 2
    if is_outdated and "language" in settings:
        current_name = migrate_language_name(settings["language"])
        settings["language"] = LANG2ISO[current_name]


if __name__ == "__main__":
from orangewidget.utils.widgetpreview import WidgetPreview
Expand Down
27 changes: 15 additions & 12 deletions orangecontrib/text/widgets/owdocumentembedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from Orange.widgets.widget import Msg, Output, OWWidget

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.language import (
ISO2LANG, DEFAULT_LANGUAGE, LanguageModel, LANG2ISO
)
from orangecontrib.text.vectorization.document_embedder import (
AGGREGATORS,
AGGREGATORS_ITEMS,
Expand Down Expand Up @@ -39,10 +41,9 @@ class OWDocumentEmbedding(OWBaseVectorizer):
priority = 300

buttons_area_orientation = Qt.Vertical
settings_version = 2
settings_version = 3

Methods = [SBERT, DocumentEmbedder]
DEFAULT_LANGUAGE = "English"

class Outputs(OWBaseVectorizer.Outputs):
skipped = Output("Skipped documents", Corpus)
Expand Down Expand Up @@ -84,7 +85,7 @@ def create_configuration_layout(self):
ibox,
self,
"language",
items=[ISO2LANG[lg] for lg in LANGUAGES],
model=LanguageModel(languages=LANGUAGES),
label="Language:",
sendSelectedValue=True, # value is actual string not index
orientation=Qt.Horizontal,
Expand All @@ -108,10 +109,10 @@ def create_configuration_layout(self):
def set_data(self, corpus):
# set language from corpus as selected language
if corpus and corpus.language in LANGUAGES:
self.language = ISO2LANG[corpus.language]
self.language = corpus.language
else:
# if Corpus's language not supported use default language
self.language = self.DEFAULT_LANGUAGE
self.language = DEFAULT_LANGUAGE

# when workflow loaded use language saved in workflow
if self.__pending_language is not None:
Expand All @@ -127,9 +128,7 @@ def update_method(self):
self.vectorizer = EmbeddingVectorizer(self.init_method(), self.corpus)

def init_method(self):
    """
    Instantiate the embedder selected by `self.method`.

    `self.method` indexes both `self.Methods` and the tuple of keyword
    arguments below: the entry at index 0 is constructed without arguments,
    the entry at index 1 receives the language and the aggregator.

    Returns
    -------
    The object created by calling the selected entry of `self.Methods`.
    """
    # self.language already holds an ISO code, so it is passed on unchanged;
    # looking it up in LANG2ISO (keyed by language names) would raise KeyError.
    params = dict(language=self.language, aggregator=self.aggregator)
    kwargs = ({}, params)[self.method]
    return self.Methods[self.method](**kwargs)

Expand Down Expand Up @@ -170,18 +169,22 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
settings["language"] = LANGUAGES[settings["language"]]
if "aggregator" in settings:
settings["aggregator"] = AGGREGATORS[settings["aggregator"]]
if version is None or version < 3 and "language" in settings:
# before version 3 language settings were language names, transform to ISO
settings["language"] = LANG2ISO[settings["language"]]

def send_report(self):
    """
    Add the embedder configuration to the widget report.

    For method 0 only the embedder name is reported. For method 1 the
    embedder name, the language and the aggregator are reported; the
    language is stored as an ISO code and is shown by its language name.
    """
    if self.method == 0:
        self.report_items((
            ("Embedder", "Multilingual SBERT"),
        ))
    if self.method == 1:
        items = (
            ("Embedder", "fastText"),
            # translate the stored ISO code to a human-readable name
            ("Language", ISO2LANG[self.language]),
            ("Aggregator", self.aggregator),
        )
        self.report_items(items)


if __name__ == "__main__":
Expand Down
21 changes: 13 additions & 8 deletions orangecontrib/text/widgets/owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,7 @@
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.import_documents import ImportDocuments, NoDocumentsException
from orangecontrib.text.language import (
ISO2LANG,
detect_language,
LANG2ISO,
LanguageModel,
detect_language, LanguageModel, DEFAULT_LANGUAGE, LANG2ISO, migrate_language_name
)

# domain for skipped images output
Expand Down Expand Up @@ -124,6 +121,7 @@ class Outputs:
skipped_documents = Output("Skipped documents", Table)

settingsHandler = ImportDocumentContextHandler()
settings_version = 2

LOCAL_FILE, URL = range(2)
source = settings.Setting(LOCAL_FILE)
Expand All @@ -134,7 +132,7 @@ class Outputs:
lemma_cb = settings.Setting(True)
pos_cb = settings.Setting(False)
ner_cb = settings.Setting(False)
language: str = settings.ContextSetting("English")
language: str = settings.ContextSetting(DEFAULT_LANGUAGE)

want_main_area = False
resizing_enabled = False
Expand Down Expand Up @@ -253,7 +251,7 @@ def __init__(self):
self,
"language",
box="Language",
model=LanguageModel(),
model=LanguageModel(include_none=True),
sendSelectedValue=True,
searchable=True,
callback=self.commit,
Expand Down Expand Up @@ -665,7 +663,7 @@ def __onRunFinished(self):
self.n_text_data = len(corpus)
self.n_text_categories = len(corpus.domain.class_var.values) \
if corpus.domain.class_var else 0
self.language = ISO2LANG[corpus.language or detect_language(corpus)]
self.language = corpus.language or detect_language(corpus)
self.openContext(corpus)
else:
self.language = None
Expand Down Expand Up @@ -727,7 +725,7 @@ def commit(self):
if self.is_conllu:
self.add_features()
if self.corpus:
self.corpus.attributes["language"] = LANG2ISO[self.language]
self.corpus.attributes["language"] = self.language
self.Outputs.data.send(self.corpus)
if self.skipped_documents:
skipped_table = (
Expand Down Expand Up @@ -791,6 +789,13 @@ def send_report(self):
items += [('Number of skipped', len(self.skipped_documents))]
self.report_items(items, )

@classmethod
def migrate_context(cls, context, version):
    """
    Convert the language stored in a context saved before version 2.

    The stored value is a language name; it is mapped to its current
    spelling and replaced by the matching ISO code. Contexts without a
    "language" entry are left untouched.
    """
    if version < 2 and "language" in context.values:
        current_name = migrate_language_name(context.values["language"])
        context.values["language"] = LANG2ISO[current_name]


class UserInterruptError(BaseException):
"""
Expand Down
Loading

0 comments on commit 2ca7d7c

Please sign in to comment.