Skip to content

Commit

Permalink
Settings: Settings - Part-of-speech Tagging - Tagsets - Mapping Sett…
Browse files Browse the repository at this point in the history
…ings - Allow editing of tagset mapping of spaCy's Catalan, Danish, French, Greek (Modern), Macedonian, Norwegian (Bokmål), Portuguese, Russian, Spanish, and Ukrainian part-of-speech taggers
  • Loading branch information
BLKSerene committed Jan 12, 2024
1 parent 509ec63 commit e00ca2a
Show file tree
Hide file tree
Showing 28 changed files with 517 additions and 93 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic

### ✨ Improvements
- Menu: Allow editing of tagset mapping of spaCy's Catalan, Danish, French, Greek (Modern), Macedonian, Norwegian (Bokmål), Portuguese, Russian, Spanish, and Ukrainian part-of-speech taggers
- Utils: Update custom stop word lists

### 📌 Bugfixes
Expand Down
16 changes: 15 additions & 1 deletion wordless/wl_nlp/wl_pos_tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
from wordless.wl_nlp import wl_nlp_utils, wl_word_tokenization
from wordless.wl_utils import wl_conversion

# spaCy POS taggers whose tags are still passed through the per-language
# tagset mapping when the universal tagset is requested (other spaCy and
# Stanza taggers bypass that conversion).
UNIVERSAL_TAGSETS_SPACY = [
    f'spacy_{lang_code}'
    for lang_code in (
        'cat', 'dan', 'fra', 'ell', 'mkd',
        'nob', 'por', 'rus', 'spa', 'ukr'
    )
]

def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default'):
tokens_tagged = []

Expand Down Expand Up @@ -168,7 +173,16 @@ def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default'):
tokens_tagged.insert(empty_offset, ('', ''))

# Convert to universal POS tags
if not pos_tagger.startswith('spacy_') and not pos_tagger.startswith('stanza_') and tagset == 'universal':
if (
tagset == 'universal'
and (
(
not pos_tagger.startswith('spacy_')
and not pos_tagger.startswith('stanza_')
)
or pos_tagger in UNIVERSAL_TAGSETS_SPACY
)
):
mappings = {
tag: tag_universal
for tag, tag_universal, _, _ in main.settings_custom['pos_tagging']['tagsets']['mapping_settings'][lang][pos_tagger]
Expand Down
84 changes: 67 additions & 17 deletions wordless/wl_settings/wl_settings_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,35 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

import copy

import networkx
from PyQt5.QtCore import QCoreApplication
from PyQt5.QtWidgets import QDesktopWidget

from wordless.wl_settings import wl_settings_global
from wordless.wl_tagsets import (
wl_tagset_universal,
wl_tagset_cat_universal,
wl_tagset_dan_universal,
wl_tagset_eng_penn_treebank,
wl_tagset_eng_universal,
wl_tagset_ell_universal,
wl_tagset_fra_universal,
wl_tagset_jpn_unidic,
wl_tagset_khm_alt,
wl_tagset_kor_mecab,
wl_tagset_lao_seqlabeling,
wl_tagset_lao_yunshan_cup_2020,
wl_tagset_nor_universal,
wl_tagset_por_universal,
wl_tagset_rus_open_corpora,
wl_tagset_rus_russian_national_corpus,
wl_tagset_rus_universal,
wl_tagset_spa_universal,
wl_tagset_tha_blackboard,
wl_tagset_tha_orchid,
wl_tagset_bod_botok,
wl_tagset_ukr_universal,
wl_tagset_vie_underthesea
)
from wordless.wl_utils import wl_misc, wl_paths
Expand Down Expand Up @@ -1580,51 +1591,90 @@ def init_settings_default(main):
},

'mapping_settings': {
'cat': {
'spacy_cat': copy.deepcopy(wl_tagset_cat_universal.tagset_mapping)
},

'dan': {
'spacy_dan': copy.deepcopy(wl_tagset_dan_universal.tagset_mapping)
},

'eng_gb': {
'nltk_perceptron_eng': wl_tagset_eng_penn_treebank.MAPPINGS,
'nltk_perceptron_eng': wl_tagset_eng_penn_treebank.tagset_mapping,
},

'eng_us': {
'nltk_perceptron_eng': wl_tagset_eng_penn_treebank.MAPPINGS,
'nltk_perceptron_eng': wl_tagset_eng_penn_treebank.tagset_mapping,
},

'ell': {
'spacy_ell': copy.deepcopy(wl_tagset_ell_universal.tagset_mapping)
},

'fra': {
'spacy_fra': copy.deepcopy(wl_tagset_fra_universal.tagset_mapping)
},

'jpn': {
'sudachipy_jpn': wl_tagset_jpn_unidic.MAPPINGS
'sudachipy_jpn': wl_tagset_jpn_unidic.tagset_mapping
},

'khm': {
'khmer_nltk_khm': wl_tagset_khm_alt.MAPPINGS
'khmer_nltk_khm': wl_tagset_khm_alt.tagset_mapping
},

'kor': {
'python_mecab_ko_mecab': wl_tagset_kor_mecab.MAPPINGS
'python_mecab_ko_mecab': wl_tagset_kor_mecab.tagset_mapping
},

'lao': {
'laonlp_seqlabeling': wl_tagset_lao_seqlabeling.MAPPINGS,
'laonlp_yunshan_cup_2020': wl_tagset_lao_yunshan_cup_2020.MAPPINGS
'laonlp_seqlabeling': wl_tagset_lao_seqlabeling.tagset_mapping,
'laonlp_yunshan_cup_2020': wl_tagset_lao_yunshan_cup_2020.tagset_mapping
},

'mkd': {
'spacy_mkd': copy.deepcopy(wl_tagset_eng_universal.tagset_mapping)
},

'nob': {
'spacy_nob': copy.deepcopy(wl_tagset_nor_universal.tagset_mapping)
},

'por_br': {
'spacy_por': copy.deepcopy(wl_tagset_por_universal.tagset_mapping)
},

'por_pt': {
'spacy_por': copy.deepcopy(wl_tagset_por_universal.tagset_mapping)
},

'rus': {
'nltk_perceptron_rus': wl_tagset_rus_russian_national_corpus.MAPPINGS,
'pymorphy3_morphological_analyzer': wl_tagset_rus_open_corpora.MAPPINGS
'nltk_perceptron_rus': wl_tagset_rus_russian_national_corpus.tagset_mapping,
'pymorphy3_morphological_analyzer': wl_tagset_rus_open_corpora.tagset_mapping,
'spacy_rus': copy.deepcopy(wl_tagset_rus_universal.tagset_mapping)
},

'spa': {
'spacy_spa': copy.deepcopy(wl_tagset_spa_universal.tagset_mapping)
},

'tha': {
'pythainlp_perceptron_blackboard': wl_tagset_tha_blackboard.MAPPINGS,
'pythainlp_perceptron_orchid': wl_tagset_tha_orchid.MAPPINGS,
'pythainlp_perceptron_pud': wl_tagset_universal.MAPPINGS
'pythainlp_perceptron_blackboard': wl_tagset_tha_blackboard.tagset_mapping,
'pythainlp_perceptron_orchid': wl_tagset_tha_orchid.tagset_mapping,
'pythainlp_perceptron_pud': copy.deepcopy(wl_tagset_eng_universal.tagset_mapping)
},

'bod': {
'botok_bod': wl_tagset_bod_botok.MAPPINGS
'botok_bod': wl_tagset_bod_botok.tagset_mapping
},

'ukr': {
'pymorphy3_morphological_analyzer': wl_tagset_rus_open_corpora.MAPPINGS
'pymorphy3_morphological_analyzer': wl_tagset_rus_open_corpora.tagset_mapping,
'spacy_ukr': copy.deepcopy(wl_tagset_ukr_universal.tagset_mapping)
},

'vie': {
'underthesea_vie': wl_tagset_vie_underthesea.MAPPINGS
'underthesea_vie': wl_tagset_vie_underthesea.tagset_mapping
}
}
}
Expand Down Expand Up @@ -2342,7 +2392,7 @@ def init_settings_default(main):
}

# Tagsets
settings_default['pos_tagging']['tagsets']['preview_settings']['preview_pos_tagger'] = settings_default['pos_tagging']['pos_tagger_settings']['pos_taggers']
settings_default['pos_tagging']['tagsets']['preview_settings']['preview_pos_tagger'] = settings_default['pos_tagging']['pos_tagger_settings']['pos_taggers'].copy()

# Custom stop word lists
for lang in wl_settings_global.SETTINGS_GLOBAL['langs'].values():
Expand Down
27 changes: 18 additions & 9 deletions wordless/wl_settings/wl_settings_pos_tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
from PyQt5.QtCore import pyqtSignal, Qt
from PyQt5.QtGui import QStandardItem
from PyQt5.QtWidgets import (
QCheckBox, QGroupBox, QLabel, QPushButton, QStackedWidget,
QTextEdit, QWidget
QCheckBox, QGroupBox, QLabel, QPlainTextEdit, QPushButton,
QStackedWidget, QTextEdit
)

from wordless.wl_dialogs import wl_dialogs_misc, wl_msg_boxes
Expand Down Expand Up @@ -271,8 +271,6 @@ def __init__(self, main):

self.pos_tag_mappings_loaded = False

self.settings_tagsets = QWidget(self)

# Preview Settings
self.group_box_preview_settings = QGroupBox(self.tr('Preview Settings:'), self)

Expand Down Expand Up @@ -316,7 +314,8 @@ def __init__(self, main):
self.stacked_widget_num_pos_tags.addWidget(self.label_tagsets_num_pos_tags)
self.stacked_widget_num_pos_tags.addWidget(self.label_tagsets_uneditable)

self.table_mappings.setItemDelegate(wl_item_delegates.Wl_Item_Delegate_Combo_Box(
self.table_mappings.setItemDelegateForColumn(0, wl_item_delegates.Wl_Item_Delegate_Uneditable(self.table_mappings))
self.table_mappings.setItemDelegateForColumn(1, wl_item_delegates.Wl_Item_Delegate_Combo_Box(
parent = self.table_mappings,
items = [
'ADJ',
Expand All @@ -338,9 +337,10 @@ def __init__(self, main):
'SYM',
'X'
],
col = 1,
editable = True
))
self.table_mappings.setItemDelegateForColumn(2, wl_item_delegates.Wl_Item_Delegate(self.table_mappings, QPlainTextEdit))
self.table_mappings.setItemDelegateForColumn(3, wl_item_delegates.Wl_Item_Delegate(self.table_mappings, QPlainTextEdit))

self.button_tagsets_reset.setMinimumWidth(100)
self.button_tagsets_reset_all.setMinimumWidth(100)
Expand Down Expand Up @@ -399,7 +399,12 @@ def preview_pos_tagger_changed(self):

preview_pos_tagger = self.settings_custom['preview_settings']['preview_pos_tagger'][preview_lang]

if not preview_pos_tagger.startswith('spacy_') and not preview_pos_tagger.startswith('stanza_'):
if (
(
not preview_pos_tagger.startswith('spacy_')
and not preview_pos_tagger.startswith('stanza_')
) or preview_pos_tagger in wl_pos_tagging.UNIVERSAL_TAGSETS_SPACY
):
self.combo_box_tagsets_lang.setEnabled(False)
self.combo_box_tagsets_pos_tagger.setEnabled(False)
self.button_tagsets_reset.setEnabled(False)
Expand Down Expand Up @@ -464,6 +469,8 @@ def reset_currently_shown_table(self):

for i in range(self.table_mappings.model().rowCount()):
self.table_mappings.model().item(i, 1).setText(mappings[i][1])
self.table_mappings.model().item(i, 2).setText(mappings[i][2])
self.table_mappings.model().item(i, 3).setText(mappings[i][3])

self.table_mappings.enable_updates()

Expand Down Expand Up @@ -520,10 +527,12 @@ def apply_settings(self):
# Mapping Settings
preview_lang = self.settings_custom['preview_settings']['preview_lang']
preview_pos_tagger = self.settings_custom['preview_settings']['preview_pos_tagger'][preview_lang]
mapping = self.settings_custom['mapping_settings'][preview_lang][preview_pos_tagger]

for i in range(self.table_mappings.model().rowCount()):
if not preview_pos_tagger.startswith('spacy_'):
self.settings_custom['mapping_settings'][preview_lang][preview_pos_tagger][i][1] = self.table_mappings.model().item(i, 1).text()
mapping[i][1] = self.table_mappings.model().item(i, 1).text()
mapping[i][2] = self.table_mappings.model().item(i, 2).text()
mapping[i][3] = self.table_mappings.model().item(i, 3).text()

return True

Expand Down
2 changes: 1 addition & 1 deletion wordless/wl_tagsets/wl_tagset_bod_botok.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# ----------------------------------------------------------------------

# Reference: https://github.com/Esukhia/botok/blob/master/botok/vars.py
MAPPINGS = [
tagset_mapping = [
['ADJ', 'ADJ', 'Adjectives', ''],
['ADP', 'ADP', 'Adposition', ''],
['ADV', 'ADV', 'Adverb', ''],
Expand Down
40 changes: 40 additions & 0 deletions wordless/wl_tagsets/wl_tagset_cat_universal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# ----------------------------------------------------------------------
# Wordless: Tagsets - Universal POS tags - Catalan
# Copyright (C) 2018-2024 Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

# Reference: https://universaldependencies.org/ca/pos/
#
# Default tagset mapping for the Catalan universal POS tagset.
# Each row is a 4-item list:
#   [0] tag produced by the tagger
#   [1] universal POS tag it maps to (identical to [0] by default)
#   [2] human-readable name of the tag
#   [3] example text; '\n' separates example groups, '\t' indents sub-items
# Examples prefixed with a bracketed language name (e.g. "[English]") are not
# Catalan examples.
# Rows are mutable lists rather than tuples -- presumably so that the settings
# dialog can edit them in place (verify against callers).
tagset_mapping = [
    ['ADJ', 'ADJ', 'Adjective', 'gran, vell, verd, incomprensible\nprimer, segon, tercer'],
    ['ADP', 'ADP', 'Adposition', '[English] in, to, during'],
    ['ADV', 'ADV', 'Adverb', 'molt, bé, exactament, demà, dalt, baix\nInterrogative or exclamative adverbs: on, quan, com, per què\nDemonstrative adverbs: aquí, allí, ara, després\nTotality adverbs: sempre\nNegative adverbs: mai'],
    ['AUX', 'AUX', 'Auxiliary', 'Tense auxiliaries: [English] has (done), is (doing), will (do)\nPassive auxiliaries: [English] was (done), got (done)\nModal auxiliaries: [English] should (do), must (do)\nVerbal copulas: [English] (He) is (a teacher.)\nAgreement auxiliaries: [K’iche’] la (2nd person singular formal), alaq (2nd person plural formal)'],
    ['CONJ', 'CONJ', 'Coordinating/subordinating conjunction', 'See CCONJ and SCONJ'],
    ['CCONJ', 'CCONJ', 'Coordinating conjunction', '[English] and, or, but'],
    ['SCONJ', 'SCONJ', 'Subordinating conjunction', '[English] (I believe) that (he will come.), if, while'],
    ['DET', 'DET', 'Determiner', 'Articles (a closed class indicating definiteness, specificity or givenness): [English] a, an, the\nPossessive determiners (which modify a nominal; note that some languages use PRON for similar words): [Czech] můj, tvůj, jeho, její, náš, váš, jejich\nDemonstrative determiners: [English] (I saw) this (car yesterday.)\nInterrogative determiners: [English] Which (car do you like?)\nRelative determiners: [English] (I wonder) which (car you like.)\nQuantity determiners (quantifiers):\n\tIndefinite: [English] any\n\tUniversal: [English] all\n\tNegative: [English] (We have) no (cars available.)'],
    ['INTJ', 'INTJ', 'Interjection', 'psst, ai, bravo, hola, Sí(, perque…), No(, no ho crec.)'],
    ['NOUN', 'NOUN', 'Noun', 'noia, gat, arbre, aire, bellesa'],
    ['PROPN', 'PROPN', 'Proper noun', '[English] Mary, John, London, NATO, HBO, john.doe@universal.org, http://universaldependencies.org/, 1-800-COMPANY'],
    ['NUM', 'NUM', 'Numeral', '0, 1, 2, 3, 4, 5, 2014, 1000000, 3.14159265359\n11/11/1918, 11:00\n[English] one, two, three, seventy-seven\nk (abbreviation for thousand), m (abbreviation for million)\nI, II, III, IV, V, MMXIV'],
    ['PART', 'PART', 'Particle', 'Possessive marker: [English] ’s\nNegation particle: [English] not; [German] nicht\nQuestion particle: [Japanese] か/ka (adding this particle to the end of a clause turns the clause into a question); [Turkish] mu\nSentence modality: [Czech] ať, kéž, nechť'],
    ['PRON', 'PRON', 'Pronoun', 'Personal pronouns: [English] I, you, he, she, it, we, they\nReflexive pronouns: [English] myself, yourself, himself, herself, itself, ourselves, yourselves, theirselves\nInterrogative pronouns: who, What (do you think?)\nRelative pronouns (unlike SCONJ relativizers, relative pronouns play a nominal role in the relative clause): [English] (a cat) who (eats fish), that, which, (I wonder) what (you think.)\nIndefinite pronouns: [English] somebody, something, anybody, anything\nTotal pronouns: [English] everybody, everything\nNegative pronouns: [English] nobody, nothing\nPossessive pronouns (which usually stand alone as a nominal): [English] mine, yours, his, hers, its, ours, theirs\nAttributive possessive pronouns (in some languages; others use DET for similar words): [English] my, your'],
    ['VERB', 'VERB', 'Verb', '[English] run, eat\n[English] runs, ate\n[English] running, eating'],

    ['PUNCT', 'PUNCT', 'Punctuation', 'Period: .\nComma: ,\nParentheses: ()'],
    ['SYM', 'SYM', 'Symbol', '$, %, §, ©\n+, −, ×, ÷, =, <, >\n:), ♥‿♥, 😝'],
    ['X', 'X', 'Other', '[English] (And then he just) xfgh pdl jklw']
]
40 changes: 40 additions & 0 deletions wordless/wl_tagsets/wl_tagset_dan_universal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# ----------------------------------------------------------------------
# Wordless: Tagsets - Universal POS tags - Danish
# Copyright (C) 2018-2024 Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

# Reference: https://universaldependencies.org/da/pos/
#
# Default tagset mapping for the Danish universal POS tagset.
# Each row is a 4-item list:
#   [0] tag produced by the tagger
#   [1] universal POS tag it maps to (identical to [0] by default)
#   [2] human-readable name of the tag
#   [3] example text; '\n' separates example groups
# Rows are mutable lists rather than tuples -- presumably so that the settings
# dialog can edit them in place (verify against callers).
tagset_mapping = [
['ADJ', 'ADJ', 'Adjective', 'gammel/gammelt/gamle, grøn/grønt/grønne, ufatlig/ufatligt/ufatlige'],
['ADP', 'ADP', 'Adposition', 'i, på, gennem'],
['ADV', 'ADV', 'Adverb', 'meget (vigtigt), væk, (jeg spiser) ikke (rejer), pludselig'],
['AUX', 'AUX', 'Auxiliary', 'Tense auxiliary: har (købt)\nModal auxiliary: kunne (tænke)\nPassive auxiliary: blev (fundet)\nCopula: var (grøn), er (en løsning)'],
['CONJ', 'CONJ', 'Coordinating/subordinating conjunction', 'See CCONJ and SCONJ'],
['CCONJ', 'CCONJ', 'Coordinating conjunction', 'og, eller, men'],
['SCONJ', 'SCONJ', 'Subordinating conjunction', 'da, hvis, (konstatere) at (manden har søgt hjælp)'],
['DET', 'DET', 'Determiner', 'Articles: en, et, den, det, de\nPossessive determiners: min (bil), deres (holdninger), dit (job)\nNegative determiners: (han har) ingen (empati)'],
['INTJ', 'INTJ', 'Interjection', 'Hmm!, Åh!, Hej!'],
['NOUN', 'NOUN', 'Noun', 'pige, kat, træ, luft, skønhed'],
['PROPN', 'PROPN', 'Proper noun', 'Anna, Otto\nSkåne, USA\nTexaco, Pirelli'],
['NUM', 'NUM', 'Numeral', '0, 1, 2, 3, 2014, 1 000 000, 3.000,15, 3,14159265359\net, to, tre, nitten\nI, II, III, IV, V, MMXIV'],
['PART', 'PART', 'Particle', '(det er muligt) at (ændre det)'],
['PRON', 'PRON', 'Pronoun', 'Personal (subject) pronouns: jeg, du, han, hun, det/den, vi, I, de\nPlaceholder personal pronoun: man (kan gå)\nPersonal (object)/reflexive pronouns: mig, dig, ham, henne, sig, os, hinanden\nDemonstrative pronouns: dette (er et svært spørgsmål)\nPossessive pronouns: vores\nInterrogative pronouns: hvad\nRelative pronouns: hvis\nIndefinite pronouns: nogen, noget\nTotality pronouns: alting\nNegative pronouns: ingen (af os)'],
['VERB', 'VERB', 'Verb', 'at vise, jeg viser, han viste\nat flyve, vi flyver, de fløj'],

# Non-word categories: punctuation, symbols, and unanalyzable tokens
['PUNCT', 'PUNCT', 'Punctuation', 'Period: .\nComma: ,\nParentheses: ()'],
['SYM', 'SYM', 'Symbol', '§'],
['X', 'X', 'Other', 'musik(- og billedprogrammer)']
]
Loading

0 comments on commit e00ca2a

Please sign in to comment.