Skip to content

Commit

Permalink
Menu: Allow editing of tagset mapping of spaCy's Catalan, Danish, Fre…
Browse files Browse the repository at this point in the history
…nch, Greek (Modern), Macedonian, Norwegian (Bokmål), Portuguese, Russian, Spanish, and Ukrainian part-of-speech taggers
  • Loading branch information
BLKSerene committed Jan 12, 2024
1 parent 84f8747 commit f8bd0a8
Show file tree
Hide file tree
Showing 28 changed files with 518 additions and 94 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
- Work Area: Add Profiler - Lexical Diversity - Brunet's Index / Honoré's statistic

### ✨ Improvements
- Menu: Allow editing of tagset mapping of spaCy's Catalan, Danish, French, Greek (Modern), Macedonian, Norwegian (Bokmål), Portuguese, Russian, Spanish, and Ukrainian part-of-speech taggers
- Utils: Update custom stop word lists

### 📌 Bugfixes
Expand Down
16 changes: 15 additions & 1 deletion wordless/wl_nlp/wl_pos_tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
from wordless.wl_nlp import wl_nlp_utils, wl_word_tokenization
from wordless.wl_utils import wl_conversion

# spaCy part-of-speech taggers whose output is passed through the custom
# (user-editable) universal tagset mapping, unlike the other spaCy taggers.
UNIVERSAL_TAGSETS_SPACY = [
    'spacy_cat',
    'spacy_dan',
    'spacy_fra',
    'spacy_ell',
    'spacy_mkd',
    'spacy_nob',
    'spacy_por',
    'spacy_rus',
    'spacy_spa',
    'spacy_ukr',
]

def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default'):
tokens_tagged = []

Expand Down Expand Up @@ -168,7 +173,16 @@ def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default'):
tokens_tagged.insert(empty_offset, ('', ''))

# Convert to universal POS tags
if not pos_tagger.startswith('spacy_') and not pos_tagger.startswith('stanza_') and tagset == 'universal':
if (
tagset == 'universal'
and (
(
not pos_tagger.startswith('spacy_')
and not pos_tagger.startswith('stanza_')
)
or pos_tagger in UNIVERSAL_TAGSETS_SPACY
)
):
mappings = {
tag: tag_universal
for tag, tag_universal, _, _ in main.settings_custom['pos_tagging']['tagsets']['mapping_settings'][lang][pos_tagger]
Expand Down
84 changes: 67 additions & 17 deletions wordless/wl_settings/wl_settings_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,35 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

import copy

import networkx
from PyQt5.QtCore import QCoreApplication
from PyQt5.QtWidgets import QDesktopWidget

from wordless.wl_settings import wl_settings_global
from wordless.wl_tagsets import (
wl_tagset_universal,
wl_tagset_cat_universal,
wl_tagset_dan_universal,
wl_tagset_eng_penn_treebank,
wl_tagset_eng_universal,
wl_tagset_ell_universal,
wl_tagset_fra_universal,
wl_tagset_jpn_unidic,
wl_tagset_khm_alt,
wl_tagset_kor_mecab,
wl_tagset_lao_seqlabeling,
wl_tagset_lao_yunshan_cup_2020,
wl_tagset_nor_universal,
wl_tagset_por_universal,
wl_tagset_rus_open_corpora,
wl_tagset_rus_russian_national_corpus,
wl_tagset_rus_universal,
wl_tagset_spa_universal,
wl_tagset_tha_blackboard,
wl_tagset_tha_orchid,
wl_tagset_bod_botok,
wl_tagset_ukr_universal,
wl_tagset_vie_underthesea
)
from wordless.wl_utils import wl_misc, wl_paths
Expand Down Expand Up @@ -1580,51 +1591,90 @@ def init_settings_default(main):
},

'mapping_settings': {
'cat': {
'spacy_cat': copy.deepcopy(wl_tagset_cat_universal.tagset_mapping)
},

'dan': {
'spacy_dan': copy.deepcopy(wl_tagset_dan_universal.tagset_mapping)
},

'eng_gb': {
'nltk_perceptron_eng': wl_tagset_eng_penn_treebank.MAPPINGS,
'nltk_perceptron_eng': wl_tagset_eng_penn_treebank.tagset_mapping,
},

'eng_us': {
'nltk_perceptron_eng': wl_tagset_eng_penn_treebank.MAPPINGS,
'nltk_perceptron_eng': wl_tagset_eng_penn_treebank.tagset_mapping,
},

'ell': {
'spacy_ell': copy.deepcopy(wl_tagset_ell_universal.tagset_mapping)
},

'fra': {
'spacy_fra': copy.deepcopy(wl_tagset_fra_universal.tagset_mapping)
},

'jpn': {
'sudachipy_jpn': wl_tagset_jpn_unidic.MAPPINGS
'sudachipy_jpn': wl_tagset_jpn_unidic.tagset_mapping
},

'khm': {
'khmer_nltk_khm': wl_tagset_khm_alt.MAPPINGS
'khmer_nltk_khm': wl_tagset_khm_alt.tagset_mapping
},

'kor': {
'python_mecab_ko_mecab': wl_tagset_kor_mecab.MAPPINGS
'python_mecab_ko_mecab': wl_tagset_kor_mecab.tagset_mapping
},

'lao': {
'laonlp_seqlabeling': wl_tagset_lao_seqlabeling.MAPPINGS,
'laonlp_yunshan_cup_2020': wl_tagset_lao_yunshan_cup_2020.MAPPINGS
'laonlp_seqlabeling': wl_tagset_lao_seqlabeling.tagset_mapping,
'laonlp_yunshan_cup_2020': wl_tagset_lao_yunshan_cup_2020.tagset_mapping
},

'mkd': {
'spacy_mkd': copy.deepcopy(wl_tagset_eng_universal.tagset_mapping)
},

'nob': {
'spacy_nob': copy.deepcopy(wl_tagset_nor_universal.tagset_mapping)
},

'por_br': {
'spacy_por': copy.deepcopy(wl_tagset_por_universal.tagset_mapping)
},

'por_pt': {
'spacy_por': copy.deepcopy(wl_tagset_por_universal.tagset_mapping)
},

'rus': {
'nltk_perceptron_rus': wl_tagset_rus_russian_national_corpus.MAPPINGS,
'pymorphy3_morphological_analyzer': wl_tagset_rus_open_corpora.MAPPINGS
'nltk_perceptron_rus': wl_tagset_rus_russian_national_corpus.tagset_mapping,
'pymorphy3_morphological_analyzer': wl_tagset_rus_open_corpora.tagset_mapping,
'spacy_rus': copy.deepcopy(wl_tagset_rus_universal.tagset_mapping)
},

'spa': {
'spacy_spa': copy.deepcopy(wl_tagset_spa_universal.tagset_mapping)
},

'tha': {
'pythainlp_perceptron_blackboard': wl_tagset_tha_blackboard.MAPPINGS,
'pythainlp_perceptron_orchid': wl_tagset_tha_orchid.MAPPINGS,
'pythainlp_perceptron_pud': wl_tagset_universal.MAPPINGS
'pythainlp_perceptron_blackboard': wl_tagset_tha_blackboard.tagset_mapping,
'pythainlp_perceptron_orchid': wl_tagset_tha_orchid.tagset_mapping,
'pythainlp_perceptron_pud': copy.deepcopy(wl_tagset_eng_universal.tagset_mapping)
},

'bod': {
'botok_bod': wl_tagset_bod_botok.MAPPINGS
'botok_bod': wl_tagset_bod_botok.tagset_mapping
},

'ukr': {
'pymorphy3_morphological_analyzer': wl_tagset_rus_open_corpora.MAPPINGS
'pymorphy3_morphological_analyzer': wl_tagset_rus_open_corpora.tagset_mapping,
'spacy_ukr': copy.deepcopy(wl_tagset_ukr_universal.tagset_mapping)
},

'vie': {
'underthesea_vie': wl_tagset_vie_underthesea.MAPPINGS
'underthesea_vie': wl_tagset_vie_underthesea.tagset_mapping
}
}
}
Expand Down Expand Up @@ -2342,7 +2392,7 @@ def init_settings_default(main):
}

# Tagsets
settings_default['pos_tagging']['tagsets']['preview_settings']['preview_pos_tagger'] = settings_default['pos_tagging']['pos_tagger_settings']['pos_taggers']
settings_default['pos_tagging']['tagsets']['preview_settings']['preview_pos_tagger'] = settings_default['pos_tagging']['pos_tagger_settings']['pos_taggers'].copy()

# Custom stop word lists
for lang in wl_settings_global.SETTINGS_GLOBAL['langs'].values():
Expand Down
27 changes: 18 additions & 9 deletions wordless/wl_settings/wl_settings_pos_tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
from PyQt5.QtCore import pyqtSignal, Qt
from PyQt5.QtGui import QStandardItem
from PyQt5.QtWidgets import (
QCheckBox, QGroupBox, QLabel, QPushButton, QStackedWidget,
QTextEdit, QWidget
QCheckBox, QGroupBox, QLabel, QPlainTextEdit, QPushButton,
QStackedWidget, QTextEdit
)

from wordless.wl_dialogs import wl_dialogs_misc, wl_msg_boxes
Expand Down Expand Up @@ -271,8 +271,6 @@ def __init__(self, main):

self.pos_tag_mappings_loaded = False

self.settings_tagsets = QWidget(self)

# Preview Settings
self.group_box_preview_settings = QGroupBox(self.tr('Preview Settings:'), self)

Expand Down Expand Up @@ -316,7 +314,8 @@ def __init__(self, main):
self.stacked_widget_num_pos_tags.addWidget(self.label_tagsets_num_pos_tags)
self.stacked_widget_num_pos_tags.addWidget(self.label_tagsets_uneditable)

self.table_mappings.setItemDelegate(wl_item_delegates.Wl_Item_Delegate_Combo_Box(
self.table_mappings.setItemDelegateForColumn(0, wl_item_delegates.Wl_Item_Delegate_Uneditable(self.table_mappings))
self.table_mappings.setItemDelegateForColumn(1, wl_item_delegates.Wl_Item_Delegate_Combo_Box(
parent = self.table_mappings,
items = [
'ADJ',
Expand All @@ -338,9 +337,10 @@ def __init__(self, main):
'SYM',
'X'
],
col = 1,
editable = True
))
self.table_mappings.setItemDelegateForColumn(2, wl_item_delegates.Wl_Item_Delegate(self.table_mappings, QPlainTextEdit))
self.table_mappings.setItemDelegateForColumn(3, wl_item_delegates.Wl_Item_Delegate(self.table_mappings, QPlainTextEdit))

self.button_tagsets_reset.setMinimumWidth(100)
self.button_tagsets_reset_all.setMinimumWidth(100)
Expand Down Expand Up @@ -399,7 +399,12 @@ def preview_pos_tagger_changed(self):

preview_pos_tagger = self.settings_custom['preview_settings']['preview_pos_tagger'][preview_lang]

if not preview_pos_tagger.startswith('spacy_') and not preview_pos_tagger.startswith('stanza_'):
if (
(
not preview_pos_tagger.startswith('spacy_')
and not preview_pos_tagger.startswith('stanza_')
) or preview_pos_tagger in wl_pos_tagging.UNIVERSAL_TAGSETS_SPACY
):
self.combo_box_tagsets_lang.setEnabled(False)
self.combo_box_tagsets_pos_tagger.setEnabled(False)
self.button_tagsets_reset.setEnabled(False)
Expand Down Expand Up @@ -464,6 +469,8 @@ def reset_currently_shown_table(self):

for i in range(self.table_mappings.model().rowCount()):
self.table_mappings.model().item(i, 1).setText(mappings[i][1])
self.table_mappings.model().item(i, 2).setText(mappings[i][2])
self.table_mappings.model().item(i, 3).setText(mappings[i][3])

self.table_mappings.enable_updates()

Expand Down Expand Up @@ -520,10 +527,12 @@ def apply_settings(self):
# Mapping Settings
preview_lang = self.settings_custom['preview_settings']['preview_lang']
preview_pos_tagger = self.settings_custom['preview_settings']['preview_pos_tagger'][preview_lang]
mapping = self.settings_custom['mapping_settings'][preview_lang][preview_pos_tagger]

for i in range(self.table_mappings.model().rowCount()):
if not preview_pos_tagger.startswith('spacy_'):
self.settings_custom['mapping_settings'][preview_lang][preview_pos_tagger][i][1] = self.table_mappings.model().item(i, 1).text()
mapping[i][1] = self.table_mappings.model().item(i, 1).text()
mapping[i][2] = self.table_mappings.model().item(i, 2).text()
mapping[i][3] = self.table_mappings.model().item(i, 3).text()

return True

Expand Down
2 changes: 1 addition & 1 deletion wordless/wl_tagsets/wl_tagset_bod_botok.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# ----------------------------------------------------------------------

# Reference: https://github.com/Esukhia/botok/blob/master/botok/vars.py
MAPPINGS = [
tagset_mapping = [
['ADJ', 'ADJ', 'Adjectives', ''],
['ADP', 'ADP', 'Adposition', ''],
['ADV', 'ADV', 'Adverb', ''],
Expand Down
40 changes: 40 additions & 0 deletions wordless/wl_tagsets/wl_tagset_cat_universal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# ----------------------------------------------------------------------
# Wordless: Tagsets - Universal POS tags - Catalan
# Copyright (C) 2018-2024 Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

# Reference: https://universaldependencies.org/ca/pos/
# Each row is [tag, universal POS tag, description, examples].
# Examples prefixed with "[English]" (or another language name) are generic
# Universal Dependencies examples rather than Catalan ones.
tagset_mapping = [
    ['ADJ', 'ADJ', 'Adjective', 'gran, vell, verd, incomprensible\nprimer, segon, tercer'],
    ['ADP', 'ADP', 'Adposition', '[English] in, to, during'],
    ['ADV', 'ADV', 'Adverb', 'molt, bé, exactament, demà, dalt, baix\nInterrogative or exclamative adverbs: on, quan, com, per què\nDemonstrative adverbs: aquí, allí, ara, després\nTotality adverbs: sempre\nNegative adverbs: mai'],
    ['AUX', 'AUX', 'Auxiliary', 'Tense auxiliaries: [English] has (done), is (doing), will (do)\nPassive auxiliaries: [English] was (done), got (done)\nModal auxiliaries: [English] should (do), must (do)\nVerbal copulas: [English] (He) is (a teacher.)\nAgreement auxiliaries: [K’iche’] la (2nd person singular formal), alaq (2nd person plural formal)'],
    ['CONJ', 'CONJ', 'Coordinating/subordinating conjunction', 'See CCONJ and SCONJ'],
    ['CCONJ', 'CCONJ', 'Coordinating conjunction', '[English] and, or, but'],
    ['SCONJ', 'SCONJ', 'Subordinating conjunction', '[English] (I believe) that (he will come.), if, while'],
    ['DET', 'DET', 'Determiner', 'Articles (a closed class indicating definiteness, specificity or givenness): [English] a, an, the\nPossessive determiners (which modify a nominal; note that some languages use PRON for similar words): [Czech] můj, tvůj, jeho, její, náš, váš, jejich\nDemonstrative determiners: [English] (I saw) this (car yesterday.)\nInterrogative determiners: [English] Which (car do you like?)\nRelative determiners: [English] (I wonder) which (car you like.)\nQuantity determiners (quantifiers):\n\tIndefinite: [English] any\n\tUniversal: [English] all\n\tNegative: [English] (We have) no (cars available.)'],
    ['INTJ', 'INTJ', 'Interjection', 'psst, ai, bravo, hola, Sí(, perque…), No(, no ho crec.)'],
    ['NOUN', 'NOUN', 'Noun', 'noia, gat, arbre, aire, bellesa'],
    # The e-mail address is the example given in the Universal Dependencies PROPN documentation
    ['PROPN', 'PROPN', 'Proper noun', '[English] Mary, John, London, NATO, HBO, john.doe@universal.org, http://universaldependencies.org/, 1-800-COMPANY'],
    ['NUM', 'NUM', 'Numeral', '0, 1, 2, 3, 4, 5, 2014, 1000000, 3.14159265359\n11/11/1918, 11:00\n[English] one, two, three, seventy-seven\nk (abbreviation for thousand), m (abbreviation for million)\nI, II, III, IV, V, MMXIV'],
    ['PART', 'PART', 'Particle', 'Possessive marker: [English] ’s\nNegation particle: [English] not; [German] nicht\nQuestion particle: [Japanese] か/ka (adding this particle to the end of a clause turns the clause into a question); [Turkish] mu\nSentence modality: [Czech] ať, kéž, nechť'],
    ['PRON', 'PRON', 'Pronoun', 'Personal pronouns: [English] I, you, he, she, it, we, they\nReflexive pronouns: [English] myself, yourself, himself, herself, itself, ourselves, yourselves, theirselves\nInterrogative pronouns: who, What (do you think?)\nRelative pronouns (unlike SCONJ relativizers, relative pronouns play a nominal role in the relative clause): [English] (a cat) who (eats fish), that, which, (I wonder) what (you think.)\nIndefinite pronouns: [English] somebody, something, anybody, anything\nTotal pronouns: [English] everybody, everything\nNegative pronouns: [English] nobody, nothing\nPossessive pronouns (which usually stand alone as a nominal): [English] mine, yours, his, hers, its, ours, theirs\nAttributive possessive pronouns (in some languages; others use DET for similar words): [English] my, your'],
    ['VERB', 'VERB', 'Verb', '[English] run, eat\n[English] runs, ate\n[English] running, eating'],

    ['PUNCT', 'PUNCT', 'Punctuation', 'Period: .\nComma: ,\nParentheses: ()'],
    ['SYM', 'SYM', 'Symbol', '$, %, §, ©\n+, −, ×, ÷, =, <, >\n:), ♥‿♥, 😝'],
    ['X', 'X', 'Other', '[English] (And then he just) xfgh pdl jklw']
]
40 changes: 40 additions & 0 deletions wordless/wl_tagsets/wl_tagset_dan_universal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# ----------------------------------------------------------------------
# Wordless: Tagsets - Universal POS tags - Danish
# Copyright (C) 2018-2024 Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

# Reference: https://universaldependencies.org/da/pos/
# Each row is [tag, universal POS tag, description, examples].
tagset_mapping = [
    ['ADJ', 'ADJ', 'Adjective', 'gammel/gammelt/gamle, grøn/grønt/grønne, ufatlig/ufatligt/ufatlige'],
    ['ADP', 'ADP', 'Adposition', 'i, på, gennem'],
    ['ADV', 'ADV', 'Adverb', 'meget (vigtigt), væk, (jeg spiser) ikke (rejer), pludselig'],
    ['AUX', 'AUX', 'Auxiliary', 'Tense auxiliary: har (købt)\nModal auxiliary: kunne (tænke)\nPassive auxiliary: blev (fundet)\nCopula: var (grøn), er (en løsning)'],
    ['CONJ', 'CONJ', 'Coordinating/subordinating conjunction', 'See CCONJ and SCONJ'],
    ['CCONJ', 'CCONJ', 'Coordinating conjunction', 'og, eller, men'],
    ['SCONJ', 'SCONJ', 'Subordinating conjunction', 'da, hvis, (konstatere) at (manden har søgt hjælp)'],
    ['DET', 'DET', 'Determiner', 'Articles: en, et, den, det, de\nPossessive determiners: min (bil), deres (holdninger), dit (job)\nNegative determiners: (han har) ingen (empati)'],
    ['INTJ', 'INTJ', 'Interjection', 'Hmm!, Åh!, Hej!'],
    ['NOUN', 'NOUN', 'Noun', 'pige, kat, træ, luft, skønhed'],
    ['PROPN', 'PROPN', 'Proper noun', 'Anna, Otto\nSkåne, USA\nTexaco, Pirelli'],
    ['NUM', 'NUM', 'Numeral', '0, 1, 2, 3, 2014, 1 000 000, 3.000,15, 3,14159265359\net, to, tre, nitten\nI, II, III, IV, V, MMXIV'],
    ['PART', 'PART', 'Particle', '(det er muligt) at (ændre det)'],
    # "hende" is the Danish object form of "hun" ("henne" is the Swedish/Norwegian form)
    ['PRON', 'PRON', 'Pronoun', 'Personal (subject) pronouns: jeg, du, han, hun, det/den, vi, I, de\nPlaceholder personal pronoun: man (kan gå)\nPersonal (object)/reflexive pronouns: mig, dig, ham, hende, sig, os, hinanden\nDemonstrative pronouns: dette (er et svært spørgsmål)\nPossessive pronouns: vores\nInterrogative pronouns: hvad\nRelative pronouns: hvis\nIndefinite pronouns: nogen, noget\nTotality pronouns: alting\nNegative pronouns: ingen (af os)'],
    ['VERB', 'VERB', 'Verb', 'at vise, jeg viser, han viste\nat flyve, vi flyver, de fløj'],

    ['PUNCT', 'PUNCT', 'Punctuation', 'Period: .\nComma: ,\nParentheses: ()'],
    ['SYM', 'SYM', 'Symbol', '§'],
    ['X', 'X', 'Other', 'musik(- og billedprogrammer)']
]
Loading

0 comments on commit f8bd0a8

Please sign in to comment.