diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cc91f0bb..b88e0334a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ - Utils: Add PyThaiNLP's perceptron part-of-speech tagger (Blackboard) - Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser - Utils: Add spaCy's Malay word tokenizer +- Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser ### ❌ Removals - Utils: Remove PyThaiNLP's perceptron part-of-speech tagger (LST20) diff --git a/doc/doc_eng.md b/doc/doc_eng.md index 6b42c76eb..38c78d74b 100644 --- a/doc/doc_eng.md +++ b/doc/doc_eng.md @@ -733,7 +733,7 @@ Serbian (Cyrillic) |⭕️ |✔|✔|✖️|✔|✖️|✖️ Serbian (Latin) |⭕️ |✔|✔|✖️|✔|✖️|✖️ Sinhala |⭕️ |✔|✖️|✖️|✖️|✖️|✖️ Slovak |⭕️ |✔|✔|✖️|✔|✔|✖️ -Slovenian |✔|✔|✔|✖️|✔|✔|✖️ +Slovenian |✔|✔|✔|✔|✔|✔|✔ Somali |⭕️ |⭕️ |✖️|✖️|✖️|✔|✖️ Sorbian (Lower) |⭕️ |✔|✖️|✖️|✖️|✖️|✖️ Sorbian (Upper) |⭕️ |✔|✖️|✖️|✖️|✖️|✖️ diff --git a/requirements_dev.txt b/requirements_dev.txt index d49933e1a..0a0496d8d 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -82,6 +82,7 @@ spacy_models/pl_core_news_sm-3.6.0-py3-none-any.whl spacy_models/pt_core_news_sm-3.6.0-py3-none-any.whl spacy_models/ro_core_news_sm-3.6.0-py3-none-any.whl spacy_models/ru_core_news_sm-3.6.0-py3-none-any.whl +spacy_models/sl_core_news_sm-3.6.0-py3-none-any.whl spacy_models/sv_core_news_sm-3.6.0-py3-none-any.whl spacy_models/uk_core_news_sm-3.6.0-py3-none-any.whl spacy_models/zh_core_web_sm-3.6.0-py3-none-any.whl diff --git a/tests/wl_tests_nlp/test_dependency_parsing.py b/tests/wl_tests_nlp/test_dependency_parsing.py index 923a33830..1c7f3ee0b 100644 --- a/tests/wl_tests_nlp/test_dependency_parsing.py +++ b/tests/wl_tests_nlp/test_dependency_parsing.py @@ -139,6 +139,8 @@ def test_dependency_parse(lang, dependency_parser): assert dependencies == [('Limba', 'limbă', 'nsubj', 4), ('română', 'Limba', 'amod', -1), ('este', 'limbă', 'cop', 2), ('o', 'limbă', 'det', 1), ('limbă', 'limbă', 'ROOT', 0), ('indo-europeană', 'limbă', 'amod', -1), ('din', 'grupul', 'case', 1), ('grupul', 'limbă', 'nmod', -3), ('italic', 'grupul', 'amod', -1), ('și', 'subgrupul', 'cc', 2), ('din', 'subgrupul', 'case', 1), ('subgrupul', 'grupul', 'conj', -4), ('oriental', 'subgrupul', 'amod', -1), ('al', 'limbilor', 'det', 1), ('limbilor', 'subgrupul', 'nmod', -3), ('romanice', 'limbilor', 'amod', -1), ('.', 'limbă', 'punct', -12)] elif lang == 'rus': assert dependencies == [('Ру́сский', 'язы́к', 'amod', 1), ('язы́к', 'язык', 'nsubj', 16), ('(', 'ˈruskʲɪi̯', 'punct', 2), ('[', 'ˈruskʲɪi̯', 'punct', 1), ('ˈruskʲɪi̯', 'язы́к', 'appos', -3), ('jɪˈzɨk', 'ˈruskʲɪi̯', 'flat:foreign', -1), (']', 'ˈruskʲɪi̯', 'punct', -2), ('Информация', 'язы́к', 'appos', -6), ('о', 'файле', 'case', 1), ('файле', 'Информация', 'nmod', -2), ('слушать)[~', 'файле', 'nmod', -1), ('3', 'слушать)[~', 'appos', -1), (']', 'Информация', 'punct', -5), ('[', '⇨', 'punct', 1), ('⇨', 'Информация', 'appos', -7), (']', '⇨', 'punct', -1), ('—', 'язык', 'punct', 1), ('язык', 'язык', 'ROOT', 0), ('восточнославянской', 'группы', 'amod', 1), ('группы', 'язык', 'nmod', -2), ('славянской', 'ветви', 'amod', 1), ('ветви', 'группы', 'nmod', -2), ('индоевропейской', 'семьи', 'amod', 2), ('языковой', 'семьи', 'amod', 1), ('семьи', 'ветви', 'nmod', -3), (',', 'язык', 'punct', 2), ('национальный', 'язык', 'amod', 1), ('язык', 'язык', 'appos', -10), ('русского', 'народа', 'amod', 1), ('народа', 'язык', 'nmod', -2), ('.', 'язык', 'punct', -13)] + elif lang == 'slv': + assert dependencies == [('Slovenščina', 'naziv', 'nsubj', 6), ('[', 'sloˈʋenʃtʃina', 'punct', 1), ('sloˈʋenʃtʃina', 'Slovenščina', 'appos', -2), (']', 'sloˈʋenʃtʃina', 'punct', -1), ('je', 'naziv', 'cop', 2), ('združeni', 'naziv', 'amod', 1), ('naziv', 'naziv', 'ROOT', 0), ('za', 'jezik', 'case', 3), ('uradni', 'jezik', 'amod', 2), ('knjižni', 'jezik', 'amod', 1), ('jezik', 'naziv', 'nmod', -4), ('Slovencev', 'jezik', 'nmod', -1), ('in', 'ime', 'cc', 2), ('skupno', 'ime', 'amod', 1), ('ime', 'naziv', 'conj', -8), ('za', 'narečja', 'case', 1), ('narečja', 'ime', 'nmod', -2), ('in', 'govore', 'cc', 1), ('govore', 'narečja', 'conj', -2), (',', 'govorijo', 'punct', 3), ('ki', 'govorijo', 'mark', 2), ('jih', 'govorijo', 'obj', 1), ('govorijo', 'ime', 'acl', -8), ('ali', 'govorili', 'cc', 4), ('so', 'govorili', 'aux', 3), ('jih', 'govorili', 'obj', 2), ('nekoč', 'govorili', 'advmod', 1), ('govorili', 'govorijo', 'conj', -5), ('Slovenci', 'govorili', 'nsubj', -1), ('.', 'naziv', 'punct', -23)] elif lang == 'spa': assert dependencies == [('El', 'español', 'det', 1), ('español', 'lengua', 'nsubj', 5), ('o', 'castellano', 'cc', 1), ('castellano', 'español', 'conj', -2), ('es', 'lengua', 'cop', 2), ('una', 'lengua', 'det', 1), ('lengua', 'romance', 'amod', 1), ('romance', 'romance', 'ROOT', 0), ('procedente', 'romance', 'amod', -1), ('del', 'latín', 'case', 1), ('latín', 'procedente', 'nmod', -2), ('hablado', 'latín', 'amod', -1), (',', 'perteneciente', 'punct', 1), ('perteneciente', 'latín', 'amod', -3), ('a', 'familia', 'case', 2), ('la', 'familia', 'det', 1), ('familia', 'perteneciente', 'nmod', -3), ('de', 'lenguas', 'case', 1), ('lenguas', 'familia', 'nmod', -2), ('indoeuropeas', 'lenguas', 'amod', -1), ('.', 'romance', 'punct', -13)] elif lang == 'swe': diff --git a/tests/wl_tests_nlp/test_lemmatization.py b/tests/wl_tests_nlp/test_lemmatization.py index 991b4d070..18a82545e 100644 --- a/tests/wl_tests_nlp/test_lemmatization.py +++ b/tests/wl_tests_nlp/test_lemmatization.py @@ -310,7 +310,12 @@ def test_lemmatize(lang, lemmatizer): elif lang == 'slk': assert lemmas == ['slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský', 'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',', 'horný', 'a', 'dolný', 'lužickou', 'srbčina', 'a', 'kašubčiný', ')', '.'] elif lang == 'slv': - assert lemmas == ['slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'on', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govor', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govorilo', 'Slovenec', '.'] + if lemmatizer == 'simplemma_slv': + assert lemmas == ['slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'on', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govor', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govorilo', 'Slovenec', '.'] + elif lemmatizer == 'spacy_slv': + assert lemmas == ['slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'biti', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govor', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govoriti', 'Slovenec', '.'] + else: + tests_lang_util_skipped = True elif lang == 'spa': if lemmatizer == 'simplemma_spa': assert lemmas == ['el', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', ',', 'perteneciente', 'a', 'el', 'familia', 'de', 'lengua', 'indoeuropeo', '.'] diff --git a/tests/wl_tests_nlp/test_pos_tagging.py b/tests/wl_tests_nlp/test_pos_tagging.py index 7b2a378ef..b7c6620a2 100644 --- a/tests/wl_tests_nlp/test_pos_tagging.py +++ b/tests/wl_tests_nlp/test_pos_tagging.py @@ -193,6 +193,9 @@ def test_pos_tag(lang, pos_tagger): assert tokens_tagged == tokens_tagged_universal == [('Ру́сский', 'ADJ'), ('язы́к', 'NOUN'), ('(', 'PUNCT'), ('[', 'PUNCT'), ('ˈruskʲɪi̯', 'PUNCT'), ('jɪˈzɨk', 'PROPN'), (']', 'PUNCT'), ('Информация', 'NOUN'), ('о', 'ADP'), ('файле', 'NOUN'), ('слушать)[~', 'PROPN'), ('3', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('⇨', 'PROPN'), (']', 'PUNCT'), ('—', 'PUNCT'), ('язык', 'NOUN'), ('восточнославянской', 'ADJ'), ('группы', 'NOUN'), ('славянской', 'ADJ'), ('ветви', 'NOUN'), ('индоевропейской', 'ADJ'), ('языковой', 'ADJ'), ('семьи', 'NOUN'), (',', 'PUNCT'), ('национальный', 'ADJ'), ('язык', 'NOUN'), ('русского', 'ADJ'), ('народа', 'NOUN'), ('.', 'PUNCT')] else: tests_lang_util_skipped = True + elif lang == 'slv': + assert tokens_tagged == [('Slovenščina', 'Ncfsn'), ('[', 'Z'), ('sloˈʋenʃtʃina', 'Ncfsn'), (']', 'Z'), ('je', 'Va-r3s-n'), ('združeni', 'Appmsny'), ('naziv', 'Ncmsan'), ('za', 'Sa'), ('uradni', 'Agpmsay'), ('knjižni', 'Agpmsay'), ('jezik', 'Ncmsan'), ('Slovencev', 'Npmpg'), ('in', 'Cc'), ('skupno', 'Agpnsn'), ('ime', 'Ncnsn'), ('za', 'Sa'), ('narečja', 'Ncnpa'), ('in', 'Cc'), ('govore', 'Ncmpa'), (',', 'Z'), ('ki', 'Cs'), ('jih', 'Pp3mpa--y'), ('govorijo', 'Vmpr3p'), ('ali', 'Cc'), ('so', 'Va-r3p-n'), ('jih', 'Pp3fpa--y'), ('nekoč', 'Rgp'), ('govorili', 'Vmpp-pm'), ('Slovenci', 'Npmpn'), ('.', 'Z')] + assert tokens_tagged_universal == [('Slovenščina', 'NOUN'), ('[', 'PUNCT'), ('sloˈʋenʃtʃina', 'NOUN'), (']', 'PUNCT'), ('je', 'AUX'), ('združeni', 'ADJ'), ('naziv', 'NOUN'), ('za', 'ADP'), ('uradni', 'ADJ'), ('knjižni', 'ADJ'), ('jezik', 'NOUN'), ('Slovencev', 'PROPN'), ('in', 'CCONJ'), ('skupno', 'ADJ'), ('ime', 'NOUN'), ('za', 'ADP'), ('narečja', 'NOUN'), ('in', 'CCONJ'), ('govore', 'NOUN'), (',', 'PUNCT'), ('ki', 'SCONJ'), ('jih', 'PRON'), ('govorijo', 'VERB'), ('ali', 'CCONJ'), ('so', 'AUX'), ('jih', 'PRON'), ('nekoč', 'ADV'), ('govorili', 'VERB'), ('Slovenci', 'PROPN'), ('.', 'PUNCT')] elif lang == 'spa': assert tokens_tagged == tokens_tagged_universal == [('El', 'DET'), ('español', 'NOUN'), ('o', 'CCONJ'), ('castellano', 'NOUN'), ('es', 'AUX'), ('una', 'DET'), ('lengua', 'ADJ'), ('romance', 'NOUN'), ('procedente', 'ADJ'), ('del', 'ADP'), ('latín', 'NOUN'), ('hablado', 'ADJ'), (',', 'PUNCT'), ('perteneciente', 'ADJ'), ('a', 'ADP'), ('la', 'DET'), ('familia', 'NOUN'), ('de', 'ADP'), ('lenguas', 'NOUN'), ('indoeuropeas', 'ADJ'), ('.', 'PUNCT')] elif lang == 'swe': diff --git a/tests/wl_tests_nlp/test_sentence_tokenization.py b/tests/wl_tests_nlp/test_sentence_tokenization.py index 71612a76e..d1862121c 100644 --- a/tests/wl_tests_nlp/test_sentence_tokenization.py +++ b/tests/wl_tests_nlp/test_sentence_tokenization.py @@ -190,10 +190,13 @@ def test_sentence_tokenize(lang, sentence_tokenizer): elif lang == 'rus': assert sentences == ['Ру́сский язы́к ([ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать)[~ 3][⇨] — язык восточнославянской группы славянской ветви индоевропейской языковой семьи, национальный язык русского народа.', 'Является одним из наиболее распространённых языков мира — восьмым среди всех языков мира по общей численности говорящих[6] и седьмым по численности владеющих им как родным (2022)[3].', 'Русский является также самым распространённым славянским языком[9] и самым распространённым языком в Европе — географически и по числу носителей языка как родного[7].'] elif lang == 'slv': - if sentence_tokenizer == 'nltk_punkt_slv': + if sentence_tokenizer in [ + 'nltk_punkt_slv', + 'spacy_sentence_recognizer_slv' + ]: assert sentences == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,5 (dva in pol) milijona govorcev po svetu, od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov, ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica, pisava imenovana po Ljudevitu Gaju, ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije 1848.', 'Do takrat smo uporabljali bohoričico.'] - elif sentence_tokenizer == 'spacy_sentencizer': - assert sentences == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,5 (dva in pol) milijona govorcev po svetu, od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov, ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica, pisava imenovana po Ljudevitu Gaju, ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije 1848. Do takrat smo uporabljali bohoričico.'] + elif sentence_tokenizer == 'spacy_dependency_parser_slv': + assert sentences == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,5 (dva in pol) milijona govorcev po svetu, od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov, ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica, pisava imenovana po Ljudevitu Gaju, ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije', '1848.', 'Do takrat smo uporabljali bohoričico.'] else: tests_lang_util_skipped = True elif lang == 'spa': diff --git a/tests/wl_tests_settings/test_settings_global.py b/tests/wl_tests_settings/test_settings_global.py index a51592682..ea8d24227 100644 --- a/tests/wl_tests_settings/test_settings_global.py +++ b/tests/wl_tests_settings/test_settings_global.py @@ -183,7 +183,7 @@ def check_settings_global(self): langs_supported_spacy_lemmatizers.append(lang_code) # Languages without data files - langs_supported_spacy_lemmatizers.extend(['fi', 'ja', 'ko', 'uk']) + langs_supported_spacy_lemmatizers.extend(['fi', 'ja', 'ko', 'sl', 'uk']) langs_supported_spacy_lemmatizers = add_lang_suffixes(langs_supported_spacy_lemmatizers) # Check for missing and extra languages for spaCy's sentence recognizer / sentencizer diff --git a/utils/wl_downloader_ci.py b/utils/wl_downloader_ci.py index 2f9ea2e7a..a2b9b2636 100644 --- a/utils/wl_downloader_ci.py +++ b/utils/wl_downloader_ci.py @@ -40,6 +40,7 @@ spacy.cli.download('pt_core_news_sm') spacy.cli.download('ro_core_news_sm') spacy.cli.download('ru_core_news_sm') +spacy.cli.download('sl_core_news_sm') spacy.cli.download('es_core_news_sm') spacy.cli.download('sv_core_news_sm') spacy.cli.download('uk_core_news_sm') diff --git a/utils/wl_downloader_spacy_models.py b/utils/wl_downloader_spacy_models.py index 2b977d903..85151c1b9 100644 --- a/utils/wl_downloader_spacy_models.py +++ b/utils/wl_downloader_spacy_models.py @@ -33,6 +33,7 @@ ['Greek' , 'ell', 'el'], ['Italian' , 'ita', 'it'], ['Japanese' , 'jpn', 'ja'], + ['Korean' , 'kor', 'ko'], ['Lithuanian' , 'lit', 'lt'], ['Macedonian' , 'mkd', 'mk'], ['Norwegian Bokmål', 'nob', 'nb'], @@ -40,6 +41,7 @@ ['Portuguese' , 'por', 'pt'], ['Romanian' , 'ron', 'ro'], ['Russian' , 'rus', 'ru'], + ['Slovenian' , 'slv', 'sl'], ['Spanish' , 'spa', 'es'], ['Swedish' , 'swe', 'sv'], ['Ukrainian' , 'ukr', 'uk'] @@ -48,15 +50,15 @@ model_name_zho = model_name_eng = 'core_web_sm' model_name_cat = model_name_hrv = model_name_dan = model_name_nld = model_name_fin = \ model_name_fra = model_name_deu = model_name_ell = model_name_ita = model_name_jpn = \ -model_name_lit = model_name_mkd = model_name_nob = model_name_pol = model_name_por = \ -model_name_ron = model_name_rus = model_name_spa = model_name_swe = model_name_ukr = \ -'core_news_sm' +model_name_kor = model_name_lit = model_name_mkd = model_name_nob = model_name_pol = \ +model_name_por = model_name_ron = model_name_rus = model_name_slv = model_name_spa = \ +model_name_swe = model_name_ukr = 'core_news_sm' model_ver_cat = model_ver_zho = model_ver_hrv = model_ver_dan = model_ver_nld = \ model_ver_eng = model_ver_fin = model_ver_fra = model_ver_deu = model_ver_ell = \ -model_ver_ita = model_ver_jpn = model_ver_lit = model_ver_mkd = model_ver_nob = \ -model_ver_pol = model_ver_por = model_ver_ron = model_ver_rus = model_ver_spa = \ -model_ver_swe = model_ver_ukr = '3.6.0' +model_ver_ita = model_ver_jpn = model_ver_kor = model_ver_lit = model_ver_mkd = \ +model_ver_nob = model_ver_pol = model_ver_por = model_ver_ron = model_ver_rus = \ +model_ver_slv = model_ver_spa = model_ver_swe = model_ver_ukr = '3.6.0' # Check updates for lang_text, lang_code_639_3, lang_code_639_1 in langs: diff --git a/utils/wl_packaging.spec b/utils/wl_packaging.spec index 131b626a9..679b062e9 100644 --- a/utils/wl_packaging.spec +++ b/utils/wl_packaging.spec @@ -67,6 +67,7 @@ datas.extend(PyInstaller.utils.hooks.collect_data_files('fi_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('fr_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('it_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('ja_core_news_sm')) +datas.extend(PyInstaller.utils.hooks.collect_data_files('ko_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('lt_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('mk_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('nb_core_news_sm')) @@ -75,6 +76,7 @@ datas.extend(PyInstaller.utils.hooks.collect_data_files('pl_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('pt_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('ro_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('ru_core_news_sm')) +datas.extend(PyInstaller.utils.hooks.collect_data_files('sl_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('es_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('sv_core_news_sm')) datas.extend(PyInstaller.utils.hooks.collect_data_files('uk_core_news_sm')) @@ -135,6 +137,7 @@ hiddenimports = [ 'fr_core_news_sm', 'it_core_news_sm', 'ja_core_news_sm', + 'ko_core_news_sm', 'lt_core_news_sm', 'mk_core_news_sm', 'nb_core_news_sm', @@ -143,6 +146,7 @@ hiddenimports = [ 'pt_core_news_sm', 'ro_core_news_sm', 'ru_core_news_sm', + 'sl_core_news_sm', 'es_core_news_sm', 'sv_core_news_sm', 'uk_core_news_sm', diff --git a/wordless/wl_nlp/wl_nlp_utils.py b/wordless/wl_nlp/wl_nlp_utils.py index c42b41859..303727df6 100644 --- a/wordless/wl_nlp/wl_nlp_utils.py +++ b/wordless/wl_nlp/wl_nlp_utils.py @@ -82,6 +82,7 @@ def to_lang_util_texts(main, util_type, util_codes): 'por': 'pt_core_news_sm', 'ron': 'ro_core_news_sm', 'rus': 'ru_core_news_sm', + 'slv': 'sl_core_news_sm', 'spa': 'es_core_news_sm', 'swe': 'sv_core_news_sm', 'ukr': 'uk_core_news_sm', diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index fb8fccccf..ffccdf45b 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -1204,7 +1204,7 @@ def init_settings_default(main): 'por_pt': 'spacy_dependency_parser_por', 'ron': 'spacy_dependency_parser_ron', 'rus': 'spacy_dependency_parser_rus', - 'slv': 'nltk_punkt_slv', + 'slv': 'spacy_dependency_parser_slv', 'spa': 'spacy_dependency_parser_spa', 'swe': 'spacy_dependency_parser_swe', 'tha': 'pythainlp_crfcut', @@ -1291,7 +1291,7 @@ def init_settings_default(main): 'srp_latn': 'spacy_srp', 'sin': 'spacy_sin', 'slk': 'sacremoses_moses', - 'slv': 'sacremoses_moses', + 'slv': 'spacy_slv', 'dsb': 'spacy_dsb', 'hsb': 'spacy_hsb', 'spa': 'spacy_spa', @@ -1406,6 +1406,7 @@ def init_settings_default(main): 'por_pt': 'spacy_por', 'ron': 'spacy_ron', 'rus': 'spacy_rus', + 'slv': 'spacy_slv', 'spa': 'spacy_spa', 'swe': 'spacy_swe', 'tha': 'pythainlp_perceptron_pud', @@ -1533,7 +1534,7 @@ def init_settings_default(main): 'srp_cyrl': 'spacy_srp', 'srp_latn': 'simplemma_srp_latn', 'slk': 'simplemma_slk', - 'slv': 'simplemma_slv', + 'slv': 'spacy_slv', 'spa': 'spacy_spa', 'swa': 'simplemma_swa', 'swe': 'spacy_swe', @@ -1665,6 +1666,7 @@ def init_settings_default(main): 'por_pt': 'spacy_por', 'ron': 'spacy_ron', 'rus': 'spacy_rus', + 'slv': 'spacy_slv', 'spa': 'spacy_spa', 'swe': 'spacy_swe', 'ukr': 'spacy_ukr' diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index 3878ae360..34daab874 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -410,6 +410,7 @@ def init_settings_global(): _tr('init_settings_global', 'spaCy - Portuguese dependency parser'): 'spacy_dependency_parser_por', _tr('init_settings_global', 'spaCy - Romanian dependency parser'): 'spacy_dependency_parser_ron', _tr('init_settings_global', 'spaCy - Russian dependency parser'): 'spacy_dependency_parser_rus', + _tr('init_settings_global', 'spaCy - Slovenian dependency parser'): 'spacy_dependency_parser_slv', _tr('init_settings_global', 'spaCy - Spanish dependency parser'): 'spacy_dependency_parser_spa', _tr('init_settings_global', 'spaCy - Swedish dependency parser'): 'spacy_dependency_parser_swe', _tr('init_settings_global', 'spaCy - Ukrainian dependency parser'): 'spacy_dependency_parser_ukr', @@ -434,6 +435,7 @@ def init_settings_global(): _tr('init_settings_global', 'spaCy - Portuguese sentence recognizer'): 'spacy_sentence_recognizer_por', _tr('init_settings_global', 'spaCy - Romanian sentence recognizer'): 'spacy_sentence_recognizer_ron', _tr('init_settings_global', 'spaCy - Russian sentence recognizer'): 'spacy_sentence_recognizer_rus', + _tr('init_settings_global', 'spaCy - Slovenian sentence recognizer'): 'spacy_sentence_recognizer_slv', _tr('init_settings_global', 'spaCy - Spanish sentence recognizer'): 'spacy_sentence_recognizer_spa', _tr('init_settings_global', 'spaCy - Swedish sentence recognizer'): 'spacy_sentence_recognizer_swe', _tr('init_settings_global', 'spaCy - Ukrainian sentence recognizer'): 'spacy_sentence_recognizer_ukr', @@ -630,6 +632,7 @@ def init_settings_global(): _tr('init_settings_global', 'spaCy - Portuguese part-of-speech tagger'): 'spacy_por', _tr('init_settings_global', 'spaCy - Romanian part-of-speech tagger'): 'spacy_ron', _tr('init_settings_global', 'spaCy - Russian part-of-speech tagger'): 'spacy_rus', + _tr('init_settings_global', 'spaCy - Slovenian part-of-speech tagger'): 'spacy_slv', _tr('init_settings_global', 'spaCy - Spanish part-of-speech tagger'): 'spacy_spa', _tr('init_settings_global', 'spaCy - Swedish part-of-speech tagger'): 'spacy_swe', _tr('init_settings_global', 'spaCy - Ukrainian part-of-speech tagger'): 'spacy_ukr', @@ -721,6 +724,7 @@ def init_settings_global(): _tr('init_settings_global', 'spaCy - Romanian lemmatizer'): 'spacy_ron', _tr('init_settings_global', 'spaCy - Russian lemmatizer'): 'spacy_rus', _tr('init_settings_global', 'spaCy - Serbian lemmatizer'): 'spacy_srp', + _tr('init_settings_global', 'spaCy - Slovenian lemmatizer'): 'spacy_slv', _tr('init_settings_global', 'spaCy - Spanish lemmatizer'): 'spacy_spa', _tr('init_settings_global', 'spaCy - Swedish lemmatizer'): 'spacy_swe', _tr('init_settings_global', 'spaCy - Tagalog lemmatizer'): 'spacy_tgl', @@ -852,6 +856,7 @@ def init_settings_global(): _tr('init_settings_global', 'spaCy - Portuguese dependency parser'): 'spacy_por', _tr('init_settings_global', 'spaCy - Romanian dependency parser'): 'spacy_ron', _tr('init_settings_global', 'spaCy - Russian dependency parser'): 'spacy_rus', + _tr('init_settings_global', 'spaCy - Slovenian dependency parser'): 'spacy_slv', _tr('init_settings_global', 'spaCy - Spanish dependency parser'): 'spacy_spa', _tr('init_settings_global', 'spaCy - Swedish dependency parser'): 'spacy_swe', _tr('init_settings_global', 'spaCy - Ukrainian dependency parser'): 'spacy_ukr' @@ -1023,7 +1028,8 @@ def init_settings_global(): 'slv': [ 'nltk_punkt_slv', - 'spacy_sentencizer' + 'spacy_dependency_parser_slv', + 'spacy_sentence_recognizer_slv' ], 'spa': [ @@ -1586,6 +1592,7 @@ def init_settings_global(): 'spacy_rus' ], + 'slv': ['spacy_slv'], 'spa': ['spacy_spa'], 'swe': ['spacy_swe'], @@ -1784,7 +1791,11 @@ def init_settings_global(): 'srp_cyrl': ['spacy_srp'], 'srp_latn': ['simplemma_srp_latn'], 'slk': ['simplemma_slk'], - 'slv': ['simplemma_slv'], + + 'slv': [ + 'simplemma_slv', + 'spacy_slv' + ], 'spa': [ 'simplemma_spa', @@ -2058,6 +2069,7 @@ def init_settings_global(): 'por_pt': ['spacy_por'], 'ron': ['spacy_ron'], 'rus': ['spacy_rus'], + 'slv': ['spacy_slv'], 'spa': ['spacy_spa'], 'swe': ['spacy_swe'], 'ukr': ['spacy_ukr']