Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser
BLKSerene committed Jul 23, 2023
1 parent 6e8be5d commit b1a72d4
Showing 14 changed files with 54 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -23,6 +23,7 @@
- Utils: Add PyThaiNLP's perceptron part-of-speech tagger (Blackboard)
- Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser
- Utils: Add spaCy's Malay word tokenizer
- Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser

### ❌ Removals
- Utils: Remove PyThaiNLP's perceptron part-of-speech tagger (LST20)
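For orientation, everything this commit wires up — sentence recognition, part-of-speech tagging, lemmatization, and dependency parsing — comes from a single spaCy package. A minimal end-to-end sketch, assuming the sl_core_news_sm 3.6.0 wheel listed in requirements_dev.txt below is installed:

```python
import spacy

# Load the small Slovenian news pipeline added by this commit.
nlp = spacy.load('sl_core_news_sm')

doc = nlp('Slovenščina je zahodni južnoslovanski jezik.')

for token in doc:
    # Universal POS tag, lemma, and dependency relation for each token.
    print(token.text, token.pos_, token.lemma_, token.dep_)

# Sentence boundaries are set by the parser (or the senter component).
print([sent.text for sent in doc.sents])
```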
2 changes: 1 addition & 1 deletion doc/doc_eng.md
@@ -733,7 +733,7 @@ Serbian (Cyrillic) |⭕️ |✔|✔|✖️|✔|✖️|✖️
Serbian (Latin) |⭕️ |✔|✔|✖️|✔|✖️|✖️
Sinhala |⭕️ |✔|✖️|✖️|✖️|✖️|✖️
Slovak |⭕️ |✔|✔|✖️|✔|✔|✖️
Slovenian |✔|✔|✔|✖️|✔|✔|✖️
Slovenian |✔|✔|✔|✔|✔|✔|✔
Somali |⭕️ |⭕️ |✖️|✖️|✖️|✔|✖️
Sorbian (Lower) |⭕️ |✔|✖️|✖️|✖️|✖️|✖️
Sorbian (Upper) |⭕️ |✔|✖️|✖️|✖️|✖️|✖️
1 change: 1 addition & 0 deletions requirements_dev.txt
@@ -82,6 +82,7 @@ spacy_models/pl_core_news_sm-3.6.0-py3-none-any.whl
spacy_models/pt_core_news_sm-3.6.0-py3-none-any.whl
spacy_models/ro_core_news_sm-3.6.0-py3-none-any.whl
spacy_models/ru_core_news_sm-3.6.0-py3-none-any.whl
spacy_models/sl_core_news_sm-3.6.0-py3-none-any.whl
spacy_models/sv_core_news_sm-3.6.0-py3-none-any.whl
spacy_models/uk_core_news_sm-3.6.0-py3-none-any.whl
spacy_models/zh_core_web_sm-3.6.0-py3-none-any.whl
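During development the model is installed from a local wheel rather than downloaded; a quick sanity check that the package is visible to spaCy afterwards (a sketch, not part of the diff):

```python
import spacy

# Model packages that spaCy can see in the current environment;
# 'sl_core_news_sm' should be listed once the wheel is installed.
print(spacy.util.get_installed_models())
```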
2 changes: 2 additions & 0 deletions tests/wl_tests_nlp/test_dependency_parsing.py
@@ -139,6 +139,8 @@ def test_dependency_parse(lang, dependency_parser):
assert dependencies == [('Limba', 'limbă', 'nsubj', 4), ('română', 'Limba', 'amod', -1), ('este', 'limbă', 'cop', 2), ('o', 'limbă', 'det', 1), ('limbă', 'limbă', 'ROOT', 0), ('indo-europeană', 'limbă', 'amod', -1), ('din', 'grupul', 'case', 1), ('grupul', 'limbă', 'nmod', -3), ('italic', 'grupul', 'amod', -1), ('și', 'subgrupul', 'cc', 2), ('din', 'subgrupul', 'case', 1), ('subgrupul', 'grupul', 'conj', -4), ('oriental', 'subgrupul', 'amod', -1), ('al', 'limbilor', 'det', 1), ('limbilor', 'subgrupul', 'nmod', -3), ('romanice', 'limbilor', 'amod', -1), ('.', 'limbă', 'punct', -12)]
elif lang == 'rus':
assert dependencies == [('Ру́сский', 'язы́к', 'amod', 1), ('язы́к', 'язык', 'nsubj', 16), ('(', 'ˈruskʲɪi̯', 'punct', 2), ('[', 'ˈruskʲɪi̯', 'punct', 1), ('ˈruskʲɪi̯', 'язы́к', 'appos', -3), ('jɪˈzɨk', 'ˈruskʲɪi̯', 'flat:foreign', -1), (']', 'ˈruskʲɪi̯', 'punct', -2), ('Информация', 'язы́к', 'appos', -6), ('о', 'файле', 'case', 1), ('файле', 'Информация', 'nmod', -2), ('слушать)[~', 'файле', 'nmod', -1), ('3', 'слушать)[~', 'appos', -1), (']', 'Информация', 'punct', -5), ('[', '⇨', 'punct', 1), ('⇨', 'Информация', 'appos', -7), (']', '⇨', 'punct', -1), ('—', 'язык', 'punct', 1), ('язык', 'язык', 'ROOT', 0), ('восточнославянской', 'группы', 'amod', 1), ('группы', 'язык', 'nmod', -2), ('славянской', 'ветви', 'amod', 1), ('ветви', 'группы', 'nmod', -2), ('индоевропейской', 'семьи', 'amod', 2), ('языковой', 'семьи', 'amod', 1), ('семьи', 'ветви', 'nmod', -3), (',', 'язык', 'punct', 2), ('национальный', 'язык', 'amod', 1), ('язык', 'язык', 'appos', -10), ('русского', 'народа', 'amod', 1), ('народа', 'язык', 'nmod', -2), ('.', 'язык', 'punct', -13)]
elif lang == 'slv':
assert dependencies == [('Slovenščina', 'naziv', 'nsubj', 6), ('[', 'sloˈʋenʃtʃina', 'punct', 1), ('sloˈʋenʃtʃina', 'Slovenščina', 'appos', -2), (']', 'sloˈʋenʃtʃina', 'punct', -1), ('je', 'naziv', 'cop', 2), ('združeni', 'naziv', 'amod', 1), ('naziv', 'naziv', 'ROOT', 0), ('za', 'jezik', 'case', 3), ('uradni', 'jezik', 'amod', 2), ('knjižni', 'jezik', 'amod', 1), ('jezik', 'naziv', 'nmod', -4), ('Slovencev', 'jezik', 'nmod', -1), ('in', 'ime', 'cc', 2), ('skupno', 'ime', 'amod', 1), ('ime', 'naziv', 'conj', -8), ('za', 'narečja', 'case', 1), ('narečja', 'ime', 'nmod', -2), ('in', 'govore', 'cc', 1), ('govore', 'narečja', 'conj', -2), (',', 'govorijo', 'punct', 3), ('ki', 'govorijo', 'mark', 2), ('jih', 'govorijo', 'obj', 1), ('govorijo', 'ime', 'acl', -8), ('ali', 'govorili', 'cc', 4), ('so', 'govorili', 'aux', 3), ('jih', 'govorili', 'obj', 2), ('nekoč', 'govorili', 'advmod', 1), ('govorili', 'govorijo', 'conj', -5), ('Slovenci', 'govorili', 'nsubj', -1), ('.', 'naziv', 'punct', -23)]
elif lang == 'spa':
assert dependencies == [('El', 'español', 'det', 1), ('español', 'lengua', 'nsubj', 5), ('o', 'castellano', 'cc', 1), ('castellano', 'español', 'conj', -2), ('es', 'lengua', 'cop', 2), ('una', 'lengua', 'det', 1), ('lengua', 'romance', 'amod', 1), ('romance', 'romance', 'ROOT', 0), ('procedente', 'romance', 'amod', -1), ('del', 'latín', 'case', 1), ('latín', 'procedente', 'nmod', -2), ('hablado', 'latín', 'amod', -1), (',', 'perteneciente', 'punct', 1), ('perteneciente', 'latín', 'amod', -3), ('a', 'familia', 'case', 2), ('la', 'familia', 'det', 1), ('familia', 'perteneciente', 'nmod', -3), ('de', 'lenguas', 'case', 1), ('lenguas', 'familia', 'nmod', -2), ('indoeuropeas', 'lenguas', 'amod', -1), ('.', 'romance', 'punct', -13)]
elif lang == 'swe':
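The expected values above are 4-tuples of (token, head, dependency relation, signed distance from token to head); ROOT tokens are their own head, hence an offset of 0. A sketch of how such tuples can be derived from a spaCy parse (the tuple layout mirrors the test data, not a public Wordless API):

```python
import spacy

nlp = spacy.load('sl_core_news_sm')
doc = nlp('Slovenščina je zahodni južnoslovanski jezik.')

dependencies = [
    # (token text, head text, dependency label, head index minus token index)
    (token.text, token.head.text, token.dep_, token.head.i - token.i)
    for token in doc
]
print(dependencies)
```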
7 changes: 6 additions & 1 deletion tests/wl_tests_nlp/test_lemmatization.py
@@ -310,7 +310,12 @@ def test_lemmatize(lang, lemmatizer):
elif lang == 'slk':
assert lemmas == ['slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský', 'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',', 'horný', 'a', 'dolný', 'lužickou', 'srbčina', 'a', 'kašubčiný', ')', '.']
elif lang == 'slv':
assert lemmas == ['slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'on', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govor', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govorilo', 'Slovenec', '.']
if lemmatizer == 'simplemma_slv':
assert lemmas == ['slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'on', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govor', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govorilo', 'Slovenec', '.']
elif lemmatizer == 'spacy_slv':
assert lemmas == ['slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'biti', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govor', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govoriti', 'Slovenec', '.']
else:
tests_lang_util_skipped = True
elif lang == 'spa':
if lemmatizer == 'simplemma_spa':
assert lemmas == ['el', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', ',', 'perteneciente', 'a', 'el', 'familia', 'de', 'lengua', 'indoeuropeo', '.']
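The new branch is needed because the two Slovenian lemmatizers disagree: per the expected values above, simplemma's lookup lemmatizer yields 'on' for 'je' and 'govorilo' for 'govorili', while spaCy's statistical lemmatizer yields 'biti' and 'govoriti'. A sketch of both call paths (assuming simplemma's single-function API, 0.9+):

```python
import simplemma
import spacy

# Lookup-based lemmatization with simplemma.
print(simplemma.lemmatize('govorili', lang='sl'))

# Statistical lemmatization with the spaCy pipeline.
nlp = spacy.load('sl_core_news_sm')
print([token.lemma_ for token in nlp('so jih nekoč govorili Slovenci')])
```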
3 changes: 3 additions & 0 deletions tests/wl_tests_nlp/test_pos_tagging.py
@@ -193,6 +193,9 @@ def test_pos_tag(lang, pos_tagger):
assert tokens_tagged == tokens_tagged_universal == [('Ру́сский', 'ADJ'), ('язы́к', 'NOUN'), ('(', 'PUNCT'), ('[', 'PUNCT'), ('ˈruskʲɪi̯', 'PUNCT'), ('jɪˈzɨk', 'PROPN'), (']', 'PUNCT'), ('Информация', 'NOUN'), ('о', 'ADP'), ('файле', 'NOUN'), ('слушать)[~', 'PROPN'), ('3', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('⇨', 'PROPN'), (']', 'PUNCT'), ('—', 'PUNCT'), ('язык', 'NOUN'), ('восточнославянской', 'ADJ'), ('группы', 'NOUN'), ('славянской', 'ADJ'), ('ветви', 'NOUN'), ('индоевропейской', 'ADJ'), ('языковой', 'ADJ'), ('семьи', 'NOUN'), (',', 'PUNCT'), ('национальный', 'ADJ'), ('язык', 'NOUN'), ('русского', 'ADJ'), ('народа', 'NOUN'), ('.', 'PUNCT')]
else:
tests_lang_util_skipped = True
elif lang == 'slv':
assert tokens_tagged == [('Slovenščina', 'Ncfsn'), ('[', 'Z'), ('sloˈʋenʃtʃina', 'Ncfsn'), (']', 'Z'), ('je', 'Va-r3s-n'), ('združeni', 'Appmsny'), ('naziv', 'Ncmsan'), ('za', 'Sa'), ('uradni', 'Agpmsay'), ('knjižni', 'Agpmsay'), ('jezik', 'Ncmsan'), ('Slovencev', 'Npmpg'), ('in', 'Cc'), ('skupno', 'Agpnsn'), ('ime', 'Ncnsn'), ('za', 'Sa'), ('narečja', 'Ncnpa'), ('in', 'Cc'), ('govore', 'Ncmpa'), (',', 'Z'), ('ki', 'Cs'), ('jih', 'Pp3mpa--y'), ('govorijo', 'Vmpr3p'), ('ali', 'Cc'), ('so', 'Va-r3p-n'), ('jih', 'Pp3fpa--y'), ('nekoč', 'Rgp'), ('govorili', 'Vmpp-pm'), ('Slovenci', 'Npmpn'), ('.', 'Z')]
assert tokens_tagged_universal == [('Slovenščina', 'NOUN'), ('[', 'PUNCT'), ('sloˈʋenʃtʃina', 'NOUN'), (']', 'PUNCT'), ('je', 'AUX'), ('združeni', 'ADJ'), ('naziv', 'NOUN'), ('za', 'ADP'), ('uradni', 'ADJ'), ('knjižni', 'ADJ'), ('jezik', 'NOUN'), ('Slovencev', 'PROPN'), ('in', 'CCONJ'), ('skupno', 'ADJ'), ('ime', 'NOUN'), ('za', 'ADP'), ('narečja', 'NOUN'), ('in', 'CCONJ'), ('govore', 'NOUN'), (',', 'PUNCT'), ('ki', 'SCONJ'), ('jih', 'PRON'), ('govorijo', 'VERB'), ('ali', 'CCONJ'), ('so', 'AUX'), ('jih', 'PRON'), ('nekoč', 'ADV'), ('govorili', 'VERB'), ('Slovenci', 'PROPN'), ('.', 'PUNCT')]
elif lang == 'spa':
assert tokens_tagged == tokens_tagged_universal == [('El', 'DET'), ('español', 'NOUN'), ('o', 'CCONJ'), ('castellano', 'NOUN'), ('es', 'AUX'), ('una', 'DET'), ('lengua', 'ADJ'), ('romance', 'NOUN'), ('procedente', 'ADJ'), ('del', 'ADP'), ('latín', 'NOUN'), ('hablado', 'ADJ'), (',', 'PUNCT'), ('perteneciente', 'ADJ'), ('a', 'ADP'), ('la', 'DET'), ('familia', 'NOUN'), ('de', 'ADP'), ('lenguas', 'NOUN'), ('indoeuropeas', 'ADJ'), ('.', 'PUNCT')]
elif lang == 'swe':
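The first assertion checks the fine-grained tags exposed as token.tag_ (MULTEXT-East-style morphosyntactic descriptions such as 'Ncfsn'), the second the Universal POS tags from token.pos_. A sketch of reading both off the same document:

```python
import spacy

nlp = spacy.load('sl_core_news_sm')
doc = nlp('Slovenščina je združeni naziv.')

# Fine-grained tag vs. coarse Universal POS tag per token.
print([(token.text, token.tag_) for token in doc])
print([(token.text, token.pos_) for token in doc])
```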
9 changes: 6 additions & 3 deletions tests/wl_tests_nlp/test_sentence_tokenization.py
@@ -190,10 +190,13 @@ def test_sentence_tokenize(lang, sentence_tokenizer):
elif lang == 'rus':
assert sentences == ['Ру́сский язы́к ([ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать)[~ 3][⇨] — язык восточнославянской группы славянской ветви индоевропейской языковой семьи, национальный язык русского народа.', 'Является одним из наиболее распространённых языков мира — восьмым среди всех языков мира по общей численности говорящих[6] и седьмым по численности владеющих им как родным (2022)[3].', 'Русский является также самым распространённым славянским языком[9] и самым распространённым языком в Европе — географически и по числу носителей языка как родного[7].']
elif lang == 'slv':
if sentence_tokenizer == 'nltk_punkt_slv':
if sentence_tokenizer in [
'nltk_punkt_slv',
'spacy_sentence_recognizer_slv'
]:
assert sentences == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,5 (dva in pol) milijona govorcev po svetu, od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov, ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica, pisava imenovana po Ljudevitu Gaju, ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije 1848.', 'Do takrat smo uporabljali bohoričico.']
elif sentence_tokenizer == 'spacy_sentencizer':
assert sentences == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,5 (dva in pol) milijona govorcev po svetu, od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov, ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica, pisava imenovana po Ljudevitu Gaju, ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije 1848. Do takrat smo uporabljali bohoričico.']
elif sentence_tokenizer == 'spacy_dependency_parser_slv':
assert sentences == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,5 (dva in pol) milijona govorcev po svetu, od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov, ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica, pisava imenovana po Ljudevitu Gaju, ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije', '1848.', 'Do takrat smo uporabljali bohoričico.']
else:
tests_lang_util_skipped = True
elif lang == 'spa':
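Slovenian sentences can now be split three ways within spaCy alone, and the expected values show where they diverge on 'revolucije 1848.': the trained sentence recognizer splits after '1848.' exactly like NLTK's Punkt (hence the shared branch), the rule-based sentencizer keeps both sentences together, and the dependency parser — the new Wordless default below — splits '1848.' off as its own sentence. A sketch of selecting each strategy (component names per spaCy 3.x; Wordless may wire these up differently):

```python
import spacy

text = 'Pišemo jo od marčne revolucije 1848. Do takrat smo uporabljali bohoričico.'

# 1) Sentence boundaries from the dependency parser (pipeline default).
nlp_parser = spacy.load('sl_core_news_sm')
print([sent.text for sent in nlp_parser(text).sents])

# 2) Trained sentence recognizer: drop the parser, enable 'senter'.
nlp_senter = spacy.load('sl_core_news_sm', exclude=['parser'])
nlp_senter.enable_pipe('senter')
print([sent.text for sent in nlp_senter(text).sents])

# 3) Rule-based sentencizer on a blank Slovenian pipeline.
nlp_rules = spacy.blank('sl')
nlp_rules.add_pipe('sentencizer')
print([sent.text for sent in nlp_rules(text).sents])
```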
2 changes: 1 addition & 1 deletion tests/wl_tests_settings/test_settings_global.py
@@ -183,7 +183,7 @@ def check_settings_global(self):
langs_supported_spacy_lemmatizers.append(lang_code)

# Languages without data files
langs_supported_spacy_lemmatizers.extend(['fi', 'ja', 'ko', 'uk'])
langs_supported_spacy_lemmatizers.extend(['fi', 'ja', 'ko', 'sl', 'uk'])
langs_supported_spacy_lemmatizers = add_lang_suffixes(langs_supported_spacy_lemmatizers)

# Check for missing and extra languages for spaCy's sentence recognizer / sentencizer
1 change: 1 addition & 0 deletions utils/wl_downloader_ci.py
@@ -40,6 +40,7 @@
spacy.cli.download('pt_core_news_sm')
spacy.cli.download('ro_core_news_sm')
spacy.cli.download('ru_core_news_sm')
spacy.cli.download('sl_core_news_sm')
spacy.cli.download('es_core_news_sm')
spacy.cli.download('sv_core_news_sm')
spacy.cli.download('uk_core_news_sm')
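spacy.cli.download resolves the model name to a wheel on the explosion/spacy-models release page and pip-installs it, which is all the CI job needs; a sketch of downloading and then loading in a follow-up step:

```python
import spacy

# Fetch and pip-install the Slovenian model package in CI.
spacy.cli.download('sl_core_news_sm')

# In a fresh interpreter (an in-process install may not be importable yet),
# the model loads like any other package.
nlp = spacy.load('sl_core_news_sm')
print(nlp.pipe_names)
```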
14 changes: 8 additions & 6 deletions utils/wl_downloader_spacy_models.py
@@ -33,13 +33,15 @@
['Greek' , 'ell', 'el'],
['Italian' , 'ita', 'it'],
['Japanese' , 'jpn', 'ja'],
['Korean' , 'kor', 'ko'],
['Lithuanian' , 'lit', 'lt'],
['Macedonian' , 'mkd', 'mk'],
['Norwegian Bokmål', 'nob', 'nb'],
['Polish' , 'pol', 'pl'],
['Portuguese' , 'por', 'pt'],
['Romanian' , 'ron', 'ro'],
['Russian' , 'rus', 'ru'],
['Slovenian' , 'slv', 'sl'],
['Spanish' , 'spa', 'es'],
['Swedish' , 'swe', 'sv'],
['Ukrainian' , 'ukr', 'uk']
@@ -48,15 +50,15 @@
model_name_zho = model_name_eng = 'core_web_sm'
model_name_cat = model_name_hrv = model_name_dan = model_name_nld = model_name_fin = \
model_name_fra = model_name_deu = model_name_ell = model_name_ita = model_name_jpn = \
model_name_lit = model_name_mkd = model_name_nob = model_name_pol = model_name_por = \
model_name_ron = model_name_rus = model_name_spa = model_name_swe = model_name_ukr = \
'core_news_sm'
model_name_kor = model_name_lit = model_name_mkd = model_name_nob = model_name_pol = \
model_name_por = model_name_ron = model_name_rus = model_name_slv = model_name_spa = \
model_name_swe = model_name_ukr = 'core_news_sm'

model_ver_cat = model_ver_zho = model_ver_hrv = model_ver_dan = model_ver_nld = \
model_ver_eng = model_ver_fin = model_ver_fra = model_ver_deu = model_ver_ell = \
model_ver_ita = model_ver_jpn = model_ver_lit = model_ver_mkd = model_ver_nob = \
model_ver_pol = model_ver_por = model_ver_ron = model_ver_rus = model_ver_spa = \
model_ver_swe = model_ver_ukr = '3.6.0'
model_ver_ita = model_ver_jpn = model_ver_kor = model_ver_lit = model_ver_mkd = \
model_ver_nob = model_ver_pol = model_ver_por = model_ver_ron = model_ver_rus = \
model_ver_slv = model_ver_spa = model_ver_swe = model_ver_ukr = '3.6.0'

# Check updates
for lang_text, lang_code_639_3, lang_code_639_1 in langs:
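The table pairs each language with its ISO 639-1 code, and the per-language model_name/model_ver variables complete the package name; the wheel filenames in requirements_dev.txt follow from the same parts. A sketch of the naming scheme (release URL pattern assumed from the explosion/spacy-models conventions):

```python
lang_code_639_1 = 'sl'
model_name = 'core_news_sm'
model_ver = '3.6.0'

# e.g. 'sl_core_news_sm-3.6.0'
model = f'{lang_code_639_1}_{model_name}-{model_ver}'

url = (
    'https://github.com/explosion/spacy-models/releases/download/'
    f'{model}/{model}-py3-none-any.whl'
)
print(url)
```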
4 changes: 4 additions & 0 deletions utils/wl_packaging.spec
@@ -67,6 +67,7 @@ datas.extend(PyInstaller.utils.hooks.collect_data_files('fi_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('fr_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('it_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('ja_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('ko_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('lt_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('mk_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('nb_core_news_sm'))
@@ -75,6 +76,7 @@ datas.extend(PyInstaller.utils.hooks.collect_data_files('pl_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('pt_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('ro_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('ru_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('sl_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('es_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('sv_core_news_sm'))
datas.extend(PyInstaller.utils.hooks.collect_data_files('uk_core_news_sm'))
@@ -135,6 +137,7 @@ hiddenimports = [
'fr_core_news_sm',
'it_core_news_sm',
'ja_core_news_sm',
'ko_core_news_sm',
'lt_core_news_sm',
'mk_core_news_sm',
'nb_core_news_sm',
@@ -143,6 +146,7 @@
'pt_core_news_sm',
'ro_core_news_sm',
'ru_core_news_sm',
'sl_core_news_sm',
'es_core_news_sm',
'sv_core_news_sm',
'uk_core_news_sm',
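Both entries are needed because spaCy resolves model packages dynamically: spacy.load('sl_core_news_sm') imports the package by name at run time, which PyInstaller's static analysis cannot see, so the data files and the import are declared explicitly. The dynamic load is roughly:

```python
import importlib

# Roughly what spacy.load() does for an installed model package:
# import it by name and call its load() entry point.
module = importlib.import_module('sl_core_news_sm')
nlp = module.load()
```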
1 change: 1 addition & 0 deletions wordless/wl_nlp/wl_nlp_utils.py
@@ -82,6 +82,7 @@ def to_lang_util_texts(main, util_type, util_codes):
'por': 'pt_core_news_sm',
'ron': 'ro_core_news_sm',
'rus': 'ru_core_news_sm',
'slv': 'sl_core_news_sm',
'spa': 'es_core_news_sm',
'swe': 'sv_core_news_sm',
'ukr': 'uk_core_news_sm',
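This table maps Wordless's ISO 639-3 language codes to spaCy package names; a sketch of the lookup with a hypothetical helper (load_spacy_model is illustrative, not a Wordless function):

```python
import spacy

SPACY_MODEL_NAMES = {
    'rus': 'ru_core_news_sm',
    'slv': 'sl_core_news_sm',  # added by this commit
    'spa': 'es_core_news_sm',
}

def load_spacy_model(lang_code_639_3):
    # Resolve the ISO 639-3 code to a spaCy package name and load it.
    return spacy.load(SPACY_MODEL_NAMES[lang_code_639_3])

nlp_slv = load_spacy_model('slv')
```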
8 changes: 5 additions & 3 deletions wordless/wl_settings/wl_settings_default.py
@@ -1204,7 +1204,7 @@ def init_settings_default(main):
'por_pt': 'spacy_dependency_parser_por',
'ron': 'spacy_dependency_parser_ron',
'rus': 'spacy_dependency_parser_rus',
'slv': 'nltk_punkt_slv',
'slv': 'spacy_dependency_parser_slv',
'spa': 'spacy_dependency_parser_spa',
'swe': 'spacy_dependency_parser_swe',
'tha': 'pythainlp_crfcut',
@@ -1291,7 +1291,7 @@ def init_settings_default(main):
'srp_latn': 'spacy_srp',
'sin': 'spacy_sin',
'slk': 'sacremoses_moses',
'slv': 'sacremoses_moses',
'slv': 'spacy_slv',
'dsb': 'spacy_dsb',
'hsb': 'spacy_hsb',
'spa': 'spacy_spa',
@@ -1406,6 +1406,7 @@ def init_settings_default(main):
'por_pt': 'spacy_por',
'ron': 'spacy_ron',
'rus': 'spacy_rus',
'slv': 'spacy_slv',
'spa': 'spacy_spa',
'swe': 'spacy_swe',
'tha': 'pythainlp_perceptron_pud',
@@ -1533,7 +1534,7 @@ def init_settings_default(main):
'srp_cyrl': 'spacy_srp',
'srp_latn': 'simplemma_srp_latn',
'slk': 'simplemma_slk',
'slv': 'simplemma_slv',
'slv': 'spacy_slv',
'spa': 'spacy_spa',
'swa': 'simplemma_swa',
'swe': 'spacy_swe',
@@ -1665,6 +1666,7 @@ def init_settings_default(main):
'por_pt': 'spacy_por',
'ron': 'spacy_ron',
'rus': 'spacy_rus',
'slv': 'spacy_slv',
'spa': 'spacy_spa',
'swe': 'spacy_swe',
'ukr': 'spacy_ukr'