diff --git a/ACKNOWLEDGMENTS.md b/ACKNOWLEDGMENTS.md index 763ed78bf..9ae76b1e4 100644 --- a/ACKNOWLEDGMENTS.md +++ b/ACKNOWLEDGMENTS.md @@ -46,7 +46,7 @@ As Wordless stands on the shoulders of giants, I hereby extend my sincere gratit 21|[Sacremoses](https://github.com/alvations/sacremoses) |0.0.53|Liling Tan|[MIT](https://github.com/alvations/sacremoses/blob/master/LICENSE) 22|[SciPy](https://scipy.org/scipylib/) |1.10.1|SciPy Developers|[BSD-3-Clause](https://github.com/scipy/scipy/blob/main/LICENSE.txt) 23|[simplemma](https://github.com/adbar/simplemma) |0.9.1 |Adrien Barbaresi|[MIT](https://github.com/adbar/simplemma/blob/main/LICENSE) -24|[spaCy](https://spacy.io/) |3.5.1 |Matthew Honnibal, Ines Montani, Sofie Van Landeghem,
Adriane Boyd, Paul O'Leary McCann|[MIT](https://github.com/explosion/spaCy/blob/master/LICENSE) +24|[spaCy](https://spacy.io/) |3.6.0 |Matthew Honnibal, Ines Montani, Sofie Van Landeghem,
Adriane Boyd, Paul O'Leary McCann|[MIT](https://github.com/explosion/spaCy/blob/master/LICENSE) 25|[spacy-pkuseg](https://github.com/explosion/spacy-pkuseg) |0.0.32|Ruixuan Luo (罗睿轩), Jingjing Xu (许晶晶),
Xuancheng Ren (任宣丞), Yi Zhang (张艺),
Zhiyuan Zhang (张之远), Bingzhen Wei (位冰镇),
Xu Sun (孙栩)
Adriane Boyd, Ines Montani|[MIT](https://github.com/explosion/spacy-pkuseg/blob/master/LICENSE) 26|[stopword](https://github.com/fergiemcdowall/stopword) |2.0.5 |Fergus McDowall|[MIT](https://github.com/fergiemcdowall/stopword/blob/master/LICENSE) 27|[SudachiPy](https://github.com/WorksApplications/sudachi.rs) |0.6.7 |Works Applications Co., Ltd.|[Apache-2.0](https://github.com/WorksApplications/sudachi.rs/blob/develop/LICENSE) diff --git a/ACKNOWLEDGMENTS_zho_cn.md b/ACKNOWLEDGMENTS_zho_cn.md index 2d6a836a0..18e5ea921 100644 --- a/ACKNOWLEDGMENTS_zho_cn.md +++ b/ACKNOWLEDGMENTS_zho_cn.md @@ -46,7 +46,7 @@ 21|[Sacremoses](https://github.com/alvations/sacremoses) |0.0.53|Liling Tan|[MIT](https://github.com/alvations/sacremoses/blob/master/LICENSE) 22|[SciPy](https://scipy.org/scipylib/) |1.10.1|SciPy 开发人员|[BSD-3-Clause](https://github.com/scipy/scipy/blob/main/LICENSE.txt) 23|[simplemma](https://github.com/adbar/simplemma) |0.9.1 |Adrien Barbaresi|[MIT](https://github.com/adbar/simplemma/blob/main/LICENSE) -24|[spaCy](https://spacy.io/) |3.5.1 |Matthew Honnibal, Ines Montani, Sofie Van Landeghem,
Adriane Boyd, Paul O'Leary McCann|[MIT](https://github.com/explosion/spaCy/blob/master/LICENSE) +24|[spaCy](https://spacy.io/) |3.6.0 |Matthew Honnibal, Ines Montani, Sofie Van Landeghem,
Adriane Boyd, Paul O'Leary McCann|[MIT](https://github.com/explosion/spaCy/blob/master/LICENSE) 25|[spacy-pkuseg](https://github.com/explosion/spacy-pkuseg) |0.0.32|罗睿轩, 许晶晶, 任宣丞, 张艺, 张之远, 位冰镇, 孙栩
Adriane Boyd, Ines Montani|[MIT](https://github.com/explosion/spacy-pkuseg/blob/master/LICENSE) 26|[stopword](https://github.com/fergiemcdowall/stopword) |2.0.5 |Fergus McDowall|[MIT](https://github.com/fergiemcdowall/stopword/blob/master/LICENSE) 27|[SudachiPy](https://github.com/WorksApplications/sudachi.rs) |0.6.7 |Works Applications Co., Ltd.|[Apache-2.0](https://github.com/WorksApplications/sudachi.rs/blob/develop/LICENSE) diff --git a/ACKNOWLEDGMENTS_zho_tw.md b/ACKNOWLEDGMENTS_zho_tw.md index 5a565c5b2..aadcbc902 100644 --- a/ACKNOWLEDGMENTS_zho_tw.md +++ b/ACKNOWLEDGMENTS_zho_tw.md @@ -46,7 +46,7 @@ 21|[Sacremoses](https://github.com/alvations/sacremoses) |0.0.53|Liling Tan|[MIT](https://github.com/alvations/sacremoses/blob/master/LICENSE) 22|[SciPy](https://scipy.org/scipylib/) |1.10.1|SciPy 開發人員|[BSD-3-Clause](https://github.com/scipy/scipy/blob/main/LICENSE.txt) 23|[simplemma](https://github.com/adbar/simplemma) |0.9.1 |Adrien Barbaresi|[MIT](https://github.com/adbar/simplemma/blob/main/LICENSE) -24|[spaCy](https://spacy.io/) |3.5.1 |Matthew Honnibal, Ines Montani, Sofie Van Landeghem,
Adriane Boyd, Paul O'Leary McCann|[MIT](https://github.com/explosion/spaCy/blob/master/LICENSE) +24|[spaCy](https://spacy.io/) |3.6.0 |Matthew Honnibal, Ines Montani, Sofie Van Landeghem,
Adriane Boyd, Paul O'Leary McCann|[MIT](https://github.com/explosion/spaCy/blob/master/LICENSE) 25|[spacy-pkuseg](https://github.com/explosion/spacy-pkuseg) |0.0.32|羅睿軒, 許晶晶, 任宣丞, 張藝, 張之遠, 位冰鎮, 孫栩
Adriane Boyd, Ines Montani|[MIT](https://github.com/explosion/spacy-pkuseg/blob/master/LICENSE) 26|[stopword](https://github.com/fergiemcdowall/stopword) |2.0.5 |Fergus McDowall|[MIT](https://github.com/fergiemcdowall/stopword/blob/master/LICENSE) 27|[SudachiPy](https://github.com/WorksApplications/sudachi.rs) |0.6.7 |Works Applications Co., Ltd.|[Apache-2.0](https://github.com/WorksApplications/sudachi.rs/blob/develop/LICENSE) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14241c887..24324a47d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ - Dependencies: Upgrade Charset Normalizer to 3.2.0 - Dependencies: Upgrade PyThaiNLP to 4.0.2 - Dependencies: Upgrade Requests to 2.31.0 +- Dependencies: Upgrade spaCy to 3.6.0 - Dependencies: Upgrade wordcloud to 1.9.2 ## [3.2.0](https://github.com/BLKSerene/Wordless/releases/tag/3.2.0) - 03/20/2023 diff --git a/requirements_tests.txt b/requirements_tests.txt index bcfae8e0c..2f360d225 100644 --- a/requirements_tests.txt +++ b/requirements_tests.txt @@ -40,7 +40,7 @@ pymorphy3-dicts-ru == 2.4.417150.4580142 pymorphy3-dicts-uk == 2.4.1.1.1663094765 ## spaCy -spacy == 3.5.1 +spacy == 3.6.0 spacy-lookups-data == 1.0.3 spacy-pkuseg == 0.0.32 diff --git a/tests/wl_tests_nlp/test_dependency_parsing.py b/tests/wl_tests_nlp/test_dependency_parsing.py index 032fc8b3e..075cf78af 100644 --- a/tests/wl_tests_nlp/test_dependency_parsing.py +++ b/tests/wl_tests_nlp/test_dependency_parsing.py @@ -122,7 +122,7 @@ def test_dependency_parse(lang, dependency_parser): elif lang == 'ita': assert dependencies == [('È', 'classificato', 'aux:pass', 1), ('classificato', 'classificato', 'ROOT', 0), ('al', '23º', 'case', 1), ('23º', 'posto', 'nummod', 1), ('posto', 'classificato', 'obl', -3), ('tra', 'lingue', 'case', 2), ('le', 'lingue', 'det', 1), ('lingue', 'posto', 'nmod', -3), ('per', 'numero', 'case', 1), ('numero', 'lingue', 'nmod', -2), ('di', 'parlanti', 'case', 1), ('parlanti', 'numero', 'nmod', -2), ('nel', 'mondo', 'case', 1), ('mondo', 'parlanti', 'nmod', -2), ('e', 'utilizzato', 'cc', 6), (',', 'utilizzato', 'punct', 5), ('in', 'Italia', 'case', 1), ('Italia', 'utilizzato', 'obl', 3), (',', 'Italia', 'punct', -1), ('è', 'utilizzato', 'aux:pass', 1), ('utilizzato', 'classificato', 'conj', -19), ('da', 'milioni', 'case', 3), ('circa', '58', 'advmod', 1), ('58', 'milioni', 'nummod', 1), ('milioni', 'utilizzato', 'obl', -4), ('di', 'residenti.[2', 'case', 1), ('residenti.[2', 'milioni', 'nmod', -2), (']', 'utilizzato', 'punct', -7)] elif lang == 'jpn': - assert dependencies == [('日本', '語', 'compound', 1), ('語', 'ご', 'nmod', 3), ('(', 'ご', 'compound', 2), ('にほん', 'ご', 'compound', 1), ('ご', '2', 'nmod', 6), ('、', 'ご', 'punct', -1), ('にっぽん', '2', 'cc', 4), ('ご', '2', 'compound', 3), ('[', '2', 'punct', 2), ('注', '2', 'compound', 1), ('2', ')', 'nmod', 2), (']', '2', 'punct', -1), (')', '言語', 'nsubj', 35), ('は', ')', 'case', -1), ('、', ')', 'punct', -2), ('日本', '内', 'compound', 2), ('国', '内', 'compound', 1), ('内', '領', 'nmod', 6), ('や', '内', 'case', -1), ('、', '内', 'punct', -2), ('かつて', '領', 'advmod', 3), ('の', 'かつて', 'case', -1), ('日本', '領', 'compound', 1), ('領', '国', 'acl', 3), ('だっ', '領', 'cop', -1), ('た', '領', 'aux', -2), ('国', '言語', 'nmod', 21), ('、', '国', 'punct', -1), ('そして', '使用', 'cc', 14), ('国外', '移民', 'compound', 1), ('移民', '者', 'nmod', 3), ('や', '移民', 'case', -1), ('移住', '者', 'compound', 1), ('者', '含む', 'obj', 2), ('を', '者', 'case', -1), ('含む', '同士', 'acl', 3), ('日本', '同士', 'compound', 2), ('人', '同士', 'compound', 1), ('同士', '間', 'nmod', 2), ('の', '同士', 'case', -1), ('間', '使用', 'obl', 2), ('で', '間', 'case', -1), ('使用', '言語', 'acl', 5), ('さ', '使用', 'aux', -1), ('れ', '使用', 'aux', -2), ('て', '使用', 'mark', -3), ('いる', 'て', 'fixed', -1), ('言語', '言語', 'ROOT', 0), ('。', '言語', 'punct', -1)] + assert dependencies == [('日本', 'ご', 'compound', 4), ('語', 'ご', 'compound', 3), ('(', 'ご', 'compound', 2), ('にほん', 'ご', 'compound', 1), ('ご', ')', 'nmod', 8), ('、', 'ご', 'punct', -1), ('にっぽん', 'ご', 'compound', 1), ('ご', ')', 'compound', 5), ('[', ')', 'punct', 4), ('注', ')', 'compound', 3), ('2', ')', 'compound', 2), (']', ')', 'punct', 1), (')', '言語', 'nsubj', 35), ('は', ')', 'case', -1), ('、', ')', 'punct', -2), ('日本', '内', 'compound', 2), ('国', '内', 'compound', 1), ('内', '領', 'nmod', 6), ('や', '内', 'case', -1), ('、', '内', 'punct', -2), ('かつて', '領', 'advmod', 3), ('の', 'かつて', 'case', -1), ('日本', '領', 'compound', 1), ('領', '国', 'acl', 3), ('だっ', '領', 'cop', -1), ('た', '領', 'aux', -2), ('国', '使用', 'obl', 16), ('、', '国', 'punct', -1), ('そして', '使用', 'cc', 14), ('国外', '移民', 'compound', 1), ('移民', '者', 'nmod', 3), ('や', '移民', 'case', -1), ('移住', '者', 'compound', 1), ('者', '含む', 'obj', 2), ('を', '者', 'case', -1), ('含む', '同士', 'acl', 3), ('日本', '同士', 'compound', 2), ('人', '同士', 'compound', 1), ('同士', '間', 'nmod', 2), ('の', '同士', 'case', -1), ('間', '使用', 'obl', 2), ('で', '間', 'case', -1), ('使用', '言語', 'acl', 5), ('さ', '使用', 'aux', -1), ('れ', '使用', 'aux', -2), ('て', '使用', 'mark', -3), ('いる', 'て', 'fixed', -1), ('言語', '言語', 'ROOT', 0), ('。', '言語', 'punct', -1)] elif lang == 'lit': assert dependencies == [('Lietuvių', 'kalba', 'nmod', 1), ('kalba', 'kalba', 'ROOT', 0), ('–', 'kalba', 'punct', 7), ('iš', 'prokalbės', 'case', 2), ('baltų', 'iš', 'advmod:emph', -1), ('prokalbės', 'kilusi', 'obl:arg', 1), ('kilusi', 'kalba', 'acl', 3), ('lietuvių', 'tautos', 'nmod', 1), ('tautos', 'kalba', 'nmod', 1), ('kalba', 'kalba', 'ROOT', 0), (',', 'kuri', 'punct', 1), ('kuri', 'kalba', 'conj', -2), ('Lietuvoje', 'kuri', 'advmod:emph', -1), ('yra', 'kuri', 'advmod:emph', -2), ('valstybinė', 'kuri', 'advmod:emph', -3), (',', 'Sąjungoje', 'punct', 3), ('o', 'Sąjungoje', 'cc', 2), ('Europos', 'Sąjungoje', 'nmod', 1), ('Sąjungoje', 'valstybinė', 'conj', -4), ('–', 'viena', 'punct', 1), ('viena', 'kuri', 'appos', -9), ('iš', 'kalbų', 'case', 2), ('oficialiųjų', 'kalbų', 'acl', 1), ('kalbų', 'viena', 'obl:arg', -3), ('.', '.', 'ROOT', 0)] elif lang == 'mkd': @@ -140,7 +140,7 @@ def test_dependency_parse(lang, dependency_parser): elif lang == 'spa': assert dependencies == [('El', 'español', 'det', 1), ('español', 'lengua', 'nsubj', 5), ('o', 'castellano', 'cc', 1), ('castellano', 'español', 'conj', -2), ('es', 'lengua', 'cop', 2), ('una', 'lengua', 'det', 1), ('lengua', 'romance', 'amod', 1), ('romance', 'romance', 'ROOT', 0), ('procedente', 'romance', 'amod', -1), ('del', 'latín', 'case', 1), ('latín', 'procedente', 'nmod', -2), ('hablado', 'latín', 'amod', -1), (',', 'perteneciente', 'punct', 1), ('perteneciente', 'latín', 'amod', -3), ('a', 'familia', 'case', 2), ('la', 'familia', 'det', 1), ('familia', 'perteneciente', 'nmod', -3), ('de', 'lenguas', 'case', 1), ('lenguas', 'familia', 'nmod', -2), ('indoeuropeas', 'lenguas', 'amod', -1), ('.', 'romance', 'punct', -13)] elif lang == 'swe': - assert dependencies == [('Svenska', 'språk', 'nsubj', 11), ('(', 'Svenska', 'punct', -1), ('svenska', 'Svenska', 'appos', -2), ('\u2009', 'svenska', 'dep', -1), ('(', 'svenska', 'punct', -2), ('info', 'svenska', 'appos', -3), (')', 'svenska', 'punct', -4), (')', 'Svenska', 'punct', -7), ('är', 'språk', 'cop', 3), ('ett', 'språk', 'det', 2), ('östnordiskt', 'språk', 'amod', 1), ('språk', 'språk', 'ROOT', 0), ('som', 'talas', 'nsubj:pass', 1), ('talas', 'språk', 'acl:relcl', -2), ('av', 'personer', 'case', 4), ('ungefär', 'tio', 'advmod', 1), ('tio', 'miljoner', 'nummod', 1), ('miljoner', 'personer', 'nmod', 1), ('personer', 'talas', 'obl', -5), ('främst', 'talas', 'advmod', -6), ('i', 'Sverige', 'case', 1), ('Sverige', 'talas', 'obl', -8), ('där', 'har', 'advmod', 2), ('språket', 'har', 'nsubj', 1), ('har', 'språk', 'acl:relcl', -13), ('en', 'ställning', 'det', 2), ('dominant', 'ställning', 'amod', 1), ('ställning', 'har', 'obj', -3), ('som', 'huvudspråk', 'mark', 1), ('huvudspråk', 'ställning', 'appos', -2), (',', 'nationalspråket', 'punct', 6), ('men', 'nationalspråket', 'cc', 5), ('även', 'nationalspråket', 'advmod', 4), ('som', 'även', 'fixed', -1), ('det', 'nationalspråket', 'det', 2), ('ena', 'nationalspråket', 'amod', 1), ('nationalspråket', 'huvudspråk', 'conj', -7), ('i', 'Finland', 'case', 1), ('Finland', 'nationalspråket', 'nmod', -2), ('och', 'språk', 'cc', 4), ('som', 'språk', 'mark', 3), ('enda', 'språk', 'amod', 2), ('officiella', 'språk', 'amod', 1), ('språk', 'Finland', 'conj', -5), ('på', 'Åland', 'case', 1), ('Åland', 'språk', 'nmod', -2), ('.', 'språk', 'punct', -35)] + assert dependencies == [('Svenska', 'språk', 'amod', 11), ('(', 'Svenska', 'punct', -1), ('svenska', 'Svenska', 'appos', -2), ('\u2009', 'svenska', 'dep', -1), ('(', 'svenska', 'punct', -2), ('info', 'svenska', 'appos', -3), (')', 'svenska', 'punct', -4), (')', 'Svenska', 'punct', -7), ('är', 'språk', 'cop', 3), ('ett', 'språk', 'det', 2), ('östnordiskt', 'språk', 'amod', 1), ('språk', 'språk', 'ROOT', 0), ('som', 'talas', 'nsubj:pass', 1), ('talas', 'språk', 'acl:relcl', -2), ('av', 'personer', 'case', 4), ('ungefär', 'tio', 'advmod', 1), ('tio', 'miljoner', 'nummod', 1), ('miljoner', 'personer', 'nmod', 1), ('personer', 'talas', 'obl:agent', -5), ('främst', 'personer', 'nmod', -1), ('i', 'Sverige', 'case', 1), ('Sverige', 'talas', 'obl', -8), ('där', 'har', 'advmod', 2), ('språket', 'har', 'nsubj', 1), ('har', 'talas', 'dep', -11), ('en', 'ställning', 'det', 2), ('dominant', 'ställning', 'amod', 1), ('ställning', 'har', 'obj', -3), ('som', 'huvudspråk', 'mark', 1), ('huvudspråk', 'ställning', 'acl', -2), (',', 'nationalspråket', 'punct', 6), ('men', 'nationalspråket', 'cc', 5), ('även', 'nationalspråket', 'advmod', 4), ('som', 'nationalspråket', 'case', 3), ('det', 'nationalspråket', 'det', 2), ('ena', 'nationalspråket', 'amod', 1), ('nationalspråket', 'huvudspråk', 'conj', -7), ('i', 'Finland', 'case', 1), ('Finland', 'nationalspråket', 'nmod', -2), ('och', 'språk', 'cc', 4), ('som', 'språk', 'mark', 3), ('enda', 'språk', 'amod', 2), ('officiella', 'språk', 'amod', 1), ('språk', 'Finland', 'conj', -5), ('på', 'Åland', 'case', 1), ('Åland', 'språk', 'nmod', -2), ('.', 'språk', 'punct', -35)] elif lang == 'ukr': assert dependencies == [('Украї́нська', 'мо́ва', 'amod', 1), ('мо́ва', 'ру́ська', 'nsubj', 12), ('(', 'МФА', 'punct', 1), ('МФА', 'мо́ва', 'appos', -2), (':', '[', 'punct', 1), ('[', 'МФА', 'parataxis', -2), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', '[', 'flat:foreign', -1), ('ˈmɔwɑ̽', '[', 'flat:foreign', -2), (']', '[', 'punct', -3), (',', 'назви', 'punct', 2), ('історичні', 'назви', 'amod', 1), ('назви', 'мо́ва', 'conj', -10), ('—', 'ру́ська', 'punct', 1), ('ру́ська', 'ру́ська', 'ROOT', 0), (',', 'руси́нська[10][11][12', 'punct', 1), ('руси́нська[10][11][12', 'ру́ська', 'conj', -2), (']', 'ру́ська', 'punct', -3), ('[', 'ру́ська', 'nmod', -4), ('*', 'ру́ська', 'punct', -5), ('1', 'ру́ська', 'dep', -6), (']', 'ру́ська', 'parataxis', -7), (')', 'ру́ська', 'punct', -8), ('—', 'мова', 'punct', 2), ('національна', 'мова', 'amod', 1), ('мова', 'мова', 'ROOT', 0), ('українців', 'мова', 'nmod', -1), ('.', 'мова', 'punct', -2)] else: diff --git a/tests/wl_tests_nlp/test_lemmatization.py b/tests/wl_tests_nlp/test_lemmatization.py index da6763679..b3c7549a9 100644 --- a/tests/wl_tests_nlp/test_lemmatization.py +++ b/tests/wl_tests_nlp/test_lemmatization.py @@ -322,7 +322,7 @@ def test_lemmatize(lang, lemmatizer): if lemmatizer == 'simplemma_swe': assert lemmas == ['svensk', '(', 'svensk', '(', 'info', ')', ')', 'ära', 'en', 'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'ha', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'mena', 'även', 'som', 'den', 'en', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.'] elif lemmatizer == 'spacy_swe': - assert lemmas == ['svensk', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en', 'östnordisk', 'språk', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'ha', 'en', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'men', 'även', 'som', 'en', 'ena', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språk', 'på', 'Åland', '.'] + assert lemmas == ['svensk', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en', 'östnordisk', 'språk', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språke', 'ha', 'en', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'men', 'även', 'som', 'en', 'ena', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språk', 'på', 'Åland', '.'] else: tests_lang_util_skipped = True elif lang == 'tgl': diff --git a/tests/wl_tests_nlp/test_pos_tagging.py b/tests/wl_tests_nlp/test_pos_tagging.py index b73325541..85aea7cc8 100644 --- a/tests/wl_tests_nlp/test_pos_tagging.py +++ b/tests/wl_tests_nlp/test_pos_tagging.py @@ -190,8 +190,8 @@ def test_pos_tag(lang, pos_tagger): elif lang == 'spa': assert tokens_tagged == tokens_tagged_universal == [('El', 'DET'), ('español', 'NOUN'), ('o', 'CCONJ'), ('castellano', 'NOUN'), ('es', 'AUX'), ('una', 'DET'), ('lengua', 'ADJ'), ('romance', 'NOUN'), ('procedente', 'ADJ'), ('del', 'ADP'), ('latín', 'NOUN'), ('hablado', 'ADJ'), (',', 'PUNCT'), ('perteneciente', 'ADJ'), ('a', 'ADP'), ('la', 'DET'), ('familia', 'NOUN'), ('de', 'ADP'), ('lenguas', 'NOUN'), ('indoeuropeas', 'ADJ'), ('.', 'PUNCT')] elif lang == 'swe': - assert tokens_tagged == [('Svenska', 'JJ|POS|UTR/NEU|SIN|DEF|NOM'), ('(', 'PAD'), ('svenska', 'JJ|POS|UTR/NEU|SIN|DEF|NOM'), ('(', 'PAD'), ('info', 'NN|NEU|SIN|IND|NOM'), (')', 'PAD'), (')', 'PAD'), ('är', 'VB|PRS|AKT'), ('ett', 'DT|NEU|SIN|IND'), ('östnordiskt', 'JJ|POS|NEU|SIN|IND|NOM'), ('språk', 'NN|NEU|SIN|IND|NOM'), ('som', 'HP|-|-|-'), ('talas', 'VB|PRS|SFO'), ('av', 'PP'), ('ungefär', 'AB'), ('tio', 'RG|NOM'), ('miljoner', 'NN|UTR|PLU|IND|NOM'), ('personer', 'NN|UTR|PLU|IND|NOM'), ('främst', 'AB|SUV'), ('i', 'PP'), ('Sverige', 'PM|NOM'), ('där', 'HA'), ('språket', 'NN|NEU|SIN|DEF|NOM'), ('har', 'VB|PRS|AKT'), ('en', 'DT|UTR|SIN|IND'), ('dominant', 'JJ|POS|NEU|SIN|IND|NOM'), ('ställning', 'NN|UTR|SIN|IND|NOM'), ('som', 'KN'), ('huvudspråk', 'NN|NEU|SIN|IND|NOM'), (',', 'MID'), ('men', 'KN'), ('även', 'AB'), ('som', 'HA'), ('det', 'DT|NEU|SIN|DEF'), ('ena', 'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM'), ('nationalspråket', 'NN|NEU|SIN|DEF|NOM'), ('i', 'PP'), ('Finland', 'PM|NOM'), ('och', 'KN'), ('som', 'HP|-|-|-'), ('enda', 'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM'), ('officiella', 'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM'), ('språk', 'NN|NEU|PLU|IND|NOM'), ('på', 'PP'), ('Åland', 'PM|NOM'), ('.', 'MAD')] - assert tokens_tagged_universal == [('Svenska', 'ADJ'), ('(', 'PUNCT'), ('svenska', 'ADJ'), ('(', 'PUNCT'), ('info', 'NOUN'), (')', 'PUNCT'), (')', 'PUNCT'), ('är', 'AUX'), ('ett', 'DET'), ('östnordiskt', 'ADJ'), ('språk', 'NOUN'), ('som', 'PRON'), ('talas', 'VERB'), ('av', 'ADP'), ('ungefär', 'ADV'), ('tio', 'NUM'), ('miljoner', 'NOUN'), ('personer', 'NOUN'), ('främst', 'ADV'), ('i', 'ADP'), ('Sverige', 'PROPN'), ('där', 'ADV'), ('språket', 'NOUN'), ('har', 'VERB'), ('en', 'DET'), ('dominant', 'ADJ'), ('ställning', 'NOUN'), ('som', 'SCONJ'), ('huvudspråk', 'NOUN'), (',', 'PUNCT'), ('men', 'CCONJ'), ('även', 'ADV'), ('som', 'ADV'), ('det', 'DET'), ('ena', 'ADJ'), ('nationalspråket', 'NOUN'), ('i', 'ADP'), ('Finland', 'PROPN'), ('och', 'CCONJ'), ('som', 'SCONJ'), ('enda', 'ADJ'), ('officiella', 'ADJ'), ('språk', 'NOUN'), ('på', 'ADP'), ('Åland', 'PROPN'), ('.', 'PUNCT')] + assert tokens_tagged == [('Svenska', 'JJ|POS|UTR/NEU|SIN|DEF|NOM'), ('(', 'PAD'), ('svenska', 'JJ|POS|UTR/NEU|SIN|DEF|NOM'), ('(', 'PAD'), ('info', 'JJ|POS|NEU|SIN|IND|NOM'), (')', 'PAD'), (')', 'PAD'), ('är', 'VB|PRS|AKT'), ('ett', 'DT|NEU|SIN|IND'), ('östnordiskt', 'JJ|POS|NEU|SIN|IND|NOM'), ('språk', 'NN|NEU|SIN|IND|NOM'), ('som', 'HP|-|-|-'), ('talas', 'VB|PRS|SFO'), ('av', 'PP'), ('ungefär', 'AB'), ('tio', 'RG|NOM'), ('miljoner', 'NN|UTR|PLU|IND|NOM'), ('personer', 'NN|UTR|PLU|IND|NOM'), ('främst', 'AB|SUV'), ('i', 'PP'), ('Sverige', 'PM|NOM'), ('där', 'HA'), ('språket', 'AB|POS'), ('har', 'VB|PRS|AKT'), ('en', 'DT|UTR|SIN|IND'), ('dominant', 'JJ|POS|UTR|SIN|IND|NOM'), ('ställning', 'NN|UTR|SIN|IND|NOM'), ('som', 'KN'), ('huvudspråk', 'NN|NEU|SIN|IND|NOM'), (',', 'MID'), ('men', 'KN'), ('även', 'AB'), ('som', 'HA'), ('det', 'DT|NEU|SIN|DEF'), ('ena', 'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM'), ('nationalspråket', 'NN|NEU|SIN|DEF|NOM'), ('i', 'PP'), ('Finland', 'PM|NOM'), ('och', 'KN'), ('som', 'HP|-|-|-'), ('enda', 'JJ|POS|UTR/NEU|SIN/PLU|IND/DEF|NOM'), ('officiella', 'JJ|POS|UTR/NEU|PLU|IND/DEF|NOM'), ('språk', 'NN|NEU|SIN|IND|NOM'), ('på', 'PP'), ('Åland', 'PM|NOM'), ('.', 'MAD')] + assert tokens_tagged_universal == [('Svenska', 'ADJ'), ('(', 'PUNCT'), ('svenska', 'ADJ'), ('(', 'PUNCT'), ('info', 'ADJ'), (')', 'PUNCT'), (')', 'PUNCT'), ('är', 'AUX'), ('ett', 'DET'), ('östnordiskt', 'ADJ'), ('språk', 'NOUN'), ('som', 'PRON'), ('talas', 'VERB'), ('av', 'ADP'), ('ungefär', 'ADV'), ('tio', 'NUM'), ('miljoner', 'NOUN'), ('personer', 'NOUN'), ('främst', 'ADV'), ('i', 'ADP'), ('Sverige', 'PROPN'), ('där', 'ADV'), ('språket', 'ADV'), ('har', 'VERB'), ('en', 'DET'), ('dominant', 'ADJ'), ('ställning', 'NOUN'), ('som', 'SCONJ'), ('huvudspråk', 'NOUN'), (',', 'PUNCT'), ('men', 'CCONJ'), ('även', 'ADV'), ('som', 'SCONJ'), ('det', 'DET'), ('ena', 'ADJ'), ('nationalspråket', 'NOUN'), ('i', 'ADP'), ('Finland', 'PROPN'), ('och', 'CCONJ'), ('som', 'SCONJ'), ('enda', 'ADJ'), ('officiella', 'ADJ'), ('språk', 'NOUN'), ('på', 'ADP'), ('Åland', 'PROPN'), ('.', 'PUNCT')] elif lang == 'tha': if pos_tagger == 'pythainlp_perceptron_blackboard': assert tokens_tagged == [('ภาษาไทย', 'NN'), ('หรือ', 'CC'), ('ภาษาไทย', 'NN'), ('กลาง', 'NN'), ('เป็น', 'VV'), ('ภาษา', 'NN'), ('ใน', 'PS'), ('กลุ่ม', 'NN'), ('ภาษา', 'NN'), ('ไท', 'NN'), ('ซึ่ง', 'CC'), ('เป็น', 'VV'), ('กลุ่มย่อย', 'NN'), ('ของ', 'PS'), ('ตระกูล', 'NN'), ('ภาษา', 'NN'), ('ข', 'NN'), ('ร้า', 'NN'), ('-', 'PU'), ('ไท', 'NN'), ('และ', 'CC'), ('เป็น', 'VV'), ('ภาษาราชการ', 'NN'), ('และ', 'CC'), ('ภาษาประจำชาติ', 'NN'), ('ของ', 'PS'), ('ประเทศ', 'NN'), ('ไทย', 'NN'), ('[', 'NN'), ('3', 'NU'), ('][', 'CL'), ('4', 'NU'), (']', 'CL')] diff --git a/utils/wl_downloader_spacy_models.py b/utils/wl_downloader_spacy_models.py index ede2283f7..2b977d903 100644 --- a/utils/wl_downloader_spacy_models.py +++ b/utils/wl_downloader_spacy_models.py @@ -56,7 +56,7 @@ model_ver_eng = model_ver_fin = model_ver_fra = model_ver_deu = model_ver_ell = \ model_ver_ita = model_ver_jpn = model_ver_lit = model_ver_mkd = model_ver_nob = \ model_ver_pol = model_ver_por = model_ver_ron = model_ver_rus = model_ver_spa = \ -model_ver_swe = model_ver_ukr = '3.5.0' +model_ver_swe = model_ver_ukr = '3.6.0' # Check updates for lang_text, lang_code_639_3, lang_code_639_1 in langs: