From 7d77695b055d307e42e1e65eb1f15e0a52c9b971 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Sat, 18 May 2024 09:03:50 +0800 Subject: [PATCH] Dependencies: Upgrade Pyphen to 0.15.0; Utils: Add Pyphen's Basque syllable tokenizer --- ACKS.md | 2 +- CHANGELOG.md | 2 ++ doc/trs/zho_cn/ACKS.md | 2 +- doc/trs/zho_tw/ACKS.md | 2 +- requirements/requirements_tests.txt | 2 +- tests/tests_nlp/test_syl_tokenization.py | 2 ++ utils/wl_generate_acks.py | 2 +- wordless/wl_settings/wl_settings_default.py | 1 + wordless/wl_settings/wl_settings_global.py | 2 ++ 9 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ACKS.md b/ACKS.md index 0dad62f3d..067679fcf 100644 --- a/ACKS.md +++ b/ACKS.md @@ -37,7 +37,7 @@ As Wordless stands on the shoulders of giants, I hereby extend my sincere gratit 13|[PyInstaller](http://www.pyinstaller.org/)|6.0|Hartmut Goebel, Jasper Harrison, Bryan A. Jones,
Brénainn Woodsend, Rok Mandeljc|[Bootloader-exception](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt) 14|[pymorphy3](https://github.com/no-plagiarism/pymorphy3)|2.0.1|Mikhail Korobov, Danylo Halaiko|[MIT](https://github.com/no-plagiarism/pymorphy3/blob/master/LICENSE.txt) 15|[pypdf](https://github.com/py-pdf/pypdf)|3.16.2|Mathieu Fenniak, Ashish Kulkarni, Steve Witham, Martin Thoma|[BSD-3-Clause](https://github.com/py-pdf/pypdf/blob/main/LICENSE) -16|[Pyphen](https://pyphen.org/)|0.14.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) +16|[Pyphen](https://pyphen.org/)|0.15.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) 17|[PyQt](https://riverbankcomputing.com/software/pyqt/)|5.15.10|Riverbank Computing|[Commercial-License/GPL-3.0-only](https://www.riverbankcomputing.com/static/Docs/PyQt5/introduction.html#license) 18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|5.0.3|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) 19|[python-docx](https://github.com/python-openxml/python-docx)|1.1.0|Steve Canny|[MIT](https://github.com/python-openxml/python-docx/blob/master/LICENSE) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bf59a229..6b48d9aeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ ### 🎉 New Features - Settings: Add Settings - Stop Word Lists - Stop Word List Settings - Case-sensitive - Settings: Add Settings - Tables - Dependency Parser +- Utils: Add Pyphen's Basque syllable tokenizer - Utils: Add PyThaiNLP's Han-solo - Utils: Add Stanza's Sindhi part-of-speech tagger - Utils: Add VADER's sentiment analyzers @@ -50,6 +51,7 @@ - Dependencies: Upgrade LaoNLP to 1.1.3 - Dependencies: Upgrade Lingua to 2.0.2 - Dependencies: Upgrade pymorphy3 to 2.0.1 +- Dependencies: Upgrade Pyphen to 0.15.0 - Dependencies: Upgrade PyQt to 5.15.10 - Dependencies: Upgrade PyThaiNLP to 5.0.3 - Dependencies: Upgrade python-docx to 1.1.0 diff --git a/doc/trs/zho_cn/ACKS.md b/doc/trs/zho_cn/ACKS.md index 171f791ea..b1ea5c976 100644 --- a/doc/trs/zho_cn/ACKS.md +++ b/doc/trs/zho_cn/ACKS.md @@ -37,7 +37,7 @@ 13|[PyInstaller](http://www.pyinstaller.org/)|6.0|Hartmut Goebel, Jasper Harrison, Bryan A. Jones,
Brénainn Woodsend, Rok Mandeljc|[Bootloader-exception](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt) 14|[pymorphy3](https://github.com/no-plagiarism/pymorphy3)|2.0.1|Mikhail Korobov, Danylo Halaiko|[MIT](https://github.com/no-plagiarism/pymorphy3/blob/master/LICENSE.txt) 15|[pypdf](https://github.com/py-pdf/pypdf)|3.16.2|Mathieu Fenniak, Ashish Kulkarni, Steve Witham, Martin Thoma|[BSD-3-Clause](https://github.com/py-pdf/pypdf/blob/main/LICENSE) -16|[Pyphen](https://pyphen.org/)|0.14.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) +16|[Pyphen](https://pyphen.org/)|0.15.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) 17|[PyQt](https://riverbankcomputing.com/software/pyqt/)|5.15.10|Riverbank Computing|[Commercial-License/GPL-3.0-only](https://www.riverbankcomputing.com/static/Docs/PyQt5/introduction.html#license) 18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|5.0.3|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) 19|[python-docx](https://github.com/python-openxml/python-docx)|1.1.0|Steve Canny|[MIT](https://github.com/python-openxml/python-docx/blob/master/LICENSE) diff --git a/doc/trs/zho_tw/ACKS.md b/doc/trs/zho_tw/ACKS.md index 5e25ec2ec..76d1c90eb 100644 --- a/doc/trs/zho_tw/ACKS.md +++ b/doc/trs/zho_tw/ACKS.md @@ -37,7 +37,7 @@ 13|[PyInstaller](http://www.pyinstaller.org/)|6.0|Hartmut Goebel, Jasper Harrison, Bryan A. Jones,
Brénainn Woodsend, Rok Mandeljc|[Bootloader-exception](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt) 14|[pymorphy3](https://github.com/no-plagiarism/pymorphy3)|2.0.1|Mikhail Korobov, Danylo Halaiko|[MIT](https://github.com/no-plagiarism/pymorphy3/blob/master/LICENSE.txt) 15|[pypdf](https://github.com/py-pdf/pypdf)|3.16.2|Mathieu Fenniak, Ashish Kulkarni, Steve Witham, Martin Thoma|[BSD-3-Clause](https://github.com/py-pdf/pypdf/blob/main/LICENSE) -16|[Pyphen](https://pyphen.org/)|0.14.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) +16|[Pyphen](https://pyphen.org/)|0.15.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) 17|[PyQt](https://riverbankcomputing.com/software/pyqt/)|5.15.10|Riverbank Computing|[Commercial-License/GPL-3.0-only](https://www.riverbankcomputing.com/static/Docs/PyQt5/introduction.html#license) 18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|5.0.3|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) 19|[python-docx](https://github.com/python-openxml/python-docx)|1.1.0|Steve Canny|[MIT](https://github.com/python-openxml/python-docx/blob/master/LICENSE) diff --git a/requirements/requirements_tests.txt b/requirements/requirements_tests.txt index d0b4e9028..61c58beef 100644 --- a/requirements/requirements_tests.txt +++ b/requirements/requirements_tests.txt @@ -24,7 +24,7 @@ khmer-nltk == 1.6 laonlp == 1.1.3 lingua-language-detector == 2.0.2 nltk == 3.8.1 -pyphen == 0.14.0 +pyphen == 0.15.0 pythainlp == 5.0.3 sacremoses == 0.1.1 simplemma == 0.9.1 diff --git a/tests/tests_nlp/test_syl_tokenization.py b/tests/tests_nlp/test_syl_tokenization.py index 078d29f4f..93c5dac68 100644 --- a/tests/tests_nlp/test_syl_tokenization.py +++ b/tests/tests_nlp/test_syl_tokenization.py @@ -111,6 +111,8 @@ def test_syl_tokenize(lang, syl_tokenizer): assert syls_tokens == [('Afri', 'kaans'), ('is',), ('ti', 'po', 'lo', 'gies'), ('be', 'skou'), ("'n",), ('In', 'do', 'Eu', 'ro', 'pe', 'se'), (',',), ('Wes', 'Ger', 'maan', 'se'), (',',), ('Ne', 'derfran', 'kie', 'se'), ('taal',), (',',), ('[',), ('2',), (']',), ('wat',), ('aan',), ('die',), ('suid', 'punt'), ('van',), ('Afri', 'ka'), ('on', 'der'), ('in', 'vloed'), ('van',), ('ver', 'skeie'), ('an', 'der'), ('ta', 'le'), ('en',), ('taal', 'groe', 'pe'), ('ont', 'staan'), ('het',), ('.',)] case 'sqi': assert syls_tokens == [('Gju', 'ha'), ('shqi', 'pe'), ('(',), ('ose',), ('thjesht',), ('shqi', 'p', 'ja'), (')',), ('ësh', 'të'), ('gju', 'hë'), ('dhe',), ('de', 'gë'), ('e',), ('ve', 'ça', 'n', 'të'), ('e',), ('fa', 'mi', 'l', 'jes'), ('in', 'do', 'e', 'v', 'ro', 'pi', 'ane'), ('që',), ('fli', 'tet'), ('nga',), ('rreth',), ('7', '10'), ('mi', 'li', 'onë'), ('nje', 'rëz'), ('në',), ('bo', 'të'), (',',), ('[',), ('1',), (']',), ('kry', 'esisht'), ('në',), ('Shqi', 'pë', 'ri'), (',',), ('Ko', 'so', 'vë'), ('dhe',), ('Ma', 'qe', 'do', 'ni', 'në'), ('e',), ('Ve', 'ri', 'ut'), (',',), ('por',), ('edhe',), ('në',), ('zo', 'na'), ('të',), ('tje', 'ra'), ('të',), ('Ev', 'ro', 'pës'), ('Ju', 'g', 'li', 'n', 'do', 're'), ('ku',), ('ka',), ('një',), ('po', 'pu', 'll', 'si'), ('shqi', 'p', 'ta', 're'), (',',), ('du', 'ke'), ('pë', 'r', 'f', 'shi', 'rë'), ('Ma', 'lin'), ('e',), ('Zi',), ('dhe',), ('Lu', 'gi', 'nën'), ('e',), ('Pre', 'she', 'vës'), ('.',)] + case 'eus': + assert syls_tokens == [('Eus', 'ka', 'ra'), ('Eus', 'kal'), ('He', 'rri', 'ko'), ('hiz', 'kun', 'tza'), ('da.',), ('[',), ('8',), (']',)] case 'bel': assert syls_tokens == [('Бе', 'ла', 'ру́с', 'кая'), ('мо́', 'ва'), ('—',), ('на', 'цы', 'я', 'на', 'ль', 'ная'), ('мо', 'ва'), ('бе', 'ла', 'ру', 'саў'), (',',), ('ува', 'хо', 'дзіць'), ('у',), ('ін', 'да', 'еў', 'ра', 'пей', 'с', 'кую'), ('моў', 'ную'), ('сям',), ("'",), ('ю',), (',',), ('сла', 'вя', 'н', 'с', 'кую'), ('гру', 'пу'), (',',), ('ус', 'хо', 'д', 'не', 'с', 'ла', 'вя', 'н', 'с', 'кую'), ('па', 'д', 'г', 'ру', 'пу'), ('.',)] case 'bul': diff --git a/utils/wl_generate_acks.py b/utils/wl_generate_acks.py index 719ac3dcd..b4d54fe25 100644 --- a/utils/wl_generate_acks.py +++ b/utils/wl_generate_acks.py @@ -51,7 +51,7 @@ ['PyInstaller', 'http://www.pyinstaller.org/', '6.0', 'Hartmut Goebel, Jasper Harrison, Bryan A. Jones,
Brénainn Woodsend, Rok Mandeljc', 'Bootloader-exception', 'https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt'], ['pymorphy3', 'https://github.com/no-plagiarism/pymorphy3', '2.0.1', 'Mikhail Korobov, Danylo Halaiko', 'MIT', 'https://github.com/no-plagiarism/pymorphy3/blob/master/LICENSE.txt'], ['pypdf', 'https://github.com/py-pdf/pypdf', '3.16.2', 'Mathieu Fenniak, Ashish Kulkarni, Steve Witham, Martin Thoma', 'BSD-3-Clause', 'https://github.com/py-pdf/pypdf/blob/main/LICENSE'], - ['Pyphen', 'https://pyphen.org/', '0.14.0', 'Guillaume Ayoub', 'GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1', 'https://github.com/Kozea/Pyphen/blob/master/LICENSE'], + ['Pyphen', 'https://pyphen.org/', '0.15.0', 'Guillaume Ayoub', 'GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1', 'https://github.com/Kozea/Pyphen/blob/master/LICENSE'], ['PyQt', 'https://riverbankcomputing.com/software/pyqt/', '5.15.10', 'Riverbank Computing', 'Commercial-License/GPL-3.0-only', 'https://www.riverbankcomputing.com/static/Docs/PyQt5/introduction.html#license'], ['PyThaiNLP', 'https://github.com/PyThaiNLP/pythainlp', '5.0.3', 'Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)', 'Apache-2.0', 'https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE'], ['python-docx', 'https://github.com/python-openxml/python-docx', '1.1.0', 'Steve Canny', 'MIT', 'https://github.com/python-openxml/python-docx/blob/master/LICENSE'], diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index 7d53f20f0..811b20fac 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -1435,6 +1435,7 @@ def init_settings_default(main): 'syl_tokenizer_settings': { 'afr': 'pyphen_afr', 'sqi': 'pyphen_sqi', + 'eus': 'pyphen_eus', 'bel': 'pyphen_bel', 'bul': 'pyphen_bul', 'cat': 'pyphen_cat', diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index 76be085df..a50db6ff0 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -723,6 +723,7 @@ _tr('wl_settings_global', 'Pyphen - Afrikaans syllable tokenizer'): 'pyphen_afr', _tr('wl_settings_global', 'Pyphen - Albanian syllable tokenizer'): 'pyphen_sqi', + _tr('wl_settings_global', 'Pyphen - Basque syllable tokenizer'): 'pyphen_eus', _tr('wl_settings_global', 'Pyphen - Belarusian syllable tokenizer'): 'pyphen_bel', _tr('wl_settings_global', 'Pyphen - Bulgarian syllable tokenizer'): 'pyphen_bul', _tr('wl_settings_global', 'Pyphen - Catalan syllable tokenizer'): 'pyphen_cat', @@ -2468,6 +2469,7 @@ 'syl_tokenizers': { 'afr': ['pyphen_afr'], 'sqi': ['pyphen_sqi'], + 'eus': ['pyphen_eus'], 'bel': ['pyphen_bel'], 'bul': ['pyphen_bul'], 'cat': ['pyphen_cat'],