From b6bcb9df26d21568ae985831928d8b474cf21a9f Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Sat, 18 May 2024 02:15:42 +0800 Subject: [PATCH] Dependencies: Upgrade PyThaiNLP to 5.0.3; Utils: Add PyThaiNLP's Han-solo --- ACKS.md | 2 +- CHANGELOG.md | 2 ++ doc/trs/zho_cn/ACKS.md | 2 +- doc/trs/zho_tw/ACKS.md | 2 +- requirements/requirements_dev.txt | 1 - requirements/requirements_tests.txt | 3 +-- tests/tests_nlp/test_syl_tokenization.py | 6 +++++- utils/wl_generate_acks.py | 2 +- wordless/wl_nlp/wl_syl_tokenization.py | 6 ++++-- wordless/wl_settings/wl_settings_default.py | 2 +- wordless/wl_settings/wl_settings_global.py | 6 ++++-- 11 files changed, 21 insertions(+), 13 deletions(-) diff --git a/ACKS.md b/ACKS.md index 72f76b419..0dad62f3d 100644 --- a/ACKS.md +++ b/ACKS.md @@ -39,7 +39,7 @@ As Wordless stands on the shoulders of giants, I hereby extend my sincere gratit 15|[pypdf](https://github.com/py-pdf/pypdf)|3.16.2|Mathieu Fenniak, Ashish Kulkarni, Steve Witham, Martin Thoma|[BSD-3-Clause](https://github.com/py-pdf/pypdf/blob/main/LICENSE) 16|[Pyphen](https://pyphen.org/)|0.14.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) 17|[PyQt](https://riverbankcomputing.com/software/pyqt/)|5.15.10|Riverbank Computing|[Commercial-License/GPL-3.0-only](https://www.riverbankcomputing.com/static/Docs/PyQt5/introduction.html#license) -18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|4.0.2|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) +18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|5.0.3|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) 19|[python-docx](https://github.com/python-openxml/python-docx)|1.1.0|Steve Canny|[MIT](https://github.com/python-openxml/python-docx/blob/master/LICENSE) 20|[python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko)|1.3.3|Jonghwan Hyeon|[BSD-3-Clause](https://github.com/jonghwanhyeon/python-mecab-ko/blob/main/LICENSE) 21|[Requests](https://github.com/psf/requests)|2.31.0|Kenneth Reitz|[Apache-2.0](https://github.com/psf/requests/blob/main/LICENSE) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c919f02d..3bf59a229 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ ### 🎉 New Features - Settings: Add Settings - Stop Word Lists - Stop Word List Settings - Case-sensitive - Settings: Add Settings - Tables - Dependency Parser +- Utils: Add PyThaiNLP's Han-solo - Utils: Add Stanza's Sindhi part-of-speech tagger - Utils: Add VADER's sentiment analyzers - Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic @@ -50,6 +51,7 @@ - Dependencies: Upgrade Lingua to 2.0.2 - Dependencies: Upgrade pymorphy3 to 2.0.1 - Dependencies: Upgrade PyQt to 5.15.10 +- Dependencies: Upgrade PyThaiNLP to 5.0.3 - Dependencies: Upgrade python-docx to 1.1.0 - Dependencies: Upgrade Sacremoses to 0.1.1 - Dependencies: Upgrade spaCy to 3.7.2 diff --git a/doc/trs/zho_cn/ACKS.md b/doc/trs/zho_cn/ACKS.md index eb50301a9..171f791ea 100644 --- a/doc/trs/zho_cn/ACKS.md +++ b/doc/trs/zho_cn/ACKS.md @@ -39,7 +39,7 @@ 15|[pypdf](https://github.com/py-pdf/pypdf)|3.16.2|Mathieu Fenniak, Ashish Kulkarni, Steve Witham, Martin Thoma|[BSD-3-Clause](https://github.com/py-pdf/pypdf/blob/main/LICENSE) 16|[Pyphen](https://pyphen.org/)|0.14.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) 17|[PyQt](https://riverbankcomputing.com/software/pyqt/)|5.15.10|Riverbank Computing|[Commercial-License/GPL-3.0-only](https://www.riverbankcomputing.com/static/Docs/PyQt5/introduction.html#license) -18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|4.0.2|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) +18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|5.0.3|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) 19|[python-docx](https://github.com/python-openxml/python-docx)|1.1.0|Steve Canny|[MIT](https://github.com/python-openxml/python-docx/blob/master/LICENSE) 20|[python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko)|1.3.3|Jonghwan Hyeon|[BSD-3-Clause](https://github.com/jonghwanhyeon/python-mecab-ko/blob/main/LICENSE) 21|[Requests](https://github.com/psf/requests)|2.31.0|Kenneth Reitz|[Apache-2.0](https://github.com/psf/requests/blob/main/LICENSE) diff --git a/doc/trs/zho_tw/ACKS.md b/doc/trs/zho_tw/ACKS.md index 8bff584f4..5e25ec2ec 100644 --- a/doc/trs/zho_tw/ACKS.md +++ b/doc/trs/zho_tw/ACKS.md @@ -39,7 +39,7 @@ 15|[pypdf](https://github.com/py-pdf/pypdf)|3.16.2|Mathieu Fenniak, Ashish Kulkarni, Steve Witham, Martin Thoma|[BSD-3-Clause](https://github.com/py-pdf/pypdf/blob/main/LICENSE) 16|[Pyphen](https://pyphen.org/)|0.14.0|Guillaume Ayoub|[GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1](https://github.com/Kozea/Pyphen/blob/master/LICENSE) 17|[PyQt](https://riverbankcomputing.com/software/pyqt/)|5.15.10|Riverbank Computing|[Commercial-License/GPL-3.0-only](https://www.riverbankcomputing.com/static/Docs/PyQt5/introduction.html#license) -18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|4.0.2|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) +18|[PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)|5.0.3|Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)|[Apache-2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) 19|[python-docx](https://github.com/python-openxml/python-docx)|1.1.0|Steve Canny|[MIT](https://github.com/python-openxml/python-docx/blob/master/LICENSE) 20|[python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko)|1.3.3|Jonghwan Hyeon|[BSD-3-Clause](https://github.com/jonghwanhyeon/python-mecab-ko/blob/main/LICENSE) 21|[Requests](https://github.com/psf/requests)|2.31.0|Kenneth Reitz|[Apache-2.0](https://github.com/psf/requests/blob/main/LICENSE) diff --git a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt index e90afd8ed..86b04beeb 100644 --- a/requirements/requirements_dev.txt +++ b/requirements/requirements_dev.txt @@ -60,7 +60,6 @@ pyqt5 python-docx requests scipy -tzdata; sys_platform == 'win32' # Required by PyThaiNLP on Windows wordcloud # For PyTorch on Linux using CPU diff --git a/requirements/requirements_tests.txt b/requirements/requirements_tests.txt index 9257ff8b1..d0b4e9028 100644 --- a/requirements/requirements_tests.txt +++ b/requirements/requirements_tests.txt @@ -25,7 +25,7 @@ laonlp == 1.1.3 lingua-language-detector == 2.0.2 nltk == 3.8.1 pyphen == 0.14.0 -pythainlp == 4.0.2 +pythainlp == 5.0.3 sacremoses == 0.1.1 simplemma == 0.9.1 stanza == 1.7.0 @@ -64,7 +64,6 @@ pytest python-docx requests scipy -tzdata; sys_platform == 'win32' # Required by PyThaiNLP on Windows wordcloud # For PyTorch on Linux using CPU diff --git a/tests/tests_nlp/test_syl_tokenization.py b/tests/tests_nlp/test_syl_tokenization.py index 534933dee..078d29f4f 100644 --- a/tests/tests_nlp/test_syl_tokenization.py +++ b/tests/tests_nlp/test_syl_tokenization.py @@ -190,7 +190,11 @@ def test_syl_tokenize(lang, syl_tokenizer): case 'tel': assert syls_tokens == [('తె', 'లు', 'గు'), ('అనే', 'ది'), ('ద్రా', 'విడ'), ('భా', 'షల'), ('కు', 'టుం', 'బా', 'ని', 'కి'), ('చెం', 'దిన'), ('భాష',), ('.',)] case 'tha': - assert syls_tokens == [('ภา', 'ษา', 'ไทย'), ('หรือ',), ('ภา', 'ษา', 'ไทย'), ('กลาง',), ('เป็น',), ('ภา', 'ษา'), ('ใน',), ('กลุ่ม',), ('ภา', 'ษา'), ('ไท',), ('ซึ่ง',), ('เป็น',), ('กลุ่ม', 'ย่อย'), ('ของ',), ('ตระ', 'กูล'), ('ภา', 'ษา'), ('ข',), ('ร้า',), ('-',), ('ไท',), ('และ',), ('เป็น',), ('ภา', 'ษา', 'ราช', 'การ'), ('และ',), ('ภา', 'ษา', 'ประ', 'จำ', 'ชาติ'), ('ของ',), ('ประ', 'เทศ'), ('ไทย',), ('[',), ('3',), ('][',), ('4',), (']',)] + match syl_tokenizer: + case 'pythainlp_han_solo': + assert syls_tokens == [('ภา', 'ษา', 'ไทย'), ('หรือ',), ('ภา', 'ษา', 'ไทย'), ('กลาง',), ('เป็น',), ('ภา', 'ษา'), ('ใน',), ('กลุ่ม',), ('ภา', 'ษา'), ('ไท',), ('ซึ่ง',), ('เป็น',), ('กลุ่ม', 'ย่อย'), ('ของ',), ('ตระ', 'กูล'), ('ภา', 'ษา'), ('ข',), ('ร้า',), ('-',), ('ไท',), ('และ',), ('เป็น',), ('ภา', 'ษา', 'ราช', 'การ'), ('และ',), ('ภา', 'ษา', 'ประ', 'จำ', 'ชาติ'), ('ของ',), ('ประ', 'เทศ'), ('ไทย',), ('[3', '][4', ']')] + case 'pythainlp_syl_dict': + assert syls_tokens == [('ภา', 'ษา', 'ไทย'), ('หรือ',), ('ภา', 'ษา', 'ไทย'), ('กลาง',), ('เป็น',), ('ภา', 'ษา'), ('ใน',), ('กลุ่ม',), ('ภา', 'ษา'), ('ไท',), ('ซึ่ง',), ('เป็น',), ('กลุ่ม', 'ย่อย'), ('ของ',), ('ตระ', 'กูล'), ('ภา', 'ษา'), ('ข',), ('ร้า',), ('-',), ('ไท',), ('และ',), ('เป็น',), ('ภา', 'ษา', 'ราช', 'การ'), ('และ',), ('ภา', 'ษา', 'ประ', 'จำ', 'ชาติ'), ('ของ',), ('ประ', 'เทศ'), ('ไทย',), ('[3][4]',)] case 'ukr': assert syls_tokens == [('Укра', 'ї', '́', 'н', 'сь', 'ка'), ('мо', '́', 'ва'), ('(',), ('МФА',), (':',), ('[',), ('ukrɑ̽ˈjɪnʲsʲkɑ̽',), ('ˈmɔwɑ̽',), (']',), (',',), ('іс', 'то', 'ри', 'ч', 'ні'), ('на', 'зви'), ('—',), ('ру', '́', 'сь', 'ка'), ('[',), ('10',), (']',), ('[',), ('11',), (']',), ('[',), ('12',), (']',), ('[',), ('*',), ('1',), (']',), (')',), ('—',), ('на', 'ціо', 'на', 'ль', 'на'), ('мо', 'ва'), ('укра', 'ї', 'н', 'ців'), ('.',)] case 'zul': diff --git a/utils/wl_generate_acks.py b/utils/wl_generate_acks.py index 58b59beb0..719ac3dcd 100644 --- a/utils/wl_generate_acks.py +++ b/utils/wl_generate_acks.py @@ -53,7 +53,7 @@ ['pypdf', 'https://github.com/py-pdf/pypdf', '3.16.2', 'Mathieu Fenniak, Ashish Kulkarni, Steve Witham, Martin Thoma', 'BSD-3-Clause', 'https://github.com/py-pdf/pypdf/blob/main/LICENSE'], ['Pyphen', 'https://pyphen.org/', '0.14.0', 'Guillaume Ayoub', 'GPL-2.0-or-later/LGPL-2.1-or-later/MPL-1.1', 'https://github.com/Kozea/Pyphen/blob/master/LICENSE'], ['PyQt', 'https://riverbankcomputing.com/software/pyqt/', '5.15.10', 'Riverbank Computing', 'Commercial-License/GPL-3.0-only', 'https://www.riverbankcomputing.com/static/Docs/PyQt5/introduction.html#license'], - ['PyThaiNLP', 'https://github.com/PyThaiNLP/pythainlp', '4.0.2', 'Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)', 'Apache-2.0', 'https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE'], + ['PyThaiNLP', 'https://github.com/PyThaiNLP/pythainlp', '5.0.3', 'Wannaphong Phatthiyaphaibun (วรรณพงษ์ ภัททิยไพบูลย์)', 'Apache-2.0', 'https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE'], ['python-docx', 'https://github.com/python-openxml/python-docx', '1.1.0', 'Steve Canny', 'MIT', 'https://github.com/python-openxml/python-docx/blob/master/LICENSE'], ['python-mecab-ko', 'https://github.com/jonghwanhyeon/python-mecab-ko', '1.3.3', 'Jonghwan Hyeon', 'BSD-3-Clause', 'https://github.com/jonghwanhyeon/python-mecab-ko/blob/main/LICENSE'], ['Requests', 'https://github.com/psf/requests', '2.31.0', 'Kenneth Reitz', 'Apache-2.0', 'https://github.com/psf/requests/blob/main/LICENSE'], diff --git a/wordless/wl_nlp/wl_syl_tokenization.py b/wordless/wl_nlp/wl_syl_tokenization.py index 83b97178e..bdc55485d 100644 --- a/wordless/wl_nlp/wl_syl_tokenization.py +++ b/wordless/wl_nlp/wl_syl_tokenization.py @@ -105,7 +105,9 @@ def wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer): else: syls_tokens.append([token]) # Thai - elif syl_tokenizer == 'pythainlp_tha': - syls_tokens.append(pythainlp.subword_tokenize(token, engine = 'dict')) + elif syl_tokenizer == 'pythainlp_han_solo': + syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'han_solo')) + elif syl_tokenizer == 'pythainlp_syl_dict': + syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'dict')) return syls_tokens diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index e1b5b7b72..7d53f20f0 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -1473,7 +1473,7 @@ def init_settings_default(main): 'spa': 'pyphen_spa', 'swe': 'pyphen_swe', 'tel': 'pyphen_tel', - 'tha': 'pythainlp_tha', + 'tha': 'pythainlp_han_solo', 'ukr': 'pyphen_ukr', 'zul': 'pyphen_zul' }, diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index b818da39d..76be085df 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -765,7 +765,8 @@ _tr('wl_settings_global', 'Pyphen - Ukrainian syllable tokenizer'): 'pyphen_ukr', _tr('wl_settings_global', 'Pyphen - Zulu syllable tokenizer'): 'pyphen_zul', - _tr('wl_settings_global', 'PyThaiNLP - Thai syllable tokenizer'): 'pythainlp_tha' + _tr('wl_settings_global', 'PyThaiNLP - Han-solo'): 'pythainlp_han_solo', + _tr('wl_settings_global', 'PyThaiNLP - Syllable dictionary'): 'pythainlp_syl_dict' }, 'pos_taggers': { @@ -2519,7 +2520,8 @@ 'tha': [ 'pyphen_tha', - 'pythainlp_tha' + 'pythainlp_han_solo', + 'pythainlp_syl_dict' ], 'ukr': ['pyphen_ukr'],