Skip to content

Commit

Permalink
Utils: Add spaCy's Malay word tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
BLKSerene committed Jul 22, 2023
1 parent b8754ed commit 6e8be5d
Show file tree
Hide file tree
Showing 7 changed files with 10 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
### 🎉 New Features
- Utils: Add PyThaiNLP's perceptron part-of-speech tagger (Blackboard)
- Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser
- Utils: Add spaCy's Malay word tokenizer

### ❌ Removals
- Utils: Remove PyThaiNLP's perceptron part-of-speech tagger (LST20)
Expand Down
2 changes: 1 addition & 1 deletion doc/doc_eng.md
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,7 @@ Lithuanian |✔|✔|✔|✔|✔|✔|✔
Lugbara |⭕️ |⭕️ |✖️|✖️|✖️|✔|✖️
Luxembourgish |⭕️ |✔|✖️|✖️|✔|✖️|✖️
Macedonian |✔|✔|✖️|✔|✔|✖️|✔
Malay |⭕️ |⭕️ |✖️|✖️|✔|✔|✖️
Malay |⭕️ |✔|✖️|✖️|✔|✔|✖️
Malayalam |✔|✔|✖️|✖️|✖️|✖️|✖️
Manx |⭕️ |⭕️ |✖️|✖️|✔|✖️|✖️
Marathi |⭕️ |✔|✖️|✖️|✖️|✔|✖️
Expand Down
2 changes: 1 addition & 1 deletion tests/wl_test_lang_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@
SENTENCE_LIT = TEXT_LIT[0]
SENTENCE_LTZ = "D'Lëtzebuergesch gëtt an der däitscher Dialektologie als ee westgermaneschen, mëtteldäitschen Dialekt aklasséiert, deen zum Muselfränkesche gehéiert."
SENTENCE_MKD = TEXT_MKD[0]
SENTENCE_MSA = 'Bahasa Melayu (Tulisan Jawi: بهاس ملايو; Rencong: ꤷꥁꤼ ꤸꥍꤾꤿꥈ) ialah sejenis bahasa Melayu-Polinesia di bawah keluarga bahasa Austronesia yang telah digunakan di wilayah Indonesia, Malaysia, dan persekitarannya sejak melebihi 1,000 tahun lalu.'
SENTENCE_MSA = 'Bahasa Melayu (Tulisan Jawi: بهاس ملايو; Rencong: ꤷꥁꤼ ꤸꥍꤾꤿꥈ) ialah salah satu daripada bahasa-bahasa Melayu-Polinesia di bawah keluarga bahasa Austronesia, yang merupakan bahasa rasmi di Brunei, Indonesia, Malaysia dan Singapura, serta dituturkan di Timor Leste dan sebahagian wilayah di Kemboja , Filipina dan Thailand.'
SENTENCE_MAL = TEXT_MAL[0]
SENTENCE_GLV = 'She Gaelg (graït: /gɪlg/) çhengey Ghaelagh Vannin.'
SENTENCE_MAR = 'मराठी भाषा ही इंडो-युरोपीय भाषाकुळातील एक भाषा आहे.'
Expand Down
2 changes: 1 addition & 1 deletion tests/wl_tests_nlp/test_lemmatization.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def test_lemmatize(lang, lemmatizer):
else:
tests_lang_util_skipped = True
elif lang == 'msa':
assert lemmas == ['bahasa', 'Melayu', '(', 'tulisan', 'Jawi', ':', 'bahasa', 'Melayu', ';', 'rencong', ':', 'ꤷꥁꤼ', 'ꤸꥍꤾꤿꥈ', ')', 'ialah', 'sejenis', 'bahasa', 'Melayu', '-', 'Polinesia', 'di', 'bawah', 'keluarga', 'bahasa', 'Austronesia', 'hiang', 'telah', 'digunakan', 'di', 'wilayah', 'Indonesia', ',', 'Malaysia', ',', 'دان', 'persekitaran', 'sejak', 'melebihi', '1,000', 'تاهون', 'lalu', '.']
assert lemmas == ['bahasa', 'Melayu', '(', 'tulisan', 'Jawi', ':', 'bahasa', 'Melayu', ';', 'rencong', ':', 'ꤷꥁꤼ', 'ꤸꥍꤾꤿꥈ', ')', 'ialah', 'salah', 'ساتو', 'daripada', 'bahasa', '-', 'bahasa', 'Melayu', '-', 'Polinesia', 'di', 'bawah', 'keluarga', 'bahasa', 'Austronesia', ',', 'hiang', 'merupakan', 'bahasa', 'rasmi', 'di', 'Brunei', ',', 'Indonesia', ',', 'Malaysia', 'دان', 'Singapura', ',', 'serta', 'dituturkan', 'di', 'timur', 'Leste', 'دان', 'sebahagian', 'wilayah', 'di', 'Kemboja', ',', 'Filipina', 'دان', 'Thailand', '.']
elif lang == 'glv':
assert lemmas == ['She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey', 'Gaelagh', 'Mannin', '.']
elif lang == 'nob':
Expand Down
2 changes: 2 additions & 0 deletions tests/wl_tests_nlp/test_word_tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ def test_word_tokenize(lang, word_tokenizer):
assert tokens == ["D'", 'Lëtzebuergesch', 'gëtt', 'an', 'der', 'däitscher', 'Dialektologie', 'als', 'ee', 'westgermaneschen', ',', 'mëtteldäitschen', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéiert', '.']
elif lang == 'mkd':
assert tokens == ['Македонски', 'јазик', '—', 'јужнословенски', 'јазик', ',', 'дел', 'од', 'групата', 'на', 'словенски', 'јазици', 'од', 'јазичното', 'семејство', 'на', 'индоевропски', 'јазици', '.']
elif lang == 'msa':
assert tokens == ['Bahasa', 'Melayu', '(', 'Tulisan', 'Jawi', ':', 'بهاس', 'ملايو', ';', 'Rencong', ':', 'ꤷꥁꤼ', 'ꤸꥍꤾꤿꥈ', ')', 'ialah', 'salah', 'satu', 'daripada', 'bahasa', '-', 'bahasa', 'Melayu', '-', 'Polinesia', 'di', 'bawah', 'keluarga', 'bahasa', 'Austronesia', ',', 'yang', 'merupakan', 'bahasa', 'rasmi', 'di', 'Brunei', ',', 'Indonesia', ',', 'Malaysia', 'dan', 'Singapura', ',', 'serta', 'dituturkan', 'di', 'Timor', 'Leste', 'dan', 'sebahagian', 'wilayah', 'di', 'Kemboja', ',', 'Filipina', 'dan', 'Thailand', '.']
elif lang == 'mal':
if word_tokenizer == 'sacremoses_moses':
assert tokens == ['ഇന്ത്യയിൽ', 'കേരള', 'സംസ്ഥാനത്തിലും', 'കേന്ദ്രഭരണപ്രദേശങ്ങളായ', 'ലക്ഷദ്വീപിലും', 'പോണ്ടിച്ചേരിയുടെ', 'ഭാഗമായ', 'മാഹിയിലും', 'തമിഴ്നാട്ടിലെ', 'കന്യാകുമാരി', 'ജില്ലയിലും', 'നീലഗിരി', 'ജില്ലയിലെ', 'ഗൂഡല്ലൂർ', 'താലൂക്കിലും', 'സംസാരിക്കപ്പെടുന്ന', 'ഭാഷയാണ്', 'മലയാളം', '.']
Expand Down
1 change: 1 addition & 0 deletions wordless/wl_settings/wl_settings_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,6 +1272,7 @@ def init_settings_default(main):
'lit': 'spacy_lit',
'ltz': 'spacy_ltz',
'mkd': 'spacy_mkd',
'msa': 'spacy_msa',
'mal': 'sacremoses_moses',
'mar': 'sacremoses_moses',
'mni': 'sacremoses_moses',
Expand Down
3 changes: 3 additions & 0 deletions wordless/wl_settings/wl_settings_global.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@ def init_settings_global():
_tr('init_settings_global', 'spaCy - Lithuanian word tokenizer'): 'spacy_lit',
_tr('init_settings_global', 'spaCy - Luxembourgish word tokenizer'): 'spacy_ltz',
_tr('init_settings_global', 'spaCy - Macedonian word tokenizer'): 'spacy_mkd',
_tr('init_settings_global', 'spaCy - Malay word tokenizer'): 'spacy_msa',
_tr('init_settings_global', 'spaCy - Malayalam word tokenizer'): 'spacy_mal',
_tr('init_settings_global', 'spaCy - Marathi word tokenizer'): 'spacy_mar',
_tr('init_settings_global', 'spaCy - Nepali word tokenizer'): 'spacy_nep',
Expand Down Expand Up @@ -1284,6 +1285,8 @@ def init_settings_global():
'spacy_mkd'
],

'msa': ['spacy_msa'],

'mal': [
'sacremoses_moses',
'spacy_mal'
Expand Down

0 comments on commit 6e8be5d

Please sign in to comment.