Skip to content

Commit

Permalink
Utils: Update Wordless's sentence and sentence segment splitters
Browse files Browse the repository at this point in the history
  • Loading branch information
BLKSerene committed Jul 23, 2023
1 parent e47a52f commit 632c216
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 7 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
- Utils: Add spaCy's Malay word tokenizer
- Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser

### ✨ Improvements
- Utils: Update Wordless's sentence and sentence segment splitters

### ❌ Removals
- Utils: Remove PyThaiNLP's perceptron part-of-speech tagger (LST20)

Expand Down
4 changes: 2 additions & 2 deletions tests/wl_tests_nlp/test_sentence_tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def test_sentence_split(lang):
text = ''.join(getattr(wl_test_lang_examples, f'TEXT_{lang.upper()}'))
)

if lang not in ['zho_cn', 'zho_tw', 'jpn', 'tha', 'bod', 'khm']:
if lang not in ['zho_cn', 'zho_tw', 'jpn', 'tha', 'bod']:
assert len(sentences_split) > 1

@pytest.mark.parametrize('lang', test_langs)
Expand Down Expand Up @@ -355,7 +355,7 @@ def test_sentence_seg_tokenize_tokens(lang):
)
sentence_segs = wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens(main, tokens)

if lang not in ['tha', 'khm']:
if lang not in ['tha']:
assert len(sentence_segs) > 1

if __name__ == '__main__':
Expand Down
138 changes: 133 additions & 5 deletions wordless/wl_nlp/wl_sentence_tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,67 @@ def wl_sentence_tokenize(main, text, lang, sentence_tokenizer = 'default'):

return sentences

# Reference:
# References:
# https://stackoverflow.com/questions/9506869/are-there-character-collections-for-all-international-full-stop-punctuations/9508766#9508766
# https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5B%3ATerminal_Punctuation%3DYes%3A%5D%26%5B%3ASentence_Break%3D%2F%5BAS%5DTerm%2F%3A%5D%5D&g=&i=
# https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ATerminal_Punctuation%CE%B2%3DYes%3A%5D%26%5B%3ASentence_Break%CE%B2%3D%2F%5BAS%5DTerm%2F%3A%5D&g=&i=
SENTENCE_TERMINATORS = {
'\u0021', '\u002E', '\u003F', '\u0589', '\u061D', '\u061E', '\u061F', '\u06D4', '\u0700', '\u0701', '\u0702', '\u07F9', '\u0837', '\u0839', '\u083D', '\u083E', '\u0964', '\u0965', '\u104A', '\u104B', '\u1362', '\u1367', '\u1368', '\u166E', '\u1735', '\u1736', '\u1803', '\u1809', '\u1944', '\u1945', '\u1AA8', '\u1AA9', '\u1AAA', '\u1AAB', '\u1B5A', '\u1B5B', '\u1B5E', '\u1B5F', '\u1B7D', '\u1B7E', '\u1C3B', '\u1C3C', '\u1C7E', '\u1C7F', '\u203C', '\u2047', '\u2048', '\u2049', '\u203D', '\u2E2E', '\u2E53', '\u2E54', '\u2E3C', '\u3002', '\uA4FF', '\uA60E', '\uA60F', '\uA6F3', '\uA6F7', '\uA876', '\uA877', '\uA8CE', '\uA8CF', '\uA92F', '\uA9C8', '\uA9C9', '\uAA5D', '\uAA5E', '\uAA5F', '\uAAF0', '\uAAF1', '\uABEB', '\uFE52', '\uFE56', '\uFE57', '\uFF01', '\uFF0E', '\uFF1F', '\uFF61', '\U00010A56', '\U00010A57', '\U00010F55', '\U00010F56', '\U00010F57', '\U00010F58', '\U00010F59', '\U00010F86', '\U00010F87', '\U00010F88', '\U00010F89', '\U00011047', '\U00011048', '\U000110BE', '\U000110BF', '\U000110C0', '\U000110C1', '\U00011141', '\U00011142', '\U00011143', '\U000111C5', '\U000111C6', '\U000111CD', '\U000111DE', '\U000111DF', '\U00011238', '\U00011239', '\U0001123B', '\U0001123C', '\U000112A9', '\U0001144B', '\U0001144C', '\U000115C2', '\U000115C3', '\U000115C9', '\U000115CA', '\U000115CB', '\U000115CC', '\U000115CD', '\U000115CE', '\U000115CF', '\U000115D0', '\U000115D1', '\U000115D2', '\U000115D3', '\U000115D4', '\U000115D5', '\U000115D6', '\U000115D7', '\U00011641', '\U00011642', '\U0001173C', '\U0001173D', '\U0001173E', '\U00011944', '\U00011946', '\U00011A42', '\U00011A43', '\U00011A9B', '\U00011A9C', '\U00011C41', '\U00011C42', '\U00011EF7', '\U00011EF8', '\U00016A6E', '\U00016A6F', '\U00016AF5', '\U00016B37', '\U00016B38', '\U00016B44', '\U00016E98', '\U0001BC9F', '\U0001DA88'
'\u0021', '\u002E', '\u003F',
'\u0589',
'\u061D', '\u061E', '\u061F', '\u06D4',
'\u0700', '\u0701', '\u0702',
'\u07F9',
'\u0837', '\u0839', '\u083D', '\u083E',
'\u0964', '\u0965',
'\u104A', '\u104B',
'\u1362', '\u1367', '\u1368',
'\u166E',
'\u1735', '\u1736',
'\u17D4', '\u17D5',
'\u1803', '\u1809',
'\u1944', '\u1945',
'\u1AA8', '\u1AA9', '\u1AAA', '\u1AAB',
'\u1B5A', '\u1B5B', '\u1B5E', '\u1B5F', '\u1B7D', '\u1B7E',
'\u1C3B', '\u1C3C',
'\u1C7E', '\u1C7F',
'\u203C', '\u2047', '\u2048', '\u2049', '\u203D',
'\u2E2E', '\u2E53', '\u2E54', '\u2E3C',
'\u3002',
'\uA4FF',
'\uA60E', '\uA60F',
'\uA6F3', '\uA6F7',
'\uA876', '\uA877',
'\uA8CE', '\uA8CF',
'\uA92F',
'\uA9C8', '\uA9C9',
'\uAA5D', '\uAA5E', '\uAA5F',
'\uAAF0', '\uAAF1', '\uABEB',
'\uFE52', '\uFE56', '\uFE57',
'\uFF01', '\uFF0E', '\uFF1F', '\uFF61',
'\U00010A56', '\U00010A57',
'\U00010F55', '\U00010F56', '\U00010F57', '\U00010F58', '\U00010F59',
'\U00010F86', '\U00010F87', '\U00010F88', '\U00010F89',
'\U00011047', '\U00011048',
'\U000110BE', '\U000110BF', '\U000110C0', '\U000110C1',
'\U00011141', '\U00011142', '\U00011143',
'\U000111C5', '\U000111C6', '\U000111CD', '\U000111DE', '\U000111DF',
'\U00011238', '\U00011239', '\U0001123B', '\U0001123C',
'\U000112A9',
'\U0001144B', '\U0001144C',
'\U000115C2', '\U000115C3', '\U000115C9', '\U000115CA', '\U000115CB', '\U000115CC', '\U000115CD', '\U000115CE', '\U000115CF', '\U000115D0', '\U000115D1', '\U000115D2', '\U000115D3', '\U000115D4', '\U000115D5', '\U000115D6', '\U000115D7',
'\U00011641', '\U00011642',
'\U0001173C', '\U0001173D', '\U0001173E',
'\U00011944', '\U00011946',
'\U00011A42', '\U00011A43',
'\U00011A9B', '\U00011A9C',
'\U00011C41', '\U00011C42',
'\U00011EF7', '\U00011EF8',
'\U00011F43', '\U00011F44',
'\U00016A6E', '\U00016A6F',
'\U00016AF5',
'\U00016B37', '\U00016B38', '\U00016B44',
'\U00016E98',
'\U0001BC9F',
'\U0001DA88'
}
RE_SENTENCE_TERMINATORS = ''.join(SENTENCE_TERMINATORS)

Expand All @@ -146,9 +202,81 @@ def wl_sentence_split(main, text): # pylint: disable=unused-argument
for sentence in re.findall(fr'.+?[{RE_SENTENCE_TERMINATORS}]+\s+|.+?$', text.strip())
]

# Reference: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Terminal_Punctuation=Yes:]
# Reference: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Terminal_Punctuation%CE%B2=Yes:]
SENTENCE_SEG_TERMINATORS = {
'\u0021', '\u002C', '\u002E', '\u003A', '\u003B', '\u003F', '\u037E', '\u0387', '\u0589', '\u05C3', '\u060C', '\u061B', '\u061D', '\u061E', '\u061F', '\u06D4', '\u0700', '\u0701', '\u0702', '\u0703', '\u0704', '\u0705', '\u0706', '\u0707', '\u0708', '\u0709', '\u070A', '\u070C', '\u07F8', '\u07F9', '\u0830', '\u0831', '\u0832', '\u0833', '\u0834', '\u0835', '\u0836', '\u0837', '\u0838', '\u0839', '\u083A', '\u083B', '\u083C', '\u083D', '\u083E', '\u085E', '\u0964', '\u0965', '\u0E5A', '\u0E5B', '\u0F08', '\u0F0D', '\u0F0E', '\u0F0F', '\u0F10', '\u0F11', '\u0F12', '\u104A', '\u104B', '\u1361', '\u1362', '\u1363', '\u1364', '\u1365', '\u1366', '\u1367', '\u1368', '\u166E', '\u16EB', '\u16EC', '\u16ED', '\u1735', '\u1736', '\u17D4', '\u17D5', '\u17D6', '\u17DA', '\u1802', '\u1803', '\u1804', '\u1805', '\u1808', '\u1809', '\u1944', '\u1945', '\u1AA8', '\u1AA9', '\u1AAA', '\u1AAB', '\u1B5A', '\u1B5B', '\u1B5D', '\u1B5E', '\u1B5F', '\u1B7D', '\u1B7E', '\u1C3B', '\u1C3C', '\u1C3D', '\u1C3E', '\u1C3F', '\u1C7E', '\u1C7F', '\u203C', '\u2047', '\u2048', '\u2049', '\u203D', '\u2E2E', '\u2E4C', '\u2E4E', '\u2E4F', '\u2E53', '\u2E54', '\u2E3C', '\u2E41', '\u3001', '\u3002', '\uA4FE', '\uA4FF', '\uA60D', '\uA60E', '\uA60F', '\uA6F3', '\uA6F4', '\uA6F5', '\uA6F6', '\uA6F7', '\uA876', '\uA877', '\uA8CE', '\uA8CF', '\uA92F', '\uA9C7', '\uA9C8', '\uA9C9', '\uAA5D', '\uAA5E', '\uAA5F', '\uAADF', '\uAAF0', '\uAAF1', '\uABEB', '\uFE50', '\uFE51', '\uFE52', '\uFE54', '\uFE55', '\uFE56', '\uFE57', '\uFF01', '\uFF0C', '\uFF0E', '\uFF1A', '\uFF1B', '\uFF1F', '\uFF61', '\uFF64', '\U0001039F', '\U000103D0', '\U00010857', '\U0001091F', '\U00010A56', '\U00010A57', '\U00010AF0', '\U00010AF1', '\U00010AF2', '\U00010AF3', '\U00010AF4', '\U00010AF5', '\U00010B3A', '\U00010B3B', '\U00010B3C', '\U00010B3D', '\U00010B3E', '\U00010B3F', '\U00010B99', '\U00010B9A', '\U00010B9B', '\U00010B9C', '\U00010F55', '\U00010F56', '\U00010F57', '\U00010F58', '\U00010F59', '\U00010F86', '\U00010F87', '\U00010F88', '\U00010F89', '\U00011047', '\U00011048', '\U00011049', '\U0001104A', '\U0001104B', '\U0001104C', '\U0001104D', '\U000110BE', '\U000110BF', '\U000110C0', '\U000110C1', '\U00011141', '\U00011142', '\U00011143', '\U000111C5', '\U000111C6', '\U000111CD', '\U000111DE', '\U000111DF', '\U00011238', '\U00011239', '\U0001123A', '\U0001123B', '\U0001123C', '\U000112A9', '\U0001144B', '\U0001144C', '\U0001144D', '\U0001145A', '\U0001145B', '\U000115C2', '\U000115C3', '\U000115C4', '\U000115C5', '\U000115C9', '\U000115CA', '\U000115CB', '\U000115CC', '\U000115CD', '\U000115CE', '\U000115CF', '\U000115D0', '\U000115D1', '\U000115D2', '\U000115D3', '\U000115D4', '\U000115D5', '\U000115D6', '\U000115D7', '\U00011641', '\U00011642', '\U0001173C', '\U0001173D', '\U0001173E', '\U00011944', '\U00011946', '\U00011A42', '\U00011A43', '\U00011A9B', '\U00011A9C', '\U00011AA1', '\U00011AA2', '\U00011C41', '\U00011C42', '\U00011C43', '\U00011C71', '\U00011EF7', '\U00011EF8', '\U00012470', '\U00012471', '\U00012472', '\U00012473', '\U00012474', '\U00016A6E', '\U00016A6F', '\U00016AF5', '\U00016B37', '\U00016B38', '\U00016B39', '\U00016B44', '\U00016E97', '\U00016E98', '\U0001BC9F', '\U0001DA87', '\U0001DA88', '\U0001DA89', '\U0001DA8A'
'\u0021', '\u002C', '\u002E', '\u003A', '\u003B', '\u003F',
'\u037E', '\u0387',
'\u0589',
'\u05C3',
'\u060C', '\u061B', '\u061D', '\u061E', '\u061F', '\u06D4',
'\u0700', '\u0701', '\u0702', '\u0703', '\u0704', '\u0705', '\u0706', '\u0707', '\u0708', '\u0709', '\u070A', '\u070C',
'\u07F8', '\u07F9',
'\u0830', '\u0831', '\u0832', '\u0833', '\u0834', '\u0835', '\u0836', '\u0837', '\u0838', '\u0839', '\u083A', '\u083B', '\u083C', '\u083D', '\u083E',
'\u085E',
'\u0964', '\u0965',
'\u0E5A', '\u0E5B',
'\u0F08', '\u0F0D', '\u0F0E', '\u0F0F', '\u0F10', '\u0F11', '\u0F12',
'\u104A', '\u104B',
'\u1361', '\u1362', '\u1363', '\u1364', '\u1365', '\u1366', '\u1367', '\u1368',
'\u166E',
'\u16EB', '\u16EC', '\u16ED',
'\u1735', '\u1736',
'\u17D4', '\u17D5', '\u17D6', '\u17DA',
'\u1802', '\u1803', '\u1804', '\u1805', '\u1808', '\u1809',
'\u1944', '\u1945',
'\u1AA8', '\u1AA9', '\u1AAA', '\u1AAB',
'\u1B5A', '\u1B5B', '\u1B5D', '\u1B5E', '\u1B5F', '\u1B7D', '\u1B7E',
'\u1C3B', '\u1C3C', '\u1C3D', '\u1C3E', '\u1C3F',
'\u1C7E', '\u1C7F',
'\u203C', '\u2047', '\u2048', '\u2049', '\u203D',
'\u2E2E', '\u2E4C', '\u2E4E', '\u2E4F', '\u2E53', '\u2E54', '\u2E3C', '\u2E41',
'\u3001', '\u3002',
'\uA4FE', '\uA4FF',
'\uA60D', '\uA60E', '\uA60F',
'\uA6F3', '\uA6F4', '\uA6F5', '\uA6F6', '\uA6F7',
'\uA876', '\uA877',
'\uA8CE', '\uA8CF',
'\uA92F',
'\uA9C7', '\uA9C8', '\uA9C9',
'\uAA5D', '\uAA5E', '\uAA5F',
'\uAADF',
'\uAAF0', '\uAAF1', '\uABEB',
'\uFE50', '\uFE51', '\uFE52', '\uFE54', '\uFE55', '\uFE56', '\uFE57',
'\uFF01', '\uFF0C', '\uFF0E', '\uFF1A', '\uFF1B', '\uFF1F', '\uFF61', '\uFF64',
'\U0001039F',
'\U000103D0',
'\U00010857',
'\U0001091F',
'\U00010A56', '\U00010A57',
'\U00010AF0', '\U00010AF1', '\U00010AF2', '\U00010AF3', '\U00010AF4', '\U00010AF5',
'\U00010B3A', '\U00010B3B', '\U00010B3C', '\U00010B3D', '\U00010B3E', '\U00010B3F',
'\U00010B99', '\U00010B9A', '\U00010B9B', '\U00010B9C',
'\U00010F55', '\U00010F56', '\U00010F57', '\U00010F58', '\U00010F59',
'\U00010F86', '\U00010F87', '\U00010F88', '\U00010F89',
'\U00011047', '\U00011048', '\U00011049', '\U0001104A', '\U0001104B', '\U0001104C', '\U0001104D',
'\U000110BE', '\U000110BF', '\U000110C0', '\U000110C1',
'\U00011141', '\U00011142', '\U00011143',
'\U000111C5', '\U000111C6', '\U000111CD', '\U000111DE', '\U000111DF',
'\U00011238', '\U00011239', '\U0001123A', '\U0001123B', '\U0001123C',
'\U000112A9',
'\U0001144B', '\U0001144C', '\U0001144D', '\U0001145A', '\U0001145B',
'\U000115C2', '\U000115C3', '\U000115C4', '\U000115C5', '\U000115C9', '\U000115CA', '\U000115CB', '\U000115CC', '\U000115CD', '\U000115CE', '\U000115CF', '\U000115D0', '\U000115D1', '\U000115D2', '\U000115D3', '\U000115D4', '\U000115D5', '\U000115D6', '\U000115D7',
'\U00011641', '\U00011642',
'\U0001173C', '\U0001173D', '\U0001173E',
'\U00011944', '\U00011946',
'\U00011A42', '\U00011A43',
'\U00011A9B', '\U00011A9C', '\U00011AA1', '\U00011AA2',
'\U00011C41', '\U00011C42', '\U00011C43',
'\U00011C71',
'\U00011EF7', '\U00011EF8',
'\U00011F43', '\U00011F44',
'\U00012470', '\U00012471', '\U00012472', '\U00012473', '\U00012474',
'\U00016A6E', '\U00016A6F',
'\U00016AF5',
'\U00016B37', '\U00016B38', '\U00016B39', '\U00016B44',
'\U00016E97', '\U00016E98',
'\U0001BC9F',
'\U0001DA87', '\U0001DA88', '\U0001DA89', '\U0001DA8A'
}
RE_SENTENCE_SEG_TERMINATORS = ''.join(SENTENCE_SEG_TERMINATORS)

Expand Down

0 comments on commit 632c216

Please sign in to comment.