|
204 | 204 | SENT_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"] |
205 | 205 |
|
206 | 206 |
|
class DetokenizeTestCase(unittest.TestCase):
    """Detokenize and regrouping test cases"""

    def test_word_detokenize(self):
        # Default output format is a plain string.
        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)

        # "list" output format returns the regrouped tokens as a list of lists.
        self.assertEqual(
            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
        )

        # Each (tokens, expected) pair checks spacing rules around digits,
        # the Thai repetition mark "ๆ", and pre-existing space tokens.
        cases = [
            (["ผม", "เลี้ยง", "5", "ตัว"], "ผมเลี้ยง 5 ตัว"),
            (
                ["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"],
                "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
            ),
            (
                ["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"],
                "ผมเลี้ยง 5 ตัว ๆ คนดี",
            ),
            (
                ["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"],
                "ม่ายย ผมเลี้ยง 5 ตัว",
            ),
        ]
        for tokens, expected in cases:
            self.assertEqual(word_detokenize(tokens), expected)

    def test_numeric_data_format(self):
        for engine in ("newmm",):
            # IP addresses should survive tokenization as one token.
            self.assertIn(
                "127.0.0.1",
                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
            )

            # Clock times may or may not keep the am/pm suffix attached;
            # accept either grouping. Decimal numbers must stay intact.
            tokens = word_tokenize(
                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
            )
            self.assertTrue(
                any(value in tokens for value in ["12:12pm", "12:12"]),
                msg=f"{engine}: {tokens}",
            )
            self.assertIn("11.11", tokens)

            # Thousands separators and the decimal point stay in one token.
            self.assertIn(
                "1,234,567.89",
                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
            )

            # Colon-separated ratios, with and without decimals.
            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
            self.assertIn("2.5:1", tokens)
            self.assertIn("5:2", tokens)
| 259 | + |
207 | 260 | class TokenizeTestCase(unittest.TestCase): |
208 | 261 | def test_Tokenizer(self): |
209 | 262 | _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE) |
@@ -550,56 +603,3 @@ def test_tcc_p(self): |
550 | 603 | # ) |
551 | 604 | self.assertEqual(list(tcc_p.tcc("")), []) |
552 | 605 | self.assertEqual(tcc_p.tcc_pos(""), set()) |
553 | | - |
554 | | - |
555 | | -class DetokenizeTestCase(unittest.TestCase): |
556 | | - """Detokenize and regrouping test cases""" |
557 | | - |
558 | | - def test_word_detokenize(self): |
559 | | - self.assertIsInstance(word_detokenize(["ผม", "5"]), str) |
560 | | - self.assertEqual( |
561 | | - word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว" |
562 | | - ) |
563 | | - self.assertEqual( |
564 | | - word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"), |
565 | | - [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]], |
566 | | - ) |
567 | | - self.assertEqual( |
568 | | - word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]), |
569 | | - "ผมเลี้ยง 5 10 ตัว ๆ คนดี", |
570 | | - ) |
571 | | - self.assertEqual( |
572 | | - word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]), |
573 | | - "ผมเลี้ยง 5 ตัว ๆ คนดี", |
574 | | - ) |
575 | | - self.assertEqual( |
576 | | - word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]), |
577 | | - "ม่ายย ผมเลี้ยง 5 ตัว", |
578 | | - ) |
579 | | - |
580 | | - def test_numeric_data_format(self): |
581 | | - engines = ["newmm"] |
582 | | - |
583 | | - for engine in engines: |
584 | | - self.assertIn( |
585 | | - "127.0.0.1", |
586 | | - word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine), |
587 | | - ) |
588 | | - |
589 | | - tokens = word_tokenize( |
590 | | - "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine |
591 | | - ) |
592 | | - self.assertTrue( |
593 | | - any(value in tokens for value in ["12:12pm", "12:12"]), |
594 | | - msg=f"{engine}: {tokens}", |
595 | | - ) |
596 | | - self.assertIn("11.11", tokens) |
597 | | - |
598 | | - self.assertIn( |
599 | | - "1,234,567.89", |
600 | | - word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine), |
601 | | - ) |
602 | | - |
603 | | - tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine) |
604 | | - self.assertIn("2.5:1", tokens) |
605 | | - self.assertIn("5:2", tokens) |
0 commit comments