Commit 1601e99

Merge pull request #981 from bact/add-nlpo3-to-compact
Add documentation about compact install option
2 parents 93ecd91 + 5c48bb0

File tree

7 files changed: +99 −79 lines changed

CONTRIBUTING.md

Lines changed: 7 additions & 4 deletions

@@ -44,7 +44,7 @@ so it may be a good idea to familiarize yourself with it.
 
 - We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-model/)
   to manage our branches.
-- When you create pull requests on GitHub, Github Actions and AppVeyor will run tests
+- When you create pull requests on GitHub, GitHub Actions will run tests
   and several checks automatically. Click the "Details" link at the end of
   each check to see what needs to be fixed.

@@ -66,7 +66,7 @@ To run unit tests locally together with code coverage test:
 (from main `pythainlp/` directory)
 
 ```sh
-coverage run -m unittest discover
+coverage run -m unittest tests.core
 ```
 
 See code coverage test:

@@ -75,13 +75,16 @@ See code coverage test:
 coverage report
 ```
 
-Generate code coverage test in HTML (files will be available in `htmlcov/` directory):
+Generate code coverage test in HTML
+(files will be available in `htmlcov/` directory):
 
 ```sh
 coverage html
 ```
 
-Make sure the tests pass on both Github Actions and AppVeyor.
+Make sure the tests pass on GitHub Actions.
+
+See more in [tests/README.md](./tests/README.md)
 
 ## Releasing
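For contributors who prefer driving the same run from Python, here is a rough equivalent of `coverage run -m unittest tests.core` followed by `coverage report`. This is only a sketch, assuming the `coverage` package used by the commands above and the `tests.core` package layout referenced in this commit:

```python
import unittest

import coverage  # the package behind the `coverage` CLI used above

cov = coverage.Coverage()
cov.start()

# Load and run the core test group, as `-m unittest tests.core` does.
suite = unittest.defaultTestLoader.loadTestsFromName("tests.core")
unittest.TextTestRunner(verbosity=2).run(suite)

cov.stop()
cov.save()
cov.report()  # same summary as `coverage report`
```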

README.md

Lines changed: 16 additions & 1 deletion

@@ -76,6 +76,7 @@ pip install pythainlp[extra1,extra2,...]
 Possible `extras`:
 
 - `full` (install everything)
+- `compact` (install a stable and small subset of dependencies)
 - `attacut` (to support attacut, a fast and accurate tokenizer)
 - `benchmarks` (for [word tokenization benchmarking](tokenization-benchmark.md))
 - `icu` (for ICU, International Components for Unicode, support in transliteration and tokenization)

@@ -85,7 +86,8 @@ Possible `extras`:
 - `thai2rom` (for machine-learnt romanization)
 - `wordnet` (for Thai WordNet API)
 
-For dependency details, look at the `extras` variable in [`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
+For dependency details, look at the `extras` variable in
+[`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
 
 ## Data Directory
 

@@ -110,6 +112,19 @@ To show how to use:
 thainlp help
 ```
 
+## Testing and test suites
+
+We test core functionalities on all officially supported Python versions.
+
+Some functionality requiring extra dependencies may be tested less frequently
+due to potential version conflicts or incompatibilities between packages.
+
+Test cases are categorized into three groups: core, compact, and extra.
+You can find these tests in the [tests/](/tests/) directory.
+
+For more detailed information on testing, please refer to the tests README:
+[tests/README.md](./tests/README.md)
+
 ## Licenses
 
 | | License |
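Since the new `compact` extra is documented above, a quick post-install sanity check may help. This is a minimal sketch, assuming `pip install pythainlp[compact]` has been run and that the extra now pulls in `nlpo3`, as the source branch name `bact/add-nlpo3-to-compact` suggests:

```python
import importlib.util

# Hypothetical check, not part of the repository: confirm that the
# package itself and nlpo3 (newly added to the compact set) resolve.
for module in ("pythainlp", "nlpo3"):
    found = importlib.util.find_spec(module) is not None
    print(f"{module}: {'ok' if found else 'missing'}")
```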

README_TH.md

Lines changed: 4 additions & 2 deletions

@@ -2,10 +2,9 @@
 <img src="https://avatars0.githubusercontent.com/u/32934255?s=200&v=4"/>
 <h1>PyThaiNLP: Thai Natural Language Processing in Python</h1>
 <a href="https://pypi.python.org/pypi/pythainlp"><img alt="pypi" src="https://img.shields.io/pypi/v/pythainlp.svg"/></a>
-<a href="https://www.python.org/downloads/release/python-370/"><img alt="Python 3.7" src="https://img.shields.io/badge/python-3.7-blue.svg"/></a>
+<a href="https://www.python.org/downloads/"><img alt="Python 3.9" src="https://img.shields.io/badge/python-3.9-blue.svg"/></a>
 <a href="https://opensource.org/licenses/Apache-2.0"><img alt="License" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg"/></a>
 <a href="https://pepy.tech/project/pythainlp"><img alt="Download" src="https://pepy.tech/badge/pythainlp/month"/></a>
-<a href="https://ci.appveyor.com/project/wannaphongcom/pythainlp-9y1ch"><img alt="Build status" src="https://ci.appveyor.com/api/projects/status/9g3mfcwchi8em40x?svg=true"/></a>
 <a href="https://coveralls.io/github/PyThaiNLP/pythainlp?branch=dev"><img alt="Coverage Status" src="https://coveralls.io/repos/github/PyThaiNLP/pythainlp/badge.svg?branch=dev"/></a>
 <a href="https://www.codacy.com/app/pythainlp/pythainlp_2"><img alt="Codacy Badge" src="https://api.codacy.com/project/badge/Grade/cb946260c87a4cc5905ca608704406f7"/></a>
 <a href="https://app.fossa.io/projects/git%2Bgithub.com%2FPyThaiNLP%2Fpythainlp"><img alt="FOSSA Status" src="https://app.fossa.io/api/projects/git%2Bgithub.com%2FPyThaiNLP%2Fpythainlp.svg?type=shield"/></a>

@@ -51,6 +50,7 @@ PyThaiNLP มีความสามารถพื้นฐานสำหร
 - Thai datetime formatting (`thai_strftime`)
 - Thai-English keyboard misswitched fix (`eng_to_thai`, `thai_to_eng`)
 - Command-line interface for basic functions, like tokenization and pos tagging (run `thainlp` in your shell)
+
 </details>
 
 อ่านรายละเอียดได้ที่ [tutorials](https://pythainlp.org/tutorials)

@@ -82,6 +82,7 @@ pip install pythainlp[extra1,extra2,...]
 <summary>รายการสำหรับติดตั้งผ่าน <code>extras</code></summary>
 
 - `full` (ติดตั้งทุกอย่าง)
+- `compact` (ติดตั้งไลบารีชุดเล็กที่ทดสอบแล้วว่าไม่ตีกันเองและติดตั้งได้ในทุกระบบปฏิบัติการ)
 - `attacut` (เพื่อสนับสนุน attacut ซึ่งเป็นตัวตัดคำที่ทำงานได้รวดเร็วและมีประสิทธิภาพ)
 - `benchmarks` (สำหรับ [word tokenization benchmarking](tokenization-benchmark.md))
 - `icu` (สำหรับการรองรับ ICU หรือ International Components for Unicode ในการถอดเสียงเป็นอักษรและการตัดแบ่งคำ)

@@ -90,6 +91,7 @@ pip install pythainlp[extra1,extra2,...]
 - `thai2fit` (สำหรับ Thai word vector)
 - `thai2rom` (สำหรับการถอดอักษรไทยเป็นอักษรโรมัน)
 - `wordnet` (สำหรับ Thai WordNet API)
+
 </details>
 
 สำหรับโมดูลที่ต้องการ สามารถดูรายละเอียดได้ที่ตัวแปร `extras` ใน [`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).

setup.py

Lines changed: 2 additions & 2 deletions

@@ -67,7 +67,7 @@
     "ipa": ["epitran>=1.1"],
     "ml": ["numpy>=1.22", "torch>=1.0.0"],
     "mt5": ["sentencepiece>=0.1.91", "transformers>=4.6.0"],
-    "nlpo3": ["nlpo3>=1.2.2"],
+    "nlpo3": ["nlpo3>=1.3.0"],
     "onnx": ["numpy>=1.22", "onnxruntime>=1.10.0", "sentencepiece>=0.1.91"],
     "oskut": ["oskut>=1.3"],
     "sefr_cut": ["sefr_cut>=1.1"],

@@ -119,7 +119,7 @@
     "fastcoref>=2.1.5",
     "gensim>=4.0.0",
     "khamyo>=0.2.0",
-    "nlpo3>=1.2.2",
+    "nlpo3>=1.3.0",
     "nltk>=3.3",
     "numpy>=1.22",
     "onnxruntime>=1.10.0",

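The bump above raises the version floor for the `nlpo3` tokenizer binding in both the `nlpo3` extra and what appears to be the `full` dependency list. A hypothetical post-install check, not part of the repository, to confirm the new floor takes effect:

```python
from importlib.metadata import version

# After e.g. `pip install "pythainlp[nlpo3]"`, pip must resolve a
# version satisfying the new floor pinned above.
print(version("nlpo3"))  # expected: 1.3.0 or newer
```
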
tests/README.md

Lines changed: 4 additions & 4 deletions

@@ -14,10 +14,10 @@ Tests are categorized into three groups: core, compact, and extra.
 ## Compact Tests (testc_*.py)
 
 - Run `unittest tests.compact`
-- Test a limited set of additional functionalities that rely on optional
-  dependencies specified in `requirements.txt`.
-- These dependencies are `PyYAML`, `numpy`, `pyicu`, `python-crfsuite`, and
-  `requests`.
+- Test a limited set of functionalities that rely on a stable and small subset
+  of optional dependencies specified in `requirements.txt`.
+- These dependencies are `PyYAML`, `numpy`, `pyicu`,
+  `python-crfsuite`, and `requests`.
 - Test with the latest two stable Python versions.
 
 ## Extra Tests (testx_*.py)
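For illustration, the compact group can also be collected by its file-name prefix rather than by package name. A minimal sketch, assuming the `tests/compact/testc_*.py` layout shown in this commit:

```python
import unittest

# Equivalent in spirit to `python -m unittest tests.compact`:
# discover only the compact test files by their testc_ prefix and run them.
suite = unittest.defaultTestLoader.discover(
    start_dir="tests/compact", pattern="testc_*.py"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```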

tests/compact/testc_tokenize.py

Lines changed: 13 additions & 13 deletions

@@ -25,19 +25,6 @@
 )
 
 
-class WordTokenizeICUTestCase(unittest.TestCase):
-    def test_icu(self):
-        self.assertEqual(pyicu.segment(None), [])
-        self.assertEqual(pyicu.segment(""), [])
-        self.assertEqual(
-            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
-            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
-        )
-
-    def test_word_tokenize_icu(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
-
-
 class SentTokenizeCRFCutTestCase(unittest.TestCase):
     def test_sent_tokenize(self):
         # Use default engine (crfcut)

@@ -88,3 +75,16 @@ def test_subword_tokenize(self):
         self.assertNotIn(
             "า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
         )
+
+
+class WordTokenizeICUTestCase(unittest.TestCase):
+    def test_icu(self):
+        self.assertEqual(pyicu.segment(None), [])
+        self.assertEqual(pyicu.segment(""), [])
+        self.assertEqual(
+            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
+            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
+        )
+
+    def test_word_tokenize_icu(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
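The diff only moves the class to the end of the file; the asserted behavior is unchanged. For context, a minimal usage sketch of what the relocated test exercises, assuming PyICU is installed (the `pyicu` dependency of the compact test group):

```python
from pythainlp.tokenize import word_tokenize

# ICU-based word segmentation, as asserted in WordTokenizeICUTestCase.
print(word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"))
# per the test: ['ฉัน', 'รัก', 'ภาษา', 'ไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คน', 'ไทย']
```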

tests/core/test_tokenize.py

Lines changed: 53 additions & 53 deletions

@@ -204,6 +204,59 @@
 SENT_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
 
 
+class DetokenizeTestCase(unittest.TestCase):
+    """Detokenize and regrouping test cases"""
+
+    def test_word_detokenize(self):
+        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
+            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]),
+            "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]),
+            "ผมเลี้ยง 5 ตัว ๆ คนดี",
+        )
+        self.assertEqual(
+            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
+            "ม่ายย ผมเลี้ยง 5 ตัว",
+        )
+
+    def test_numeric_data_format(self):
+        engines = ["newmm"]
+
+        for engine in engines:
+            self.assertIn(
+                "127.0.0.1",
+                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
+            )
+
+            tokens = word_tokenize(
+                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
+            )
+            self.assertTrue(
+                any(value in tokens for value in ["12:12pm", "12:12"]),
+                msg=f"{engine}: {tokens}",
+            )
+            self.assertIn("11.11", tokens)
+
+            self.assertIn(
+                "1,234,567.89",
+                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
+            )
+
+            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
+            self.assertIn("2.5:1", tokens)
+            self.assertIn("5:2", tokens)
+
+
 class TokenizeTestCase(unittest.TestCase):
     def test_Tokenizer(self):
         _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE)

@@ -550,56 +603,3 @@ def test_tcc_p(self):
         # )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
-
-
-class DetokenizeTestCase(unittest.TestCase):
-    """Detokenize and regrouping test cases"""
-
-    def test_word_detokenize(self):
-        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
-            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]),
-            "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]),
-            "ผมเลี้ยง 5 ตัว ๆ คนดี",
-        )
-        self.assertEqual(
-            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
-            "ม่ายย ผมเลี้ยง 5 ตัว",
-        )
-
-    def test_numeric_data_format(self):
-        engines = ["newmm"]
-
-        for engine in engines:
-            self.assertIn(
-                "127.0.0.1",
-                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
-            )
-
-            tokens = word_tokenize(
-                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
-            )
-            self.assertTrue(
-                any(value in tokens for value in ["12:12pm", "12:12"]),
-                msg=f"{engine}: {tokens}",
-            )
-            self.assertIn("11.11", tokens)
-
-            self.assertIn(
-                "1,234,567.89",
-                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
-            )
-
-            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
-            self.assertIn("2.5:1", tokens)
-            self.assertIn("5:2", tokens)
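Again a pure move, reordering classes within the file. For context, a short usage sketch of the behavior the relocated tests pin down (imports assumed from `pythainlp.tokenize`, as in the test module itself):

```python
from pythainlp.tokenize import word_detokenize, word_tokenize

# Detokenization re-inserts the spaces Thai orthography requires
# around digits, per test_word_detokenize above.
print(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]))  # ผมเลี้ยง 5 ตัว

# The newmm engine keeps numeric formats (IPs, decimals, ratios)
# as single tokens, per test_numeric_data_format above.
print(word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine="newmm"))
```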
