v0.3.0

hscspring · hscspring · commit c3ed63e49986 · 2020-07-09T12:41:23.000+08:00
diff --git a/README.md b/README.md
@@ -4,7 +4,11 @@ This is a pre-processing tool for NLP.
 ## Features
 
 - a flexible pipe line for text io
-- a flexible tool for text clean and extract and kinds of length
+- a flexible tool for text cleaning and extraction
+- sentence cutting and Chinese character cutting
+- Chinese character normalization
+- various kinds of text length
+- stopwords
 - some magic usage in pre-processing
 
 ## Install
@@ -215,19 +219,27 @@ print(sent_list)
 ```python
 # Cut to Chinese chars
 from pnlp import cut_zhchar
-text = "你好，hello, 520 = ”我爱你“。"
+text = "你好，hello, 520 i love u. = ”我爱你“。"
 char_list = cut_zhchar(text)
 print(char_list)
 """
-['你', '好', '，', 'hello', ',', ' ', '520', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。']
+['你', '好', '，', 'hello', ',', ' ', '520', ' ', 'i', ' ', 'love', ' ', 'u', '.', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。']
 """
 char_list = cut_zhchar(text, remove_blank=True)
 print(char_list)
 """
-['你', '好', '，', 'hello', ',', '520', '=', '”', '我', '爱', '你', '“', '。']
+['你', '好', '，', 'hello', ',', '520', 'i', 'love', 'u', '.', '=', '”', '我', '爱', '你', '“', '。']
 """
 ```
 
+### Normalization
+
+```python
+from pnlp import num_norm
+num_norm.num2zh(1024) == "一千零二十四"
+num_norm.num2zh(1024).to_money() == "壹仟零贰拾肆"
+num_norm.zh2num("一千零二十四") == 1024
+```
 
 ### StopWords
 
@@ -307,6 +319,14 @@ $ python -m pytest
 
 ## ChangeLog
 
+### v0.3.0
+
+Update `cut_sentence`; Add `NumNorm`.
+
+### v0.28-29
+
+Update `cut_zhchar`.
+
 ### v0.27
 
 Add `cut_zhchar`.
diff --git a/pnlp/__init__.py b/pnlp/__init__.py
@@ -3,17 +3,23 @@
 from .piop import write_file, write_json, check_dir
 from .ptxt import Regex, Text, Length
 from .pcut import cut_sentence, cut_zhchar
+from .pnorm import NumNorm
 from .pmag import MagicDict
 
 from .utils import pstr
 
 from .stopwords import chinese_stopwords, english_stopwords, StopWords
 
 
+# Shared module-level instances, usable directly as `pnlp.num_norm`,
+# `pnlp.reg` and `pnlp.reader`.
+num_norm = NumNorm()
+reg = Regex()
+reader = Reader()
+
+
 __title__ = 'pnlp'
-__version__ = '0.27'
+__version__ = '0.3.0'
 __author__ = 'Yam'
 __license__ = 'MIT'
 __copyright__ = 'Copyright 2019, 2020 Yam'
-__all__ = ['Reader', 'Text', 'Regex', 'Length', 'MagicDict']
+# Every name listed here must exist in this module, otherwise
+# `from pnlp import *` fails with AttributeError.
+__all__ = ['Reader', 'Text', 'Regex', 'Length', 'MagicDict', 'NumNorm']
 
diff --git a/pnlp/pcut.py b/pnlp/pcut.py
@@ -4,32 +4,65 @@
 
 
+# Sentence-boundary pattern (verbose mode), three alternatives:
+#   1. a run of newlines;
+#   2. ending punctuation followed by a closing quote ” and at most one
+#      further punctuation mark;
+#   3. ending punctuation directly preceded (lookbehind) by an ASCII letter,
+#      a closing quote/bracket or a CJK character.
 psent = re.compile(r'''
-    [。.！!?？…]+[”][。.!?！？…]?
+    \n+
     |
-    (?<=[a-zA-Z"”》）)〉〕】>」』\u4e00-\u9fa5])[.。！!?？…～~]+
+    [。.！!?？…]+[”][。.!！?？…～~]?
+    |
+    (?<=[a-zA-Z"”》）)〉〕】>」』\u4e00-\u9fa5])[。.!！?？…～~]+
     ''', re.UNICODE | re.VERBOSE)
 # referenced from jieba
-punzh = pstr(Regex.pun_zh) - "-"
-re_zh = re.compile(rf"([\u4E00-\u9FD5{punzh}+#&_]+)", re.UNICODE)
-re_skip = re.compile(r"(\s)", re.UNICODE)
+punzh = pstr(Regex.pun_zh) - "-"  # drop "-" so a negative number such as -2 is not split here
+punen = pstr(Regex.pun_en) - "."  # drop "." so a float such as 1.3 is not split here
+pun = punzh + punen
+# Matches ONE character: a CJK character, one of the punctuation characters
+# in `pun`, or "+", "#", "&". The capturing group makes `split` keep it.
+# NOTE(review): `pun` is interpolated into the character class unescaped;
+# presumably it contains nothing that is special inside [...] (such as "]",
+# "\" or a range-forming "-") — confirm against Regex.pun_zh / Regex.pun_en.
+pzh = re.compile(rf"([\u4E00-\u9FD5{pun}+#&])", re.UNICODE)
+# A run of ASCII letters (captured, so `split` keeps it).
+pen = re.compile(r"([a-zA-Z]+)", re.UNICODE)
+# A single whitespace character (captured, so `split` keeps it).
+pskip = re.compile(r"(\s)", re.UNICODE)
+pspecial = re.compile(r"([-.])")  # "-" and "." are split off as single tokens
+# Numeric token, two alternatives:
+#   1. a percentage: optional "-", digits, optional "." and digits, then "%";
+#   2. an integer, decimal or fraction: optional "-", digits, optional "."
+#      or "/" followed by optional digits.
+pnum = re.compile(r"""
+    ([-]?\d{1,}[.]?\d{0,}%)
+    |
+    ([-]?\d{1,}[./]?\d{0,})    
+    """, re.UNICODE | re.VERBOSE)
 
 
 def cut_zhchar(text: str, remove_blank: bool = False) -> list:
+    """
+    Cut text into a flat list of string tokens.
+
+    Characters matched by `pzh` (CJK characters and punctuation) each
+    become their own token. The text between them is split on whitespace
+    and every piece is handled as follows:
+
+    - a piece containing ASCII letters is split into letter runs and the
+      text between them; that remaining text is split again at "-" and ".";
+    - otherwise, a piece matched by `pnum` is kept whole, except that
+      trailing "." characters are emitted as separate "." tokens;
+    - any other piece, including the whitespace characters themselves,
+      is appended unchanged.
+
+    text: the string to cut.
+    remove_blank: when True, whitespace tokens are dropped from the result.
+    Returns the list of tokens in their original order.
+    """
     lst = []
-    blocks = re_zh.split(text)
+    # `pzh` has a capturing group, so the matched characters stay in
+    # `blocks` together with the text found between them.
+    blocks = pzh.split(text)
     for block in blocks:
         if not block:
             continue
-        if re_zh.match(block):
+        if pzh.match(block):
             for char in block:
                 lst.append(char)
         else:
-            skips = re_skip.split(block)
+            # `pskip` also captures, so whitespace shows up as its own item.
+            skips = pskip.split(block)
             for skip in skips:
-                if remove_blank:
-                    skip = re_skip.sub("", skip)
-                if skip:
-                    lst.append(skip)
+                if pen.search(skip):
+                    # Letter runs and the text between them.
+                    for en_part in pen.split(skip):
+                        if en_part:
+                            spe = pspecial.search(en_part)
+                            if not spe:
+                                lst.append(en_part)
+                            else:
+                                # Break "-" and "." out as single tokens.
+                                for spe_part in pspecial.split(en_part):
+                                    if spe_part:
+                                        lst.append(spe_part)
+                elif pnum.search(skip):
+                    if skip[-1] != ".":
+                        lst.append(skip)
+                    else:
+                        # Peel trailing dots off the number, then emit one
+                        # "." token per dot after the number itself.
+                        i = 0
+                        while skip[-1] == ".":
+                            i += 1
+                            skip = skip[:-1]
+                        lst.append(skip)
+                        for _ in range(i):
+                            lst.append(".")
+                else:
+                    # Whitespace items end up here (no letters, no digits),
+                    # so this is the only branch `remove_blank` affects.
+                    if remove_blank:
+                        skip = pskip.sub("", skip)
+                    if skip:
+                        lst.append(skip)
     return lst
 
 
diff --git a/pnlp/pnorm.py b/pnlp/pnorm.py
@@ -0,0 +1,189 @@
+from dataclasses import dataclass
+from typing import TypeVar
+
+
+# Return type of `NumNorm.zh2num`, which hands back either the computed
+# number or the input string itself.
+T = TypeVar('T', str, float, int)
+
+# Chinese digit character -> integer value. Covers the ordinary digits,
+# the money-style digits that `ZH2MONEY` produces, and 〇 / 零 / 貮 / 两.
+ZH_NUM = {
+    '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
+    '六': 6, '七': 7, '八': 8, '九': 9, '零': 0,
+    '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5,
+    '陆': 6, '柒': 7, '捌': 8, '玖': 9, '貮': 2, '两': 2,
+}
+
+# Chinese unit character -> multiplier (several characters share a value).
+ZH_UNIT = {
+    '十': 10,
+    '拾': 10,
+    '百': 100,
+    '佰': 100,
+    '千': 1000,
+    '仟': 1000,
+    '万': 10000,
+    '萬': 10000,
+    '亿': 100000000,
+    '億': 100000000,
+    '兆': 10000000000000,
+}
+
+
+# Integer -> Chinese character, for the digits 0-9 and for each unit value.
+ARB_NUM = {
+    0: "零",
+    1: "一",
+    2: "二",
+    3: "三",
+    4: "四",
+    5: "五",
+    6: "六",
+    7: "七",
+    8: "八",
+    9: "九",
+    10: "十",
+    100: "百",
+    1000: "千",
+    10000: "万",
+    100000000: "亿",
+    10000000000000: "兆"
+}
+
+# Ordinary numeral character -> money-style character, used by
+# `pnumstr.to_money`. "零" and "兆" have no entry and are left unchanged.
+ZH2MONEY = {
+    "一": "壹",
+    "二": "贰",
+    "三": "叁",
+    "四": "肆",
+    "五": "伍",
+    "六": "陆",
+    "七": "柒",
+    "八": "捌",
+    "九": "玖",
+    "十": "拾",
+    "百": "佰",
+    "千": "仟",
+    "万": "萬",
+    "亿": "億"
+}
+
+
+class pnumstr(str):
+    """A ``str`` holding a Chinese numeral, with a money-style conversion."""
+
+    def to_money(self):
+        """
+        Return the numeral rewritten with money-style characters.
+
+        Every character that has an entry in ``ZH2MONEY`` is replaced by
+        its counterpart (e.g. "一" -> "壹"); all other characters are kept.
+        The result is always a ``pnumstr``, so it compares equal to the
+        corresponding plain ``str`` and can be converted again.
+        """
+        # Single pass over the characters. No ZH2MONEY value is itself a
+        # key, so this gives the same text as replacing key by key.
+        return pnumstr("".join(ZH2MONEY.get(c, c) for c in self))
+
+
+@dataclass
+class NumNorm:
+    """
+    Convert between Arabic integers and Chinese numerals.
+
+    modified from https://github.com/bamtercelboo/corpus_process_script/blob/master/cn_to_arabic/cn_to_arabic.py
+    """
+    @staticmethod
+    def num_len(num: int) -> int:
+        """Return the number of decimal digits of ``num`` (sign ignored)."""
+        if num == 0:
+            return 1
+        if num < 0:
+            num = -num
+        i = 0
+        while num != 0:
+            num //= 10
+            i += 1
+        return i
+
+    def num2zh(self, num: int) -> str:
+        """
+        Convert a non-negative integer to its Chinese numeral.
+
+        Example: ``num2zh(1024) == "一千零二十四"``.
+
+        num: the integer to convert. Negative values are not handled.
+        Returns a ``pnumstr``. Values above 10**13 that are not a table
+        entry yield the placeholder "超大".
+        """
+        def get_base(num):
+            # Exact table entry: a digit is returned as is, a unit gets a
+            # leading "一" (10 -> "一十", 10000 -> "一万").
+            zh = ARB_NUM.get(num)
+            if num < 10:
+                return zh
+            else:
+                return "一" + zh
+
+        def get_less_than_10w(num):
+            # Spell out 1..99999 from the most significant digit down.
+            res = ""
+            while num != 0:
+                if num < 10:
+                    res += ARB_NUM.get(num)
+                    break
+                length = NumNorm.num_len(num)
+                divider = 10 ** (length - 1)
+                high = num // divider
+                res += ARB_NUM.get(high)
+                res += ARB_NUM.get(divider)
+                num = num % divider
+                new_len = NumNorm.num_len(num)
+                # At least one digit position was skipped: mark it with "零".
+                if length - new_len > 1 and num != 0:
+                    res += "零"
+            return res
+
+        def get_interval(num: int, lower: int, unit: str):
+            # Spell out the part of `num` counted in `unit` and return it
+            # together with the remainder that is still to be spelled out.
+            length = NumNorm.num_len(num)
+            # Integer division keeps every intermediate value an int.
+            divider = lower // 10
+            high = num // divider
+            res = get_less_than_10w(high)
+            high_len = NumNorm.num_len(high)
+            res += unit
+            num -= high * divider
+            new_len = NumNorm.num_len(num)
+            # Digit positions were skipped between the unit and the rest.
+            if length - high_len - new_len > 0 and num != 0:
+                res += "零"
+            return res, num
+
+        def get_10w_to_1y(num):
+            res, num = get_interval(num, 10**5, "万")
+            if 0 < num < 100000:
+                res += get_less_than_10w(num)
+            return res
+
+        def get_1y_to_1z(num):
+            res, num = get_interval(num, 10**9, "亿")
+            # A remainder below 10**5 has no separate "万" part to spell
+            # out; sending it through get_10w_to_1y would add a stray "万"
+            # whenever the remainder is below 10**4.
+            if 0 < num < 10**5:
+                res += get_less_than_10w(num)
+            elif 10**5 <= num < 100000000:
+                res += get_10w_to_1y(num)
+            return res
+
+        if num in ARB_NUM:
+            result = get_base(num)
+            return pnumstr(result)
+        # below 100,000
+        if num < 10**5:
+            result = get_less_than_10w(num)
+        # below 100,000,000
+        elif num < 10**8:
+            result = get_10w_to_1y(num)
+        # below 10**13
+        elif num < 10**13:
+            result = get_1y_to_1z(num)
+        else:
+            result = "超大"
+        return pnumstr(result)
+
+    def zh2num(self, zh: str) -> T:
+        """
+        Convert a Chinese numeral to an integer.
+
+        Example: ``zh2num("一千零二十四") == 1024``.
+
+        zh: the Chinese numeral string.
+        Returns the integer value. The input string is returned unchanged
+        when it contains a character that is neither a digit nor a unit,
+        or when the computed value is 0 and the input is not "零".
+        """
+        unit = 0
+        digit_list = []
+        # Walk from the least significant character so that each unit is
+        # seen before the digit it multiplies.
+        for zhdigit in reversed(zh):
+            if zhdigit in ZH_UNIT:
+                unit = ZH_UNIT.get(zhdigit)
+                # 万 and 亿 scale the whole group in front of them, so they
+                # are stored as markers instead of being applied to a digit.
+                if unit == 10000 or unit == 100000000:
+                    digit_list.append(unit)
+                    unit = 1
+            elif zhdigit in ZH_NUM:
+                digit = ZH_NUM[zhdigit]
+                if unit:
+                    digit *= unit
+                    unit = 0
+                digit_list.append(digit)
+            else:
+                # Not a numeral: give the input back instead of failing
+                # on arithmetic with a missing table entry.
+                return zh
+        # A leading "十" with no digit before it (e.g. "十二") means 10.
+        if unit == 10:
+            digit_list.append(10)
+        val, tmp = 0, 0
+        for x in reversed(digit_list):
+            if x == 10000 or x == 100000000:
+                val += tmp * x
+                tmp = 0
+            else:
+                tmp += x
+        val += tmp
+        if val == 0 and zh != "零":
+            return zh
+        else:
+            return val
diff --git a/pnlp/utils.py b/pnlp/utils.py
@@ -7,4 +7,6 @@ def __sub__(self, other):
             if c in other:
                 continue
             result.append(c)
-        return "".join(result)
+        return "".join(result)
+
+
diff --git a/setup.py b/setup.py
@@ -5,15 +5,17 @@
 
 setuptools.setup(
     name="pnlp",
-    version="0.27",
+    version="0.3.0",
     author="Yam",
     author_email="haoshaochun@gmail.com",
     description="A pre-processing tool for NLP.",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/hscspring/pnlp",
     include_package_data=True,
-    packages=setuptools.find_packages(),
+    # default is `setup.py` path, so do not need a `package_dir` attr
+    # if another dir, should be declared by `package_dir`
+    packages=setuptools.find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     install_requires=[
           'addict',
           'pyyaml',
diff --git a/tests/test_pcut.py b/tests/test_pcut.py
diff --git a/tests/test_pnorm.py b/tests/test_pnorm.py