Skip to content

Commit c3ed63e

Browse files
committed
v0.3.0
1 parent 4e7ad94 commit c3ed63e

File tree

8 files changed

+672
-40
lines changed

8 files changed

+672
-40
lines changed

README.md

+24-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ This is a pre-processing tool for NLP.
44
## Features
55

66
- a flexible pipeline for text I/O
7-
- a flexible tool for text clean and extract and kinds of length
7+
- a flexible tool for text cleaning and extraction
8+
- Sentence cutting and Chinese character cutting
9+
- Chinese character normalization
10+
- kinds of length
11+
- stopwords
812
- some magic usage in pre-processing
913

1014
## Install
@@ -215,19 +219,27 @@ print(sent_list)
215219
```python
216220
# Cut to Chinese chars
217221
from pnlp import cut_zhchar
218-
text = "你好,hello, 520 = ”我爱你“。"
222+
text = "你好,hello, 520 i love u. = ”我爱你“。"
219223
char_list = cut_zhchar(text)
220224
print(char_list)
221225
"""
222-
['你', '好', ',', 'hello', ',', ' ', '520', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。']
226+
['你', '好', ',', 'hello', ',', ' ', '520', ' ', 'i', ' ', 'love', ' ', 'u', '.', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。']
223227
"""
224228
char_list = cut_zhchar(text, remove_blank=True)
225229
print(char_list)
226230
"""
227-
['你', '好', ',', 'hello', ',', '520', '=', '”', '我', '爱', '你', '“', '。']
231+
['你', '好', ',', 'hello', ',', '520', 'i', 'love', 'u', '.', '=', '”', '我', '爱', '你', '“', '。']
228232
"""
229233
```
230234

235+
### Normalization
236+
237+
```python
238+
from pnlp import num_norm
239+
num_norm.num2zh(1024) == "一千零二十四"
240+
num_norm.num2zh(1024).to_money() == "壹仟零贰拾肆"
241+
num_norm.zh2num("一千零二十四") == 1024
242+
```
231243

232244
### StopWords
233245

@@ -307,6 +319,14 @@ $ python -m pytest
307319

308320
## ChangeLog
309321

322+
### v0.3.0
323+
324+
Update `cut_sentence`; Add `NumNorm`.
325+
326+
### v0.28-29
327+
328+
Update `cut_zhchar`.
329+
310330
### v0.27
311331

312332
Add `cut_zhchar`.

pnlp/__init__.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,23 @@
33
from .piop import write_file, write_json, check_dir
44
from .ptxt import Regex, Text, Length
55
from .pcut import cut_sentence, cut_zhchar
6+
from .pnorm import NumNorm
67
from .pmag import MagicDict
78

89
from .utils import pstr
910

1011
from .stopwords import chinese_stopwords, english_stopwords, StopWords
1112

1213

14+
num_norm = NumNorm()
15+
reg = Regex()
16+
reader = Reader()
17+
18+
1319
__title__ = 'pnlp'
14-
__version__ = '0.27'
20+
__version__ = '0.3.0'
1521
__author__ = 'Yam'
1622
__license__ = 'MIT'
1723
__copyright__ = 'Copyright 2019, 2020 Yam'
18-
__all__ = ['Reader', 'Text', 'Regex', 'Length', 'MagicDict']
24+
# Names exported by `from pnlp import *`. Every entry must be defined in this
# module, otherwise the star-import raises AttributeError.
__all__ = ['Reader', 'Text', 'Regex', 'Length', 'MagicDict', 'NumNorm']
1925

pnlp/pcut.py

+45-12
Original file line numberDiff line numberDiff line change
@@ -4,32 +4,65 @@
44

55

66
psent = re.compile(r'''
7-
[。.!!??…]+[”][。.!?!?…]?
7+
\n+
88
|
9-
(?<=[a-zA-Z"”》))〉〕】>」』\u4e00-\u9fa5])[.。!!??…~~]+
9+
[。.!!??…]+[”][。.!!??…~~]?
10+
|
11+
(?<=[a-zA-Z"”》))〉〕】>」』\u4e00-\u9fa5])[。.!!??…~~]+
1012
''', re.UNICODE | re.VERBOSE)
1113
# referenced from jieba
12-
punzh = pstr(Regex.pun_zh) - "-"
13-
re_zh = re.compile(rf"([\u4E00-\u9FD5{punzh}+#&_]+)", re.UNICODE)
14-
re_skip = re.compile(r"(\s)", re.UNICODE)
14+
punzh = pstr(Regex.pun_zh) - "-" # for minus number eg -2
15+
punen = pstr(Regex.pun_en) - "." # for float number eg 1.3
16+
pun = punzh + punen
17+
pzh = re.compile(rf"([\u4E00-\u9FD5{pun}+#&])", re.UNICODE)
18+
pen = re.compile(r"([a-zA-Z]+)", re.UNICODE)
19+
pskip = re.compile(r"(\s)", re.UNICODE)
20+
pspecial = re.compile(r"([-.])") # split to single
21+
pnum = re.compile(r"""
22+
([-]?\d{1,}[.]?\d{0,}%)
23+
|
24+
([-]?\d{1,}[./]?\d{0,})
25+
""", re.UNICODE | re.VERBOSE)
1526

1627

1728
def cut_zhchar(text: str, remove_blank: bool = False) -> list:
1829
lst = []
19-
blocks = re_zh.split(text)
30+
blocks = pzh.split(text)
2031
for block in blocks:
2132
if not block:
2233
continue
23-
if re_zh.match(block):
34+
if pzh.match(block):
2435
for char in block:
2536
lst.append(char)
2637
else:
27-
skips = re_skip.split(block)
38+
skips = pskip.split(block)
2839
for skip in skips:
29-
if remove_blank:
30-
skip = re_skip.sub("", skip)
31-
if skip:
32-
lst.append(skip)
40+
if pen.search(skip):
41+
for en_part in pen.split(skip):
42+
if en_part:
43+
spe = pspecial.search(en_part)
44+
if not spe:
45+
lst.append(en_part)
46+
else:
47+
for spe_part in pspecial.split(en_part):
48+
if spe_part:
49+
lst.append(spe_part)
50+
elif pnum.search(skip):
51+
if skip[-1] != ".":
52+
lst.append(skip)
53+
else:
54+
i = 0
55+
while skip[-1] == ".":
56+
i += 1
57+
skip = skip[:-1]
58+
lst.append(skip)
59+
for _ in range(i):
60+
lst.append(".")
61+
else:
62+
if remove_blank:
63+
skip = pskip.sub("", skip)
64+
if skip:
65+
lst.append(skip)
3366
return lst
3467

3568

pnlp/pnorm.py

+189
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
from dataclasses import dataclass
from typing import TypeVar, Union


# Kept for backward compatibility with code that imports it from this module.
T = TypeVar('T', str, float, int)

# Chinese digit characters (everyday, financial and variant forms) -> value.
ZH_NUM = {
    '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
    '六': 6, '七': 7, '八': 8, '九': 9, '零': 0,
    '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5,
    '陆': 6, '柒': 7, '捌': 8, '玖': 9, '貮': 2, '两': 2,
}

# Chinese unit characters (everyday and financial forms) -> multiplier.
ZH_UNIT = {
    '十': 10,
    '拾': 10,
    '百': 100,
    '佰': 100,
    '千': 1000,
    '仟': 1000,
    '万': 10000,
    '萬': 10000,
    '亿': 100000000,
    '億': 100000000,
    '兆': 10000000000000,
}


# Arabic digit / power-of-ten value -> everyday Chinese character.
ARB_NUM = {
    0: "零",
    1: "一",
    2: "二",
    3: "三",
    4: "四",
    5: "五",
    6: "六",
    7: "七",
    8: "八",
    9: "九",
    10: "十",
    100: "百",
    1000: "千",
    10000: "万",
    100000000: "亿",
    10000000000000: "兆"
}

# Everyday Chinese numeral character -> financial ("money") form.
ZH2MONEY = {
    "一": "壹",
    "二": "贰",
    "三": "叁",
    "四": "肆",
    "五": "伍",
    "六": "陆",
    "七": "柒",
    "八": "捌",
    "九": "玖",
    "十": "拾",
    "百": "佰",
    "千": "仟",
    "万": "萬",
    "亿": "億"
}


class pnumstr(str):
    """A ``str`` holding a Chinese numeral, convertible to its financial form."""

    def to_money(self):
        """Return the numeral with every character mapped through ``ZH2MONEY``.

        Characters that have no entry in ``ZH2MONEY`` (for example "零") are
        kept unchanged. The result is always a ``pnumstr``.
        """
        return pnumstr("".join(ZH2MONEY.get(char, char) for char in self))


@dataclass
class NumNorm:
    """
    Convert between Arabic integers and Chinese numerals.

    modified from https://github.com/bamtercelboo/corpus_process_script/blob/master/cn_to_arabic/cn_to_arabic.py
    """
    @staticmethod
    def num_len(num: int) -> int:
        """Return the number of decimal digits of ``num``, ignoring its sign."""
        if num == 0:
            return 1
        if num < 0:
            num = -num
        count = 0
        while num != 0:
            num //= 10
            count += 1
        return count

    def num2zh(self, num: int) -> str:
        """Convert an integer to a Chinese numeral.

        Args:
            num: The integer to convert. Negative numbers are not handled.

        Returns:
            A ``pnumstr`` with the Chinese numeral, e.g. 1024 -> "一千零二十四".
            Numbers beyond 10**13 yield the placeholder "超大".
        """
        def get_base(num):
            # Exact table entries; powers of ten are spelled with a leading "一".
            zh = ARB_NUM.get(num)
            if num < 10:
                return zh
            return "一" + zh

        def get_less_than_10w(num):
            # Spell 0 < num < 10**5 digit by digit, from the highest position.
            res = ""
            while num != 0:
                if num < 10:
                    res += ARB_NUM.get(num)
                    break
                length = NumNorm.num_len(num)
                divider = 10 ** (length - 1)
                high = num // divider
                res += ARB_NUM.get(high)
                res += ARB_NUM.get(divider)
                num = num % divider
                new_len = NumNorm.num_len(num)
                # One "零" stands in for any run of skipped positions.
                if length - new_len > 1 and num != 0:
                    res += "零"
            return res

        def get_interval(num: int, lower: int, unit: str):
            # Spell the group above `unit` and return it with the remainder.
            length = NumNorm.num_len(num)
            # Integer division keeps every intermediate value an int.
            divider = lower // 10
            high, rest = divmod(num, divider)
            res = get_less_than_10w(high) + unit
            gap = length - NumNorm.num_len(high) - NumNorm.num_len(rest)
            if gap > 0 and rest != 0:
                res += "零"
            return res, rest

        def get_10w_to_1y(num):
            res, rest = get_interval(num, 10**5, "万")
            if rest:
                res += get_less_than_10w(rest)
            return res

        def get_1y_to_1z(num):
            res, rest = get_interval(num, 10**9, "亿")
            if rest >= 10**4:
                res += get_10w_to_1y(rest)
            elif rest:
                # No 万 group in the remainder: spelling it through the 万
                # helper would emit a stray "万" (e.g. "一亿零万一").
                res += get_less_than_10w(rest)
            return res

        if num in ARB_NUM:
            return pnumstr(get_base(num))
        # 十万
        if num < 10**5:
            result = get_less_than_10w(num)
        # 一亿
        elif num < 10**8:
            result = get_10w_to_1y(num)
        # 一兆
        elif num < 10**13:
            result = get_1y_to_1z(num)
        else:
            result = "超大"
        return pnumstr(result)

    def zh2num(self, zh: str) -> Union[str, int]:
        """Convert a Chinese numeral string to an integer.

        Args:
            zh: The Chinese numeral, e.g. "一千零二十四".

        Returns:
            The integer value. ``zh`` itself is returned unchanged when it
            cannot be interpreted as a number (it contains a character that
            is neither a known digit nor a known unit, or it evaluates to 0
            without being a zero numeral).
        """
        unit = 0
        digit_list = []
        # Scan from the lowest position so each unit applies to the digit
        # that precedes it in reading order.
        for zhdigit in reversed(zh):
            if zhdigit in ZH_UNIT:
                unit = ZH_UNIT[zhdigit]
                if unit in (10000, 100000000):
                    # 万 / 亿 scale the whole group before them; keep them as
                    # markers for the summing pass below.
                    digit_list.append(unit)
                    unit = 1
            elif zhdigit in ZH_NUM:
                digit = ZH_NUM[zhdigit]
                if unit:
                    digit *= unit
                    unit = 0
                digit_list.append(digit)
            else:
                # Not a numeral: fall back to the input, like the final check.
                return zh
        # A leading "十" with no digit before it means ten (e.g. "十", "十万").
        if unit == 10:
            digit_list.append(10)
        val, tmp = 0, 0
        for x in reversed(digit_list):
            if x in (10000, 100000000):
                val += tmp * x
                tmp = 0
            else:
                tmp += x
        val += tmp
        if val == 0 and zh not in ("零", "〇"):
            return zh
        return val

pnlp/utils.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,6 @@ def __sub__(self, other):
77
if c in other:
88
continue
99
result.append(c)
10-
return "".join(result)
10+
return "".join(result)
11+
12+

setup.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@
55

66
setuptools.setup(
77
name="pnlp",
8-
version="0.27",
8+
version="0.3.0",
99
author="Yam",
1010
author_email="[email protected]",
1111
description="A pre-processing tool for NLP.",
1212
long_description=long_description,
1313
long_description_content_type="text/markdown",
1414
url="https://github.com/hscspring/pnlp",
1515
include_package_data=True,
16-
packages=setuptools.find_packages(),
16+
# default is `setup.py` path, so do not need a `package_dir` attr
17+
# if another dir, should be declared by `package_dir`
18+
packages=setuptools.find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
1719
install_requires=[
1820
'addict',
1921
'pyyaml',

0 commit comments

Comments
 (0)