5
5
from common .conf import get_conf
6
6
conf = get_conf ()
7
7
8
# ISO 639-1 codes of the 97 languages the `langid` classifier supports.
# Used to filter caller-supplied target languages before constraining langid.
langid_languages = [
    "af", "am", "an", "ar", "as", "az", "be", "bg", "bn", "br",
    "bs", "ca", "cs", "cy", "da", "de", "dz", "el", "en", "eo",
    "es", "et", "eu", "fa", "fi", "fo", "fr", "ga", "gl", "gu",
    "he", "hi", "hr", "ht", "hu", "hy", "id", "is", "it", "ja",
    "jv", "ka", "kk", "km", "kn", "ko", "ku", "ky", "la", "lb",
    "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "mt",
    "nb", "ne", "nl", "nn", "no", "oc", "or", "pa", "pl", "ps",
    "pt", "qu", "ro", "ru", "rw", "se", "si", "sk", "sl", "sq",
    "sr", "sv", "sw", "ta", "te", "th", "tl", "tr", "ug", "uk",
    "ur", "vi", "vo", "wa", "xh", "zh", "zu",
]
8
107
9
108
class ChineseTextSplitter :
10
109
def __init__ (self , pdf : bool = False , sentence_size : int = 256 , ** kwargs ):
@@ -71,4 +170,106 @@ def split_text(s1:str, sentence_size:int=256) ->list:
71
170
ls = text_splitter .split_text (s1 )
72
171
73
172
ls .append (conf .negativate_class ) # 添加other 项目
74
- return ls
173
+ return ls
174
+
175
+
176
def mark_text(text: str, pattern: str = r'[A-Za-z]+') -> list:
    r'''
    Split *text* into runs of Chinese and English, tagging each run.

    Only Chinese/English are supported for now; detection is regex-based
    (model-based alternatives would be: 1. langid; 2. langdetect; 3. fasttext).
    Chinese: [\u4e00-\u9fa5]; English: [a-zA-Z].

    :param text: input text to mark up
    :param pattern: regex matching the "English" runs (default: Latin letters)
    :return: list of (segment, 'ZH'|'EN') tuples, in original order
    '''
    langs = []
    for item in re.findall(pattern, text):
        # Split on the FIRST occurrence only (maxsplit=1): an unbounded split
        # raised ValueError ("too many values to unpack") whenever the same
        # English token appeared more than once in the remaining text.
        before, text = text.split(item, 1)
        if before:
            langs.append((before, 'ZH'))
        langs.append((item, 'EN'))
    # Whatever trails the last English run is treated as Chinese.
    if text:
        langs.append((text, 'ZH'))

    return langs
196
+
197
def split_by_language(text: str, target_languages: list = ["zh", "ja", "en"]) -> list:
    '''
    Split *text* into runs by language; returns a list of
    (segment, language_code) tuples in original order.
    NOTE(review): the mutable default list is shared across calls — it is only
    read here, but confirm before ever mutating it.
    '''
    # Sentence delimiters: ASCII punctuation plus fullwidth/CJK punctuation.
    # NOTE(review): fullwidth characters reconstructed from a mis-encoded
    # scrape — verify against the upstream source.
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        r"\！？\。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    sentences = re.split(pattern, text)

    pre_lang = ""
    start = 0
    end = 0
    sentences_list = []

    # When Latin script is mixed with zh/ja targets, further split each
    # sentence at alphabet/CJK boundaries so every run is single-language.
    sorted_target_languages = sorted(target_languages)
    if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
        new_sentences = []
        for sentence in sentences:
            new_sentences.extend(split_alpha_nonalpha(sentence))
        sentences = new_sentences

    for sentence in sentences:
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)

        # Re-locate the sentence inside the original text so the delimiters
        # removed by re.split stay attached to the emitted segments.
        end += text[end:].index(sentence)
        # Language changed: flush the accumulated [start, end) run.
        if pre_lang != "" and pre_lang != lang:
            sentences_list.append((text[start:end], pre_lang))
            start = end
        end += len(sentence)
        pre_lang = lang
    # Flush the trailing run (includes any trailing delimiters).
    sentences_list.append((text[start:], pre_lang))

    return sentences_list
235
+
236
def classify_language(text: str, target_languages: list = None, module: str = "langid") -> str:
    '''
    Classify the language of *text* and return its ISO 639-1 code.

    :param text: text to classify
    :param target_languages: optional whitelist of codes to constrain the
        classifier; entries unknown to the chosen backend are dropped
    :param module: backend to use — "langid" (default) or "fastlid"/"fasttext".
        Previously hardcoded to "langid"; parameterized so the fastlid branch
        is reachable, default keeps old behavior.
    :raises ValueError: if *module* is not a supported backend
    '''
    if module == "fastlid" or module == "fasttext":
        from fastlid import fastlid, supported_langs

        classifier = fastlid
        if target_languages is not None:
            target_languages = [
                lang for lang in target_languages if lang in supported_langs
            ]
            fastlid.set_languages = target_languages
    elif module == "langid":
        import langid

        classifier = langid.classify
        if target_languages is not None:
            target_languages = [
                lang for lang in target_languages if lang in langid_languages
            ]
            langid.set_languages(target_languages)
    else:
        raise ValueError(f"Wrong module {module}")

    # Both backends return (lang, confidence); keep only the code.
    lang = classifier(text)[0]

    return lang
262
+
263
def split_alpha_nonalpha(text):
    """Split *text* at every boundary between Latin letters and CJK/kana.

    The delimiters themselves are zero-width (pure lookaround), so no
    characters are lost; with no boundary present the whole string comes
    back as a single-element list.
    """
    boundary = r"(?:(?<=[\u4e00-\u9fff])|(?<=[\u3040-\u30FF]))(?=[a-zA-Z])|(?<=[a-zA-Z])(?:(?=[\u4e00-\u9fff])|(?=[\u3040-\u30FF]))"
    return re.split(boundary, text)
268
+
269
def check_is_none(item) -> bool:
    """Return True when *item* is None, empty, or whitespace-only; else False."""
    if item is None:
        return True
    if isinstance(item, str) and item.isspace():
        return True
    return str(item) == ""
0 commit comments