Skip to content

Commit a830e25

Browse files
committed
1、增加文本语言检测,自动识别,用于voc模块。
1 parent c04180f commit a830e25

File tree

3 files changed

+231
-17
lines changed

3 files changed

+231
-17
lines changed

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ docarray[hnswlib]>=0.39.1
88
scipy==1.11.4
99
librosa
1010
edge_tts
11-
langdetect
11+
langdetect
12+
langid

voc/audio.py

+27-15
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from common.conf import get_conf
99
conf = get_conf()
1010

11+
from worker import text
12+
1113
# hps = get_hparams_from_file(conf.voc_conf)
1214

1315
def complete_audio(audio, sample_rate=44100):
@@ -50,22 +52,32 @@ def generate_audio(
5052

5153
with torch.no_grad():
5254
for idx, piece in enumerate(slices):
53-
audio = infer(
54-
piece,
55-
sdp_ratio=sdp_ratio,
56-
noise_scale=noise_scale,
57-
noise_scale_w=noise_scale_w,
58-
length_scale=length_scale,
59-
sid=speaker,
60-
language=language,
61-
hps=hps,
62-
model=model,
63-
device=conf.device,
64-
)
55+
# 文本语言处理
56+
piece_au_list = []
57+
sentences_list = text.split_by_language(piece)
58+
for sentences, lang in sentences_list:
59+
lang = lang.upper()
60+
if lang == "JA":
61+
lang = "JP"
62+
63+
audio = infer(
64+
sentences,
65+
sdp_ratio=sdp_ratio,
66+
noise_scale=noise_scale,
67+
noise_scale_w=noise_scale_w,
68+
length_scale=length_scale,
69+
sid=speaker,
70+
language=lang,
71+
hps=hps,
72+
model=model,
73+
device=conf.device,
74+
)
6575

66-
# 音频对齐,取整
67-
audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
68-
lenght, com_audio = complete_audio(audio16bit, sample_rate=hps.data.sampling_rate)
76+
# 音频对齐,取整
77+
audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
78+
piece_au_list.append(audio16bit)
79+
piece_audio = np.concatenate(piece_au_list)
80+
lenght, com_audio = complete_audio(piece_audio, sample_rate=hps.data.sampling_rate)
6981

7082
audio_list.append((lenght, com_audio))
7183

worker/text.py

+202-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,105 @@
55
from common.conf import get_conf
66
conf = get_conf()
77

8+
# Language codes used by classify_language() to filter a caller-supplied
# whitelist before handing it to langid.set_languages(). Kept in
# alphabetical order; presumably the full set langid supports — verify
# against the langid package if extending.
langid_languages = (
    "af am an ar as az be bg bn br bs ca cs cy da de dz el en eo es et eu "
    "fa fi fo fr ga gl gu he hi hr ht hu hy id is it ja jv ka kk km kn ko "
    "ku ky la lb lo lt lv mg mk ml mn mr ms mt nb ne nl nn no oc or pa pl "
    "ps pt qu ro ru rw se si sk sl sq sr sv sw ta te th tl tr ug uk ur vi "
    "vo wa xh zh zu"
).split()
8107

9108
class ChineseTextSplitter:
10109
def __init__(self, pdf: bool = False, sentence_size: int = 256, **kwargs):
@@ -71,4 +170,106 @@ def split_text(s1:str, sentence_size:int=256) ->list:
71170
ls = text_splitter.split_text(s1)
72171

73172
ls.append(conf.negativate_class) # 添加other 项目
74-
return ls
173+
return ls
174+
175+
176+
def mark_text(text: str, pattern: str = r'[A-Za-z]+') -> list:
    '''
    Mark segments of mixed Chinese/English text with a language tag.

    Scans ``text`` for runs matching ``pattern`` (Latin letters by default)
    and returns a list of ``(segment, lang)`` tuples, where ``lang`` is
    ``'EN'`` for matched runs and ``'ZH'`` for everything between them.
    Only Chinese/English are supported for now; detection could also be done
    with a model (langid / langdetect / fasttext).

    :param text: input text (Chinese chars U+4E00..U+9FA5, English a-zA-Z)
    :param pattern: regex matching the "English" runs to tag
    :return: list of (segment, 'ZH'|'EN') tuples in original order
    '''
    langs = []
    for item in re.findall(pattern, text):
        # maxsplit=1 is essential: the same word may occur several times in
        # the remaining text, and an unbounded split would then yield more
        # than two pieces and blow up the 2-tuple unpacking.
        head, text = text.split(item, 1)
        if head:
            langs.append((head, 'ZH'))
        langs.append((item, 'EN'))
    if text:
        # Trailing non-Latin remainder after the last English run.
        langs.append((text, 'ZH'))

    return langs
196+
197+
def split_by_language(text: str, target_languages: list = ["zh", "ja", "en"]) -> list:
    '''
    Split text into runs of a single language.

    Returns a list of (segment, language) tuples covering the whole input.
    Segments are sliced out of the original ``text`` by index, so the
    punctuation that re.split() discards stays attached to the segments.

    NOTE(review): ``target_languages`` is a mutable default list; it is only
    read here (sorted / passed through), never mutated, so it is safe as
    written.
    '''
    # Sentence delimiters: ASCII punctuation plus full-width CJK punctuation.
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    sentences = re.split(pattern, text)

    pre_lang = ""  # language of the run currently being accumulated ("" = none yet)
    start = 0      # start index (into ``text``) of the current run
    end = 0        # index just past the last sentence located in ``text``
    sentences_list = []

    # When the whitelist mixes Latin with CJK, additionally split each
    # sentence at alphabet/non-alphabet boundaries so every piece handed to
    # the classifier is single-script.
    sorted_target_languages = sorted(target_languages)
    if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
        new_sentences = []
        for sentence in sentences:
            new_sentences.extend(split_alpha_nonalpha(sentence))
        sentences = new_sentences

    for sentence in sentences:
        # Skip empty / whitespace-only fragments produced by the split.
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)

        # Re-locate this sentence in the original text; searching only from
        # ``end`` keeps duplicate sentences from matching an earlier spot.
        end += text[end:].index(sentence)
        if pre_lang != "" and pre_lang != lang:
            # Language changed: flush the accumulated run up to here.
            sentences_list.append((text[start:end], pre_lang))
            start = end
        end += len(sentence)
        pre_lang = lang
    # Flush the trailing run (note: emits (text, "") when nothing classified).
    sentences_list.append((text[start:], pre_lang))

    return sentences_list
235+
236+
def classify_language(text: str, target_languages: list = None, module: str = "langid") -> str:
    '''
    Classify the language of ``text`` and return its language code.

    :param text: text to classify
    :param target_languages: optional whitelist of language codes; entries
        not supported by the chosen backend are silently dropped
    :param module: detection backend, ``"langid"`` (default) or
        ``"fastlid"``/``"fasttext"``. Previously hard-coded to ``"langid"``,
        which left the fastlid branch unreachable; now selectable while
        keeping the old default behavior.
    :return: detected language code (e.g. ``"zh"``, ``"en"``)
    :raises ValueError: if ``module`` names an unknown backend
    '''
    if module in ("fastlid", "fasttext"):
        from fastlid import fastlid, supported_langs

        classifier = fastlid
        if target_languages is not None:
            fastlid.set_languages = [
                lang for lang in target_languages if lang in supported_langs
            ]
    elif module == "langid":
        import langid

        classifier = langid.classify
        if target_languages is not None:
            langid.set_languages(
                [lang for lang in target_languages if lang in langid_languages]
            )
    else:
        raise ValueError(f"Wrong module {module}")

    # Both backends return (lang, confidence); keep only the language code.
    return classifier(text)[0]
262+
263+
def split_alpha_nonalpha(text):
    """Split ``text`` at every boundary between CJK (Han U+4E00..U+9FFF,
    kana U+3040..U+30FF) and Latin letters, keeping all characters."""
    boundary = (
        r"(?:(?<=[\u4e00-\u9fff])|(?<=[\u3040-\u30FF]))(?=[a-zA-Z])"
        r"|(?<=[a-zA-Z])(?:(?=[\u4e00-\u9fff])|(?=[\u3040-\u30FF]))"
    )
    return re.split(boundary, text)
268+
269+
def check_is_none(item) -> bool:
    """Return True when *item* is None, whitespace-only, or stringifies to ""."""
    if item is None:
        return True
    if isinstance(item, str) and item.isspace():
        return True
    return str(item) == ""

0 commit comments

Comments
 (0)