
Commit c23258e

committed
1. Add an `audio` configuration option to control which TTS module is used  2. Add common/config-template.json for general usage scenarios
1 parent d1dbc08 commit c23258e

File tree

8 files changed: +156 −58 lines changed


README.md

+52-8
@@ -5,11 +5,13 @@
 - When the relevance between the text (script) and a video is low (threshold < 0.5), no video will be retrieved. Try lowering the relevance threshold or providing more relevant video material.
 - For text (script) and images, no relevance threshold check is applied; the top-N results are simply returned.
 - When the video clips or images matching the text are insufficient, the system uses blank (black) background frames as padding to fill out the duration.
+- The output video resolution & frame rate can be adjusted; the default is 1080P (1920x1080, 16:9) at 30fps.

 ## install(安装)
 ### Install miniconda or another Python management tool

-
+- Download the installer from the official site
+- Configure or switch the download mirror

 ### Create a Python environment (3.9 recommended)

@@ -33,17 +35,59 @@ pip install -r requirements.txt

 ## configuration(配置)

-### bert model
+### Configuration file

-### cn_clip model
+- The settings in ./data/conf/config.json are loaded first as the system configuration.
+- If **./data/conf/config.json** is not found, the ./common/config-template.json configuration file is loaded instead.

-### voc model
+```python
+{
+"version":"v0.1.0",
+"audio":"ms-tts", # audio module: use a self-trained & self-hosted model, or a third-party service (the default)
+
+"clip_model_name":"ViT-H-14", # download/storage directory for the cn_clip library and model files
+"download_root":"./clip_cn",
+"device":"cuda",
+
+"sentence_size": 256, # maximum length of a split sentence
+"negativate_class": "other",
+
+"max_size": 224, # tied to the cn_clip model; the ViT-H-14 window size is 224, so images/video frames are resized to 224
+
+"img_top_n": 5, # default video & image retrieval settings, adjustable from the web UI
+"video_top_n": 3,
+"threshold": 0.6,
+"d_value": 0.1,
+"max_lenght": 20,
+
+"image_vs":"./data/vs/image_vs/", # vector store directories: one holds image features, the other video features
+"video_vs":"./data/vs/video_vs/",
+
+"voc_conf":"./data/conf/voc.json", # model config file loaded when the audio option is voc
+"voc_model":"G_54000.pth", # the self-trained TTS model used by voc
+
+"sample_rate":44100, # audio sample rate

-### config.json
+"cache_path":"./data/cache", # cache directory
+"output_path": "./data/output", # output directory for video & audio results

-- openi: configure the token
+"mirror": "openi", # mirror used when downloading the bert models; OpenI: https://openi.org.cn/
+"openi_token": "", # the token for openi

-## using
+"port":7800
+}
+```
+
+### bert model

-![main UI](./images/main.png)
+The language models used by the voc module, supporting the ZH, EN and JP languages. Not needed when the audio option is ms-tts.
+
+### cn_clip model
+
+The Chinese CLIP model, used for cross-modal retrieval. See the [GitHub project](https://github.com/OFA-Sys/Chinese-CLIP) for more information.
+
+The CN-CLIP ViT-H/14 model file can be downloaded in advance: [Download](https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/clip_cn_vit-h-14.pt)
+
+### voc model

+The self-trained speech model. Not needed when the audio option is ms-tts.
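
An editorial sketch (not part of this commit): the cn_clip section above only gives the checkpoint download link. Loading the retrieval model with the `clip_model_name` / `download_root` / `device` values from the config might look like the following; `load_from_name` is the public Chinese-CLIP API, everything else simply mirrors config-template.json.

```python
# Minimal loading sketch; values mirror config-template.json.
import torch
from cn_clip.clip import load_from_name

device = "cuda" if torch.cuda.is_available() else "cpu"
# Downloads clip_cn_vit-h-14.pt into ./clip_cn on first use if it is not already present.
model, preprocess = load_from_name("ViT-H-14", device=device, download_root="./clip_cn")
model.eval()
```

In this repository the equivalent call is wrapped by `load_chinese_clip(conf.clip_model_name, conf.download_root)` in worker/search.py (see its diff below).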

common/conf.py

+3-2
@@ -4,6 +4,7 @@
 import functools
 
 available_setting = {
+    "audio":"voc", # speech synthesis module; voc: use the self-trained model, ms-tts: use Microsoft's online TTS service
     "device": "cuda",
 
     "clip_model_name": "ViT-H-14", # chinese clip model: the selected model
@@ -38,7 +39,7 @@
 
     # model mirror source, huggingface by default; using the openi mirror requires openi_token
     "mirror": "openi",
-    "openi_token": "e6ea9886c4b70aaf4b62d6efe444fa574744b5ac", # openi token
+    "openi_token": "", # openi token
 
     "tmp_dir":'./tmp/' #
 }
@@ -77,7 +78,7 @@ def __getattr__(self, name):
 
 def load_config(config_path):
     if not os.path.exists(config_path):
-        config_path = "./data/conf/config-template.json"
+        config_path = "./common/config-template.json"
 
     config_str = read_file(config_path)
 
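
With the fallback above, a missing ./data/conf/config.json silently drops back to the defaults in common/config-template.json. A minimal sketch (an assumption, not part of this commit) of creating a local config from the template and switching the TTS backend:

```python
# Sketch only: copy the bundled template to the location load_config() checks first,
# then flip the "audio" option between "ms-tts" and "voc".
import json
import os
import shutil

os.makedirs("./data/conf", exist_ok=True)
if not os.path.exists("./data/conf/config.json"):
    shutil.copy("./common/config-template.json", "./data/conf/config.json")

with open("./data/conf/config.json", encoding="utf-8") as f:
    cfg = json.load(f)
cfg["audio"] = "voc"  # use the self-trained Bert-VITS2 model instead of the online ms-tts service
with open("./data/conf/config.json", "w", encoding="utf-8") as f:
    json.dump(cfg, f, ensure_ascii=False, indent=4)
```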

common/config-template.json

+36
@@ -0,0 +1,36 @@
+{
+"version":"v0.1.0",
+
+"audio":"ms-tts",
+
+"clip_model_name":"ViT-H-14",
+"download_root":"./clip_cn",
+"device":"cuda",
+
+"sentence_size": 256,
+"negativate_class": "other",
+
+"max_size": 224,
+
+"img_top_n": 5,
+"video_top_n": 3,
+"threshold": 0.6,
+"d_value": 0.1,
+"max_lenght": 20,
+
+"image_vs":"./data/vs/image_vs/",
+"video_vs":"./data/vs/video_vs/",
+
+"voc_conf":"./data/conf/voc.json",
+"voc_model":"D:\\ai\\audio\\Bert-VITS2\\models\\G_54000.pth",
+
+"sample_rate":44100,
+
+"cache_path":"./data/cache",
+"output_path": "./data/output",
+
+"mirror": "openi",
+"openi_token": "",
+
+"port":7800
+}

voc/audio.py

-2
@@ -50,8 +50,6 @@ def generate_audio(
 
     with torch.no_grad():
         for idx, piece in enumerate(slices):
-            if piece == conf.negativate_class:
-                continue
             audio = infer(
                 piece,
                 sdp_ratio=sdp_ratio,

webui.py

+27-6
@@ -46,10 +46,18 @@ def find(s_text, video_paths, image_paths, threshold, video_length, d_value, ima
     from worker.search import find_image, find_video
     image_paths = convert_path(image_paths)
     video_paths = convert_path(video_paths)
+
+    # retrieval parameter settings
+    kwargs = {}
+    kwargs['threshold'] = threshold
+    kwargs['lenght'] = video_length
+    kwargs['dvalue'] = d_value
+    kwargs['itopN'] = image_top_n
+    kwargs['vtopN'] = video_top_n
 
-    image_s= find_image(s_text, image_paths, threshold, image_top_n)
+    image_s= find_image(s_text, image_paths, **kwargs)
 
-    video_s = find_video(s_text, video_paths, threshold, d_value, video_top_n, video_length)
+    video_s = find_video(s_text, video_paths, **kwargs)
 
     return image_s, video_s
 
@@ -66,8 +74,20 @@ def compose(scripts, video_paths, image_paths, resolution_rate, frame_rate, thre
 
     from worker.compose import synthesis
 
+    # video resolution and frame rate settings
+    kwargs = dict(width=width, height=height, whr=wh_rate, frate=frame_rate)
+    # retrieval parameter settings
+    kwargs['threshold'] = threshold
+    kwargs['lenght'] = video_length
+    kwargs['dvalue'] = d_value
+    kwargs['itopN'] = image_top_n
+    kwargs['vtopN'] = video_top_n
+
+    # tts settings
+    kwargs['speaker'] = 'fangqi'
+    kwargs['language'] = 'ZH'
 
-    result = synthesis(scripts, video_paths, image_paths, width=width, height=height, whr=wh_rate, frate=frame_rate)
+    result = synthesis(scripts, video_paths, image_paths, **kwargs)
 
     return result
 
@@ -122,10 +142,11 @@ def compose(scripts, video_paths, image_paths, resolution_rate, frame_rate, thre
 - 文案与视频素材尽可能匹配
 - 当给定文本匹配素材不足时,采用黑色帧填充
 - 文案目录未给定时,则检索匹配数据库中的全部图片&视频""")
-        with gr.Row():
-            scripts = gr.Textbox(placeholder='请输入脚本(文本)', lines=3, max_lines=20, label='用于视频剪辑的脚本')
+        with gr.Row():
+            with gr.Column():
+                scripts = gr.Textbox(placeholder='请输入脚本(文本)', lines=3, max_lines=20, label='用于视频剪辑的脚本')
             with gr.Column():
-                resolution_rate = gr.Dropdown(choices=list(resolutions.keys()), value='720p[宽]',label='视频分辨率')
+                resolution_rate = gr.Dropdown(choices=list(resolutions.keys()), value='720p:1280x720',label='视频分辨率')
                 frame_rate = gr.Slider(minimum=24, maximum=60, value=30, step=1, label="视频帧率")
             with gr.Column():
                 btn_compose = gr.Button(value='剪辑视频')

worker/compose.py

+23-16
@@ -12,11 +12,7 @@
 conf = get_conf()
 
 from .text import split_text
-from . import video
-from . import image
-
-
-from voc import audio
+from . import video, image
 
 from worker.search import find_image, find_video
 
@@ -35,21 +31,32 @@ def synthesis(texts, video_folder, img_folder, **kwargs):
 
     logger.info(f'split_text: {docs}')
 
-    audio_results = audio.generate_audio(
-        docs,
-        sdp_ratio=0.2,
-        noise_scale=0.6,
-        noise_scale_w=0.8,
-        length_scale=1.0,
-        speaker='fangqi',
-        language='ZH'
+    # when converting to audio, drop the last item ("other") that the text-splitting step appends
+    if conf.audio == 'voc':
+        from voc import audio
+        audio_results = audio.generate_audio(
+            docs[:-1],
+            sdp_ratio=0.2,
+            noise_scale=0.6,
+            noise_scale_w=0.8,
+            length_scale=1.0,
+            speaker=kwargs['speaker'],
+            language=kwargs['language'],
+        )
+    else:
+        # call the ms tts API to generate the speech
+        from . import audio
+        audio_results = audio.generate_audio(
+            docs[:-1], _rate=0, _volume=0,
+            _lang='Auto', _gender='女',
+            sample_rate=conf.sample_rate,
     )
 
     # video search
-    video_results = find_video(texts, video_folder)
+    video_results = find_video(texts, video_folder, **kwargs)
 
     # image search
-    image_results = find_image(texts, img_folder)
+    image_results = find_image(texts, img_folder, **kwargs)
 
     docs_videos = [] # record the video files for each text segment
 
@@ -123,7 +130,7 @@ def synthesis(texts, video_folder, img_folder, **kwargs):
     # concatenate and process the audio
     audio_file = conf.output_path + f'/audio_{now}.wav'
     audios = [ item[1] for item in audio_results]
-    audio_file = audio.concat_audios(audios[:-1], audio_file)
+    audio_file = audio.concat_audios(audios, audio_file)
 
     # concatenate and process the video
     ret_video = concat_fragments(docs_videos, -1, docs, conf.cache_path)
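
The docs[:-1] slices above exist because split_text() appends the negative class label (conf.negativate_class, "other") as the last element; dropping it before TTS is what makes the per-piece skip removed from voc/audio.py and the earlier audios[:-1] in concat_audios unnecessary. A small illustration with made-up values (the (text, wav) pair shape of audio_results is an assumption):

```python
# Illustration only; the strings are invented, the structure mirrors the diff.
docs = ["第一段文案", "第二段文案", "other"]  # split_text() output: segments plus the negative class
tts_input = docs[:-1]                          # -> ["第一段文案", "第二段文案"], what generate_audio() now receives

audio_results = [("第一段文案", "a_0.wav"), ("第二段文案", "a_1.wav")]  # assumed (text, wav) pairs
audios = [item[1] for item in audio_results]   # no trailing entry to strip, so concat_audios(audios, ...) is enough
```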

worker/search.py

+10-21
@@ -236,10 +236,13 @@ def update_image_db(model, preprocess, image_paths, db):
     if len(img_fs):
         db.index(img_fs) # add the new images to the database
 
-def find_image(text, image_paths, threshold=0.5, topn=3):
+def find_image(text, image_paths, **kwargs):
     '''
     image_paths : list of image directories
     '''
+    threshold = kwargs.get('threshold', conf.threshold)
+    topn = kwargs.get('itopN', conf.img_top_n)
+
     # load the model
     model, preprocess = load_chinese_clip(conf.clip_model_name, conf.download_root)
     # read the database
@@ -318,9 +321,14 @@ def update_video_db(model, preprocess, db, video_paths):
     if video_f:
         db.index(video_f)
 
-def find_video(text,video_paths, threshold=0.5, dvalue=0.1, topn=3, length=10):
+def find_video(text,video_paths, **kwargs):
     '''
     '''
+    threshold = kwargs.get('threshold', conf.threshold)
+    dvalue = kwargs.get('dvalue', conf.d_value)
+    topn = kwargs.get('vtopN', conf.video_top_n)
+    length = kwargs.get('lenght', conf.max_lenght)
+
     # load the model
     model, preprocess = load_chinese_clip(conf.clip_model_name, conf.download_root)
     # read the database
@@ -379,26 +387,7 @@ def find_video(text,video_paths, threshold=0.5, dvalue=0.1, topn=3, length=10):
         s_results[kw] = index_list
 
     logger.info('find video results: {}'.format(s_results))
-    # video_r = {}
-    # for kw, index_list in s_results:
-    # videos = []
-    # for idx, item in enumerate(index_list):
-    # left, right = index_list[idx]['leftIndex'], index_list[idx]['rightIndex']
-    # # duration = right - left
-    # start = getTime(left) # convert it to standard time
-
-    # max_index = item['maxImage']['index']
-    # uri = item['maxImage']['uri']
-    # score = item['maxImage']['score']
-
-    # output = "{}/clip_{}_{}.mp4".format(conf.tmp_dir, kw, idx)
 
-    # logger.info('cut video:{} from: {} to: {}. maxImage:{}\noutput:{}'.format(uri, left, right, max_index, output))
-    # cutVideo(start,right-left, uri, output) # cut the video
-
-    # videos.append({'uri':output, 'score':score, 'origin_video':uri, 'range':[left, right]})
-
-    # video_r[kw] = videos
     return s_results
 
 

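A usage sketch (editorial, not from the commit): with the keyword-argument form, callers such as webui.py pass only what they want to override, and anything omitted falls back to the loaded config (conf.threshold, conf.d_value, conf.video_top_n, conf.max_lenght). Key names must match exactly, including the 'lenght' spelling used throughout the code; the query string and directory below are placeholders.

```python
from worker.search import find_video

video_dirs = ["./data/videos/"]  # placeholder; same shape as the video_paths argument used by webui.py

# Explicit overrides, e.g. values coming from the web UI sliders:
results = find_video("query text", video_dirs,
                     threshold=0.5, dvalue=0.1, vtopN=3, lenght=15)

# Or rely entirely on the config defaults:
results = find_video("query text", video_dirs)
```
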
worker/video.py

+5-3
@@ -126,11 +126,11 @@ def imgs_to_video(images, lenght, output, **kwargs):
         input_images += f'-i {dst} '
 
     frame_rate = len(images)/lenght
-
+    rate = kwargs.get('frate', 30) #
     # -s 1080x1920
     # concatenate the images in the cache directory in order
-    logger.info(f'图片至视频:ffmpeg -framerate {frame_rate} -f image2 -i {cache_path}/image_%02d.jpg -c:v libx264 -t {lenght} -r 30 -pix_fmt yuv420p {output}')
-    os.system(f'ffmpeg -framerate {frame_rate} -f image2 -i {cache_path}/image_%02d.jpg -c:v libx264 -t {lenght} -r 30 -pix_fmt yuv420p {output}')
+    logger.info(f'图片至视频:ffmpeg -framerate {frame_rate} -f image2 -i {cache_path}/image_%02d.jpg -c:v libx264 -t {lenght} -r {rate} -pix_fmt yuv420p {output}')
+    os.system(f'ffmpeg -framerate {frame_rate} -f image2 -i {cache_path}/image_%02d.jpg -c:v libx264 -t {lenght} -r {rate} -pix_fmt yuv420p {output}')
 
 
 def concat_videos(videos, output):
@@ -172,8 +172,10 @@ def compose(video_f, audio_f, output):
     '''
     Combine the audio and video
     '''
+    # '-filter_complex "[0:v]=[v];[1:a]=[a];[v][a]concat=n=1:v=1:a=1" -c:v libx264 -c:a acc -movfalgs +faststart'
     logger.info(f'compose video&audio: ffmpeg -i {video_f} -i {audio_f} -c:v copy -c:a aac -strict experimental {output}')
     os.system(f'ffmpeg -i {video_f} -i {audio_f} -c:v copy -c:a aac -strict experimental {output}')
+    # os.system(f'ffmpeg -i {video_f} -i {audio_f} -filter_complex "[0:v]=[v];[1:a]=[a];[v][a]concat=n=1:v=1:a=1" -c:v libx264 -c:a acc -movfalgs +faststart -strict experimental {output}')
 
 def scale_video(video_path, output, **kwargs):
     '''
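
A hedged side note on compose() above: the same mux command could go through subprocess.run instead of os.system, which avoids shell-quoting problems when paths contain spaces. The ffmpeg flags are exactly those in the diff; only the -y overwrite flag is added here.

```python
import subprocess

def compose_subprocess(video_f, audio_f, output):
    # Same flags as compose() above: copy the video stream, encode the audio as AAC.
    # -y is added so an existing output file is overwritten instead of aborting.
    cmd = ["ffmpeg", "-y", "-i", video_f, "-i", audio_f,
           "-c:v", "copy", "-c:a", "aac", "-strict", "experimental", output]
    subprocess.run(cmd, check=True)  # raises CalledProcessError if ffmpeg fails
```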
