DiffSinger/scripts/vocode.py at main · dankyu-fly/DiffSinger · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# coding=utf8
import argparse
import os
import pathlib
import sys

# Absolute path of the directory two levels above this file (the parent of
# the folder containing this script).
root_dir = pathlib.Path(__file__).parent.parent.resolve()
# Expose that directory through PYTHONPATH in this process's environment
# (presumably so that spawned child processes can import the project too).
os.environ['PYTHONPATH'] = str(root_dir)
# Put it first on sys.path; this has to run before the `inference` / `utils`
# imports below so that they resolve against that directory.
sys.path.insert(0, str(root_dir))

import numpy as np
import torch
import tqdm

from inference.ds_acoustic import DiffSingerAcousticInfer
from utils.infer_utils import cross_fade, save_wav
from utils.hparams import set_hparams, hparams

# Command line interface: one positional input file plus optional string flags.
parser = argparse.ArgumentParser(description='Run DiffSinger vocoder')
parser.add_argument('mel', type=str, help='Path to the input file')
# Every optional flag has the same shape (optional string), so register them
# from a table; the order here is the order shown in --help.
for _flag, _help in (
    ('--exp', 'Read vocoder class and path from chosen experiment'),
    ('--config', 'Read vocoder class and path from config file'),
    ('--class', 'Specify vocoder class'),
    ('--ckpt', 'Specify vocoder checkpoint path'),
    ('--out', 'Path of the output folder'),
    ('--title', 'Title of output file'),
):
    parser.add_argument(_flag, type=str, required=False, help=_help)
args = parser.parse_args()

# Input file and the base name of the output file (--title overrides the
# input file's stem).
mel = pathlib.Path(args.mel)
name = args.title if args.title else mel.stem

# Locate the config file: --exp takes precedence over --config when both are
# given. One of them is mandatory.
if args.exp:
    config = root_dir / 'checkpoints' / args.exp / 'config.yaml'
elif args.config:
    config = pathlib.Path(args.config)
else:
    # Report through argparse (prints usage, exits with status 2) instead of
    # `assert`, which is stripped when Python runs with -O and would let the
    # script continue without a config.
    parser.error("Either argument '--exp' or '--config' should be specified.")

# Replace this script's own CLI arguments with just `--config <path>` before
# calling set_hparams() -- presumably set_hparams() parses sys.argv itself
# (confirm in utils.hparams).
sys.argv = [sys.argv[0], '--config', str(config)]
set_hparams(print_hparams=False)

# Command line overrides for the vocoder settings; a value is applied only
# when the corresponding flag was actually given (i.e. is truthy).
# 'class' is a Python keyword, so that attribute must be read via getattr().
_overrides = {
    'vocoder': getattr(args, 'class'),
    'vocoder_ckpt': args.ckpt,
}
for _key, _value in _overrides.items():
    if _value:
        hparams[_key] = _value


# Output folder: the one given via --out, otherwise the folder of the input file.
out = pathlib.Path(args.out) if args.out else mel.parent

# NOTE(security): torch.load() unpickles the file, so only feed this script
# mel files that come from a trusted source.
# map_location='cpu' lets files that were saved from a GPU be read on machines
# without one; each segment's tensors are moved to the inference device in
# run_vocoder() anyway.
mel_seq = torch.load(mel, map_location='cpu')
# Validate with real exceptions rather than `assert`, which is stripped when
# Python runs with -O.
if not isinstance(mel_seq, list):
    raise TypeError('Not a valid mel sequence.')
if not mel_seq:
    raise ValueError('Mel sequence is empty.')

sample_rate = hparams['audio_sample_rate']
# load_model=False: the inference object is created without loading a model;
# this script only calls its run_vocoder() method and reads its device.
infer_ins = DiffSingerAcousticInfer(load_model=False)


def run_vocoder(path: pathlib.Path):
    """
    Vocode every segment of the module-level ``mel_seq`` and save the result.

    Each segment's ``'mel'`` and ``'f0'`` entries are passed through
    ``infer_ins.run_vocoder`` and the resulting audio is placed at sample
    index ``round(offset * sample_rate)``. If a segment starts at or after the
    end of the audio assembled so far, the gap is filled with zeros; if it
    starts earlier, it is merged in with ``cross_fade`` at its start index.

    :param path: file the assembled audio is written to (via ``save_wav``)
    """
    mixed = np.zeros(0)
    rendered = 0  # number of samples assembled so far

    for segment in tqdm.tqdm(mel_seq, desc='mel segment', total=len(mel_seq)):
        wav = infer_ins.run_vocoder(
            segment['mel'].to(infer_ins.device),
            f0=segment['f0'].to(infer_ins.device)
        )
        wav = wav.squeeze(0).cpu().numpy()

        gap = round(segment['offset'] * sample_rate) - rendered
        start = rendered + gap  # sample index at which this segment begins
        if gap < 0:
            # The segment begins before the end of the assembled audio.
            mixed = cross_fade(mixed, wav, start)
        else:
            # Pad the (possibly empty) gap with zeros, then attach the segment.
            mixed = np.append(np.append(mixed, np.zeros(gap)), wav)
        rendered = start + wav.shape[0]

    print(f'| save audio: {path}')
    save_wav(mixed, path, sample_rate)


# Make sure the output folder exists, then vocode into `<out>/<name>.wav`.
os.makedirs(out, exist_ok=True)
try:
    run_vocoder(out / (name + '.wav'))
except KeyboardInterrupt:
    # Ctrl+C: stop without a traceback. Use sys.exit() rather than the bare
    # `exit()` helper, which is injected by the `site` module and is not
    # available in every environment (e.g. `python -S` or frozen builds).
    sys.exit(-1)