-
Notifications
You must be signed in to change notification settings - Fork 1
/
remix.py
112 lines (84 loc) · 3.19 KB
/
remix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import argparse
import csv
from pydub import AudioSegment
# Command-line interface. Options are registered in the order they should
# appear in --help; parsing happens immediately at module level.
parser = argparse.ArgumentParser(
    description="Remix a WAV file based on a diarisation CSV file",
)

# Required inputs.
parser.add_argument("--ds", required=True, help="CSV file with diarisation")
parser.add_argument("--input", required=True, help="Input WAV file")

# Boolean switches (off unless given).
parser.add_argument(
    "--speakers",
    help="Generate audio with samples of all speakers",
    action="store_true",
)
parser.add_argument(
    "--sample",
    help="Generate audio using only first 5 minutes",
    action="store_true",
)

# Optional speaker filters; each takes a single comma-separated value.
parser.add_argument("--include-speakers", help="Speakers to include, comma-separated")
parser.add_argument("--exclude-speakers", help="Speakers to exclude, comma-separated")

args = parser.parse_args()
def main():
    """Remix the input WAV according to the diarisation CSV and export it.

    Reads the WAV named by ``args.input`` and the CSV named by ``args.ds``,
    works out which speakers to keep, then builds either the filtered
    recording (default) or a per-speaker sample track (``--speakers``).
    The result is written next to the input file, with the input's extension
    replaced by ``.onesided.wav`` or ``.speakers.wav`` respectively.
    """
    import os  # local import: only needed to derive the output file name

    full_audio = AudioSegment.from_wav(args.input)

    # newline="" is what the csv module asks for when it is handed a file
    # object, so that newlines embedded in quoted fields are parsed correctly.
    with open(args.ds, newline="") as csvfile:
        reader = csv.reader(csvfile)
        speakers = get_speakers(reader, args.include_speakers, args.exclude_speakers)

        # get_speakers() consumed the whole file; rewind so the same reader
        # can deliver the rows a second time.
        csvfile.seek(0)

        if not args.speakers:
            segments = build_audio(reader, speakers, full_audio)
            output_filename_suffix = ".onesided.wav"
        else:
            segments = build_speakers_sample(reader, speakers, full_audio)
            output_filename_suffix = ".speakers.wav"

    # Replace only the final extension. A plain str.replace(".wav", ...) would
    # rewrite every ".wav" in the path and, when the input has no lowercase
    # ".wav" at all (e.g. "talk.WAV"), would return the input path unchanged,
    # so the export would overwrite the source recording.
    root, _ext = os.path.splitext(args.input)
    output_filename = root + output_filename_suffix
    segments.export(output_filename, format="wav")
def build_audio(csv_reader, speakers, full_audio):
    """Concatenate the parts of ``full_audio`` spoken by the chosen speakers.

    Args:
        csv_reader: Iterable of ``(start, end, speaker)`` rows; start and end
            are multiplied by 1000 to obtain the slice positions.
        speakers: Collection of speaker labels whose rows are kept.
        full_audio: Audio object that is sliced with those positions.

    Returns:
        An AudioSegment made of every kept slice, in row order. When the
        ``--sample`` flag is set, processing stops at the first row that
        starts after the five-minute mark.
    """
    five_minutes_ms = 5 * 60 * 1000
    total_ms = float(len(full_audio))
    remix = AudioSegment.empty()

    for raw_start, raw_end, label in csv_reader:
        begin_ms = float(raw_start) * 1000
        finish_ms = float(raw_end) * 1000

        # Fix clipping of first sample: a row starting within the first half
        # second is extended back to the very beginning.
        if begin_ms < 500:
            begin_ms = 0

        if args.sample and begin_ms > five_minutes_ms:
            break

        if speaker_is_skipped := label not in speakers:
            continue

        # Progress indicator: position of this row within the whole recording.
        print(f"{begin_ms / total_ms*100:.0f}%")
        remix += full_audio[begin_ms:finish_ms]

    return remix
def build_speakers_sample(csv_reader, speakers, full_audio):
    """Build a preview track containing one short excerpt per speaker.

    Speakers are handled in sorted label order. For each one, the first row
    of that speaker that is still longer than one second after being capped
    at ten seconds is used. Every excerpt is preceded by ``audio/chime.wav``
    and by ``audio/num-NN.wav``, where ``NN`` is the last two characters of
    the speaker label.

    Args:
        csv_reader: Iterable of ``(start, end, speaker)`` rows; start and end
            are multiplied by 1000 to obtain millisecond slice positions.
        speakers: Collection of speaker labels to sample.
        full_audio: Audio object that is sliced with those positions.

    Returns:
        An AudioSegment with the announcements and excerpts concatenated.
        Speakers without a qualifying row contribute nothing.
    """
    # Materialise the rows once. A csv reader is a one-shot iterator, so
    # looping over it again for every speaker would resume wherever the
    # previous speaker's search stopped: earlier rows would be missed, and
    # once the reader was exhausted all remaining speakers would get nothing.
    rows = list(csv_reader)

    segments = AudioSegment.empty()
    chime = AudioSegment.from_wav("audio/chime.wav")

    for tgt_speaker in sorted(speakers):
        for start, end, speaker in rows:
            start = float(start) * 1000
            end = float(end) * 1000

            # Fix clipping of first sample
            if start < 500:
                start = 0

            # Limit samples to 10 seconds
            if end - start > 10000:
                end = start + 10000

            # Take the first usable (> 1 second) row of this speaker only.
            if speaker == tgt_speaker and end - start > 1000:
                segments += chime
                segments += AudioSegment.from_wav(f"audio/num-{tgt_speaker[-2:]}.wav")
                segments += full_audio[start:end]
                break

    return segments
def get_speakers(csv_reader, include_speakers, exclude_speakers):
    """Return the set of speaker labels selected for processing.

    Args:
        csv_reader: Iterable of three-field rows; the third field is taken
            as the speaker label.
        include_speakers: If truthy, passed to ``parse_speakers`` and the
            result is intersected with the labels found in the rows.
        exclude_speakers: If truthy, passed to ``parse_speakers`` and the
            resulting labels are removed from the selection.

    Returns:
        A set of speaker labels.
    """
    # Unpacking (rather than indexing) keeps the requirement that every row
    # has exactly three fields.
    selected = {label for _start, _end, label in csv_reader}

    if include_speakers:
        selected &= parse_speakers(include_speakers)
    if exclude_speakers:
        selected -= parse_speakers(exclude_speakers)

    return selected
def parse_speakers(speakers):
    """Convert a comma-separated speaker selection into speaker labels.

    Args:
        speakers: Either a single string such as ``"0,2"`` (what argparse
            supplies for a plain option) or an iterable of such strings.

    Returns:
        A set of labels of the form ``SPEAKER_NN``, with the number
        zero-padded to two digits.

    Raises:
        ValueError: If a non-empty entry is not an integer.
    """
    # A bare string has to be wrapped before joining: joining it directly
    # inserts a comma between every character, so "10" was read as speakers
    # 1 and 0, and "1,2" produced empty entries that int() rejects.
    if isinstance(speakers, str):
        speakers = [speakers]

    tokens = (token.strip() for token in ",".join(speakers).split(","))
    # Empty entries (e.g. from a trailing comma) are skipped.
    return {f"SPEAKER_{int(token):02d}" for token in tokens if token}
# Entry point: the remix runs unconditionally as soon as this line executes.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this file would also start a run — confirm that this is intended.
main()