speech_to_text.py
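"""Transcribe a WAV file with VAD segmentation, speaker diarization and
DeepSpeech speech-to-text.

The script splits the input audio into voiced segments (wavTranscriber),
labels each segment with a speaker (speaker_diarization), runs DeepSpeech on
every segment, and writes a timestamped transcript next to the input file
(same name, .txt extension), one line per segment such as:

    0:00:01.230 - 0:00:04.560 Speaker 0: <recognized text>

(the exact speaker label depends on what speaker_diarization.diarize returns).

Usage:
    python speech_to_text.py --audio path/to/recording.wav
"""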
import sys
import os
import logging
import argparse
import datetime

import numpy as np

import wavTranscriber
import speaker_diarization

# Debug helpers: configure a console handler so DEBUG messages are emitted
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    args = parser.parse_args(args)

    # Run VAD on the input file
    waveFile = args.audio
    logger.debug("Loading wav file %s" % waveFile)
    segments, sample_rate, audio_length = wavTranscriber.vad_segment_generator(waveFile, 3)

    logger.debug("Processing speaker diarization")
    segments = speaker_diarization.diarize(segments)

    f = open(waveFile.replace(".wav", ".txt"), 'w')
logger.debug("Processing speech recognition")
# Point to a path containing the pre-trained models & resolve ~ if used
model_path = os.path.expanduser("deepspeech/models/")
# Resolve all the paths of model files
output_graph, alphabet, lm, trie = wavTranscriber.resolve_models(model_path)
# Load output_graph, alpahbet, lm and trie
model_retval = wavTranscriber.load_model(output_graph, alphabet, lm, trie)
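    # model_retval is indexed below as: [0] loaded model, [1] model load time,
    # [2] language-model load time (reported in the summary table at the end)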
    inference_time = 0.0
    for i, segment in enumerate(segments):
        # Run deepspeech on the chunk that just completed VAD
        logger.debug("[Speech recognition] Processing chunk %002d" % (i,))
        audio = np.frombuffer(segment.bytes, dtype=np.int16)
        output = wavTranscriber.stt(model_retval[0], audio, sample_rate)
        inference_time += output[1]
        logger.debug("[Speech recognition] Transcript: %s" % output[0])
        # str(timedelta) carries microseconds for fractional segment times;
        # [:-3] trims the timestamps down to millisecond precision
        f.write("%s - %s Speaker %s: %s\n" % (str(datetime.timedelta(seconds=round(segment.begin, 3)))[:-3],
                                              str(datetime.timedelta(seconds=round(segment.end, 3)))[:-3],
                                              segment.speaker, output[0]))
    # Summary of the files processed
    f.close()
    logger.debug("Saved transcript @: %s" % waveFile.replace(".wav", ".txt"))

    # Extract filename and extension from the full file path
    filename, ext = os.path.splitext(os.path.basename(waveFile))
    title_names = ['Filename', 'Duration(s)', 'Inference Time(s)', 'Model Load Time(s)', 'LM Load Time(s)']
    logger.debug("************************************************************************************************************")
    logger.debug("%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
    logger.debug("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))
    logger.debug("************************************************************************************************************")

    print("\n%-30s %-20s %-20s %-20s %s" % (title_names[0], title_names[1], title_names[2], title_names[3], title_names[4]))
    print("%-30s %-20.3f %-20.3f %-20.3f %-0.3f" % (filename + ext, audio_length, inference_time, model_retval[1], model_retval[2]))


if __name__ == '__main__':
    main(sys.argv[1:])