# transcribe.py
# Forked from walterevansaugusta/nurseryRhymes
#Credit to https://towardsdatascience.com/auto-transcribe-google-speech-api-time-offsets-in-python-7c24c4db3720
import argparse
import io
import os
import shutil
import subprocess

from google.oauth2 import service_account
#tqdm will let us watch progress
from tqdm import tqdm
# Service-account credentials, loaded from 'api-key.json' once at module load.
# The path is relative, so it is resolved against the current working
# directory. transcribe_file_with_word_time_offsets passes this object to
# speech.SpeechClient.
credentials = service_account.Credentials.from_service_account_file('api-key.json')
def transcribe_file_with_word_time_offsets(speech_file, language,
                                           parts_dir='parts/',
                                           segment_seconds=50):
    """Transcribe every WAV segment in ``parts_dir`` and save word timings.

    Each segment is sent to the Google Cloud Speech API as LINEAR16 audio with
    word time offsets enabled. One line per recognised word is collected in
    the form ``Word: <word>, start_time: <s>, end_time: <s>``; the combined
    text is printed and written to ``<speech_file>/transcript.txt``.

    Uses the module-level ``credentials`` object to authenticate the client.

    Args:
        speech_file: Directory that receives ``transcript.txt``.
        language: Language code handed to the recognizer, e.g. ``"en-US"``.
        parts_dir: Directory holding the audio segments. Segments are
            processed in sorted filename order.
        segment_seconds: Length of one segment in seconds. Word times from
            the N-th segment (0-based) are shifted by ``N * segment_seconds``
            so that they refer to positions in the whole recording.

    Returns:
        None. The transcript is delivered through stdout and the output file.
    """
    # Function-scope imports as in the original, but executed once instead of
    # once per segment.
    # NOTE(review): the ``enums``/``types`` modules and the positional
    # ``recognize(config, audio)`` call belong to the pre-2.0
    # google-cloud-speech client - confirm the pinned library version.
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    # The client and the recognition config do not depend on the segment, so
    # build them a single time.
    print("checking credentials")
    client = speech.SpeechClient(credentials=credentials)
    print("Checked")
    print("config start")
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code=language,
        enable_word_time_offsets=True)

    # Only WAV files are segments; a stray file (e.g. Thumbs.db) would be
    # rejected by the API and would also shift the offsets of later segments.
    segments = sorted(
        entry for entry in os.listdir(parts_dir)
        if entry.lower().endswith('.wav'))

    lines = []
    for index, segment in enumerate(segments):
        # Seconds of audio that precede this segment in the full recording.
        offset = index * segment_seconds
        print("Start")
        print(segment)
        with io.open(os.path.join(parts_dir, segment), 'rb') as audio_file:
            content = audio_file.read()
        print("audio file read")
        audio = types.RecognitionAudio(content=content)
        print("Recognizing:")
        response = client.recognize(config, audio)
        print("Recognized")
        for result in response.results:
            if not result.alternatives:
                # Nothing recognised for this stretch of audio.
                continue
            # Only the first alternative of each result is used.
            alternative = result.alternatives[0]
            print('Transcript: {}'.format(alternative.transcript))
            for word_info in alternative.words:
                start_time = word_info.start_time
                end_time = word_info.end_time
                # Times arrive as seconds + nanoseconds; fold them into
                # float seconds and shift by the segment's offset.
                lines.append(
                    'Word: {}, start_time: {}, end_time: {}\n'.format(
                        word_info.word,
                        start_time.seconds + start_time.nanos * 1e-9 + offset,
                        end_time.seconds + end_time.nanos * 1e-9 + offset))

    transcript = ''.join(lines)
    print(transcript)
    name = os.path.join(speech_file, 'transcript.txt')
    # Explicit UTF-8 so non-ASCII words do not depend on the platform's
    # default encoding.
    with open(name, "w", encoding="utf-8") as g:
        g.write(transcript)
# Driver script. For every rhyme folder Rhyme_packs/<pack>/<rhyme>/ it splits
# that folder's audio.mp3 into 50-second mono WAV segments under parts/ with
# ffmpeg, transcribes the segments (the transcript is written into the rhyme
# folder), and then removes the temporary segments.
print("Copying mp3s to wavs")
for f in tqdm(os.listdir('Rhyme_packs/')):
    if not os.path.isdir('Rhyme_packs/' + f):
        # A stray file next to the pack folders cannot be listed.
        continue
    for filename in tqdm(os.listdir('Rhyme_packs/' + f + "/")):
        print(filename)
        location = 'Rhyme_packs/' + f + "/" + filename
        loc = location + "/audio.mp3"
        print(location)
        if not os.path.isfile(loc):
            print("No audio.mp3 in " + location + "; skipping")
            continue

        # Start from an empty parts/ directory so segments left behind by an
        # interrupted earlier run are never transcribed into this rhyme.
        shutil.rmtree('parts', ignore_errors=True)
        os.makedirs('parts', exist_ok=True)

        # Argument list instead of a shell string: paths containing spaces
        # or shell metacharacters are passed to ffmpeg untouched. "-y" must
        # come before the output file; ffmpeg ignores trailing options.
        # The 50-second segment length has to match the per-segment offset
        # step used by transcribe_file_with_word_time_offsets.
        command = ['ffmpeg', '-y', '-i', loc, '-ac', '1',
                   '-f', 'segment', '-segment_time', '50',
                   'parts/%09d.wav']
        try:
            split_ok = subprocess.run(command).returncode == 0
        except OSError as err:
            # Raised when the ffmpeg executable cannot be started at all.
            print("Could not run ffmpeg: {}".format(err))
            split_ok = False

        if split_ok:
            transcribe_file_with_word_time_offsets(location, "en-US")
        else:
            print("ffmpeg failed for " + loc + "; skipping transcription")

        # Portable replacement for the Windows-only "del parts /F /Q".
        shutil.rmtree('parts', ignore_errors=True)