-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
116 lines (93 loc) · 3.97 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
Defines various functions for processing the data.
"""
import numpy as np
import soundfile
from numpy.lib.stride_tricks import as_strided
from char_map import char_map, index_map
from python_speech_features import fbank
from python_speech_features import lifter
from python_speech_features import dct
def calc_feat_dim(window, max_freq):
return int(0.001 * window * max_freq) + 1
def conv_output_length(input_length, filter_size, border_mode, stride,
dilation=1):
if input_length is None:
return None
assert border_mode in {'same', 'valid'}
dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
if border_mode == 'same':
output_length = input_length
elif border_mode == 'valid':
output_length = input_length - dilated_filter_size + 1
print(output_length)
return (output_length + stride - 1) // stride
def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
assert not np.iscomplexobj(samples), "Must not pass in complex numbers"
window = np.hanning(fft_length)[:, None]
window_norm = np.sum(window ** 2)
scale = window_norm * sample_rate
trunc = (len(samples) - fft_length) % hop_length
x = samples[:len(samples) - trunc]
# "stride trick" reshape to include overlap
nshape = (fft_length, (len(x) - fft_length) // hop_length + 1)
nstrides = (x.strides[0], x.strides[0] * hop_length)
x = as_strided(x, shape=nshape, strides=nstrides)
# window stride sanity check
assert np.all(x[:, 1] == samples[hop_length:(hop_length + fft_length)])
# broadcast window, compute fft over columns and square mod
x = np.fft.rfft(x * window, axis=0)
x = np.absolute(x) ** 2
# scale, 2.0 for everything except dc and fft_length/2
x[1:-1, :] *= (2.0 / scale)
x[(0, -1), :] /= scale
freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])
return x, freqs
def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
eps=1e-14):
with soundfile.SoundFile(filename) as sound_file:
audio = sound_file.read(dtype='float32')
sample_rate = sound_file.samplerate
if audio.ndim >= 2:
audio = np.mean(audio, 1)
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
raise ValueError("max_freq must not be greater than half of "
" sample rate")
if step > window:
raise ValueError("step size must not be greater than window size")
hop_length = int(0.001 * step * sample_rate)
fft_length = int(0.001 * window * sample_rate)
pxx, freqs = spectrogram(
audio, fft_length=fft_length, sample_rate=sample_rate,
hop_length=hop_length)
ind = np.where(freqs <= max_freq)[0][-1] + 1
return np.transpose(np.log(pxx[:ind, :] + eps))
def mfcc_from_file(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13,
nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True,
winfunc=lambda x: np.ones((x,))):
feat, energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq, preemph, winfunc)
feat = np.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:, :numcep]
feat = lifter(feat, ceplifter)
if appendEnergy: feat[:, 0] = np.log(energy) # replace first cepstral coefficient with log of frame energy
return feat
def text_to_int_sequence(text):
""" Convert text to an integer sequence """
int_sequence = []
for c in text:
if c == ' ':
ch = char_map['<SPACE>']
else:
ch = char_map[c]
int_sequence.append(ch)
return int_sequence
def int_sequence_to_text(int_sequence):
""" Convert an integer sequence to text """
print(int_sequence)
text = []
for c in int_sequence:
ch = index_map[c]
text.append(ch)
return text