-
Notifications
You must be signed in to change notification settings - Fork 1
/
wavenet_util.py
233 lines (195 loc) · 6.77 KB
/
wavenet_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import librosa.display as dsp
import numpy as np
import tensorflow as tf
def _assert_valid_input_type(s):
assert s == 'mulaw-quantize' or s == 'mulaw' or s == 'raw'
def is_mulaw_quantize(s):
_assert_valid_input_type(s)
return s == 'mulaw-quantize'
def is_mulaw(s):
_assert_valid_input_type(s)
return s == 'mulaw'
def is_raw(s):
_assert_valid_input_type(s)
return s == 'raw'
def is_scalar_input(s):
return is_raw(s) or is_mulaw(s)
#From https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/preprocessing/generic.py
def mulaw(x, mu=256):
"""Mu-Law companding
Method described in paper [1]_.
.. math::
f(x) = sign(x) ln (1 + mu |x|) / ln (1 + mu)
Args:
x (array-like): Input signal. Each value of input signal must be in
range of [-1, 1].
mu (number): Compression parameter ``μ``.
Returns:
array-like: Compressed signal ([-1, 1])
See also:
:func:`nnmnkwii.preprocessing.inv_mulaw`
:func:`nnmnkwii.preprocessing.mulaw_quantize`
:func:`nnmnkwii.preprocessing.inv_mulaw_quantize`
.. [1] Brokish, Charles W., and Michele Lewis. "A-law and mu-law companding
implementations using the tms320c54x." SPRA163 (1997).
"""
mu = 255
return _sign(x) * _log1p(mu * _abs(x)) / _log1p(mu)
def inv_mulaw(y, mu=256):
"""Inverse of mu-law companding (mu-law expansion)
.. math::
f^{-1}(x) = sign(y) (1 / mu) (1 + mu)^{|y|} - 1)
Args:
y (array-like): Compressed signal. Each value of input signal must be in
range of [-1, 1].
mu (number): Compression parameter ``μ``.
Returns:
array-like: Uncomprresed signal (-1 <= x <= 1)
See also:
:func:`nnmnkwii.preprocessing.inv_mulaw`
:func:`nnmnkwii.preprocessing.mulaw_quantize`
:func:`nnmnkwii.preprocessing.inv_mulaw_quantize`
"""
mu = 255
return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0)
def mulaw_quantize(x, mu=256):
"""Mu-Law companding + quantize
Args:
x (array-like): Input signal. Each value of input signal must be in
range of [-1, 1].
mu (number): Compression parameter ``μ``.
Returns:
array-like: Quantized signal (dtype=int)
- y ∈ [0, mu] if x ∈ [-1, 1]
- y ∈ [0, mu) if x ∈ [-1, 1)
.. note::
If you want to get quantized values of range [0, mu) (not [0, mu]),
then you need to provide input signal of range [-1, 1).
Examples:
>>> from scipy.io import wavfile
>>> import pysptk
>>> import numpy as np
>>> from nnmnkwii import preprocessing as P
>>> fs, x = wavfile.read(pysptk.util.example_audio_file())
>>> x = (x / 32768.0).astype(np.float32)
>>> y = P.mulaw_quantize(x)
>>> print(y.min(), y.max(), y.dtype)
15 246 int64
See also:
:func:`nnmnkwii.preprocessing.mulaw`
:func:`nnmnkwii.preprocessing.inv_mulaw`
:func:`nnmnkwii.preprocessing.inv_mulaw_quantize`
"""
mu = 255
y = mulaw(x, mu)
# scale [-1, 1] to [0, mu]
return _asint((y + 1) / 2 * mu)
def inv_mulaw_quantize(y, mu=256):
"""Inverse of mu-law companding + quantize
Args:
y (array-like): Quantized signal (∈ [0, mu]).
mu (number): Compression parameter ``μ``.
Returns:
array-like: Uncompressed signal ([-1, 1])
Examples:
>>> from scipy.io import wavfile
>>> import pysptk
>>> import numpy as np
>>> from nnmnkwii import preprocessing as P
>>> fs, x = wavfile.read(pysptk.util.example_audio_file())
>>> x = (x / 32768.0).astype(np.float32)
>>> x_hat = P.inv_mulaw_quantize(P.mulaw_quantize(x))
>>> x_hat = (x_hat * 32768).astype(np.int16)
See also:
:func:`nnmnkwii.preprocessing.mulaw`
:func:`nnmnkwii.preprocessing.inv_mulaw`
:func:`nnmnkwii.preprocessing.mulaw_quantize`
"""
# [0, m) to [-1, 1]
mu = 255
y = 2 * _asfloat(y) / mu - 1
return inv_mulaw(y, mu)
def _sign(x):
#wrapper to support tensorflow tensors/numpy arrays
isnumpy = isinstance(x, np.ndarray)
isscalar = np.isscalar(x)
return np.sign(x) if (isnumpy or isscalar) else tf.sign(x)
def _log1p(x):
#wrapper to support tensorflow tensors/numpy arrays
isnumpy = isinstance(x, np.ndarray)
isscalar = np.isscalar(x)
return np.log1p(x) if (isnumpy or isscalar) else tf.log1p(x)
def _abs(x):
#wrapper to support tensorflow tensors/numpy arrays
isnumpy = isinstance(x, np.ndarray)
isscalar = np.isscalar(x)
return np.abs(x) if (isnumpy or isscalar) else tf.abs(x)
def _asint(x):
#wrapper to support tensorflow tensors/numpy arrays
isnumpy = isinstance(x, np.ndarray)
isscalar = np.isscalar(x)
return x.astype(np.int) if isnumpy else int(x) if isscalar else tf.cast(x, tf.int32)
def _asfloat(x):
#wrapper to support tensorflow tensors/numpy arrays
isnumpy = isinstance(x, np.ndarray)
isscalar = np.isscalar(x)
return x.astype(np.float32) if isnumpy else float(x) if isscalar else tf.cast(x, tf.float32)
def sequence_mask(input_lengths, max_len=None, expand=True):
if max_len is None:
max_len = tf.reduce_max(input_lengths)
if expand:
return tf.expand_dims(tf.sequence_mask(input_lengths, max_len, dtype=tf.float32), axis=-1)
return tf.sequence_mask(input_lengths, max_len, dtype=tf.float32)
def waveplot(path, y_hat, y_target, hparams, title=None):
sr = hparams.sample_rate
fig = plt.figure(figsize=(12, 4))
if y_target is not None:
ax = plt.subplot(3, 1, 1)
dsp.waveplot(y_target, sr=sr)
ax.set_title('Target waveform')
ax = plt.subplot(3, 1, 2)
dsp.waveplot(y_hat, sr=sr)
ax.set_title('Predicted waveform')
else:
ax = plt.subplot(2, 1, 1)
dsp.waveplot(y_hat, sr=sr)
ax.set_title('Generated waveform')
if title is not None:
# Set common labels
fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
plt.tight_layout()
plt.savefig(path, format="png")
plt.close()
def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
if max_len is not None:
target_spectrogram = target_spectrogram[:max_len]
pred_spectrogram = pred_spectrogram[:max_len]
if split_title:
title = split_title_line(title)
fig = plt.figure(figsize=(10, 8))
# Set common labels
fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
#target spectrogram subplot
if target_spectrogram is not None:
ax1 = fig.add_subplot(311)
ax2 = fig.add_subplot(312)
if auto_aspect:
im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
else:
im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
ax1.set_title('Target Mel-Spectrogram')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
ax2.set_title('Predicted Mel-Spectrogram')
else:
ax2 = fig.add_subplot(211)
if auto_aspect:
im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
else:
im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)
plt.tight_layout()
plt.savefig(path, format='png')
plt.close()