# audio_gen.py
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.io import wavfile  # needed by write_wav_files even when imported as a module
# human hearing constants
# JND_AZIM = 0.017453 # FIXME 1 degree in rad now, but should be the smallest in literature at angle 0 - not used yet here
# JND_AMPL = 3. # FIXME 3db for now
# JND_FREQ = 2. # FIXME 2 Hz for now
MIN_FREQ = 100.
MAX_AMPL = 1.
# we perceive 10 octaves total (if fs is 44100), and have 64 oct/sec sensitivity
# with a 10 ms section length, that's a 0.64 oct change max
# f of 0->1 here is 0->10 octaves (if fs is 44100), so 10/0.64=15.6th of the spectrum can change within 10ms
# 1/15.6 = 0.064 is the max dfreq for one modulation, if 5 modulations, max dfreq is around 0.32
# if fs < 44100 or frequency max < 20kHz, the spectrum is smaller, so max dfreq > 0.32
# if section length < 10ms, max dfreq should get lowered
MAX_DFREQ = 0.5 # max change summed up across all modulations
MAX_DA = 0.5 # max change across all modulations
MAX_DAZIM = np.pi / 2. # across all modulations, in radian
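# rough worked example of the MAX_DFREQ bound, using the 64 oct/sec and
# 10-octave assumptions stated above:
#   0.010 s * 64 oct/s = 0.64 oct per 10 ms section
#   0.64 oct / 10 oct ≈ 0.064 of the normalized spectrum per modulation
#   5 modulations * 0.064 ≈ 0.32, so MAX_DFREQ = 0.5 is a looser cap on the sum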
ALPHA_A = 0.3 # alpha parameter of amplitude distribution
# A_F = 92. # 'A' parameter of frequency distribution, defining the spread of freq in use, 163 for humans
# 92. --> 11603 Hz max
# 87. --> 10978 Hz max
# 60. --> 7600 Hz max # defined below
HEAD_RADIUS = 8.75 / 100. # m
SPEED_OF_SOUND = 343. # m/s
# Woodworth model: from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3985894/
def ITD2(angle, freq):  # freq is unused in this frequency-independent model
    return HEAD_RADIUS / SPEED_OF_SOUND * (angle + tf.sin(angle))
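# sanity check: at angle = pi/2, ITD2 = 0.0875/343 * (pi/2 + 1) ≈ 0.66 ms,
# roughly the commonly cited maximum human interaural time difference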
# fitted to Figure 1 of https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0089033
# in dB: ILD = (f/10000)^(0.42)*sin(ang)*20
# B = 20*log10(p1/p0) from https://en.wikipedia.org/wiki/Sound_localization
# p1/p0 = 10^(B/20)
# as a pressure ratio: left/right = 10^((f/10000)^(0.42)*sin(ang)*20/20) = 10^((f/10000)^(0.42)*sin(ang))
def ILD2(angle, freq):  # my own constructed ILD that works for + and - angles
    return tf.pow(10., tf.pow(freq / 10000., 0.42) * tf.sin(angle))
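# sanity check: at f = 10 kHz and angle = pi/2, ILD2 = 10^1 = 10x pressure ratio,
# i.e. 20 dB, matching the high-frequency ILD magnitude of the fitted curve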
# upsample to stream length: df, da, dazim
def upsample_1d(x, nsoundstream, to_size, name, dtype):
    x = tf.expand_dims(x, axis=-1)
    x = tf.image.resize_bilinear(x, [nsoundstream, to_size])  # sound stream dimension stays intact
    return tf.cast(tf.reshape(x, [-1, nsoundstream, to_size]), dtype=dtype, name=name + '_upsampled')
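# e.g. a [batch, nsoundstream, nmodulation] tensor of per-section values becomes
# [batch, nsoundstream, soundstream_len], linearly interpolated along time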
# soundscape = k x soundstream = k x nmodulation x soundsection
# if delta is constant, just repeat it nmodulation times and provide it as input
def build_audio_synthetizer(f0_in, df_in, a0_in, da_in, azim0_in, dazim_in, phase_in, fs, dtype, batch_size, stream_params):
    nsoundstream = stream_params['nsoundstream']
    nmodulation = stream_params['nmodulation']  # number of modulations = number of sound sections
    attack_len = int(stream_params['attack_len_msec'] / 1000. * fs)
    decay_len = int(stream_params['decay_len_msec'] / 1000. * fs)
    section_len = int(stream_params['section_len_msec'] / 1000. * fs)
    soundstream_len = int(nmodulation * section_len)
    soundscape_len = int(soundstream_len * stream_params['soundscape_len_by_stream_len'])

    # time and logarithmic attack/decay ramp
    t = tf.cast(tf.range(start=0, limit=soundstream_len, delta=1) / fs, dtype=dtype, name='t')
    ramper = tf.concat([tf.constant(np.logspace(-1., 0., attack_len), dtype=dtype),
                        tf.ones(soundstream_len - attack_len - decay_len, dtype=dtype),
                        tf.constant(np.logspace(0., -1., decay_len), dtype=dtype)], axis=0)
    # freq
    # from https://www.ncbi.nlm.nih.gov/pubmed/2373794
    # CF = A(10^(αx) − k), CF distribution in Hz, x is [0,1], alpha=2.1, k=0.85, A=165.4 for humans
    # A is lowered from the human 165.4 to cap the max freq (A_F chosen by fs below)
    if fs > 44000:  # choose the width of the freq distribution according to the sampling freq
        A_F = 92.
    elif fs > 22000:
        A_F = 87.
    else:  # 16k
        A_F = 60.
    df = tf.cumsum(df_in * MAX_DFREQ / nmodulation, axis=-1) + f0_in
    df = tf.clip_by_value(df, 0., 1.)
    raw_df = A_F * (tf.pow(10., 2.1 * df) - 0.85) + MIN_FREQ
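    # sanity check: at df = 1, raw_df = A_F*(10^2.1 - 0.85) + MIN_FREQ, giving
    # ~11604 Hz for A_F=92, ~10979 Hz for A_F=87, ~7603 Hz for A_F=60,
    # consistent with the A_F comment table above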
    # amplitude
    # TODO apply function on da to account for gain by freq nonlinearities
    # inverse of y = x^α, α=0.3 --> x = y^(1/α)
    da = tf.cumsum(da_in * MAX_DA / nmodulation, axis=-1) + a0_in
    da = tf.nn.relu(da)  # only positive values are sensible
    da = tf.pow(da, 1. / ALPHA_A)  # inverse of the y = x^alpha loudness function
    raw_da = tf.clip_by_value(da, 0., MAX_AMPL)
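    # e.g. with ALPHA_A = 0.3, a perceived loudness of 0.5 maps to an amplitude
    # of 0.5^(1/0.3) ≈ 0.1, undoing the compressive x^0.3 loudness curve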
    # azimuth
    # TODO apply freq dependent accuracy function to dazim
    # note: azim0_in is already tanh-constrained in separate_params, so no extra squashing here
    azim0 = azim0_in * (np.pi / 2.)  # [-1,1] -> [-pi/2,pi/2]
    dazim = dazim_in * MAX_DAZIM / nmodulation  # [-1,1] -> [-max_dazim,max_dazim]
    raw_dazim = tf.cumsum(dazim, axis=-1) + azim0  # FIXME dazim can leave the [-pi/2,pi/2] interval

    # phase
    raw_phase = phase_in  # no scaling
    stream_phase_offset = tf.cast((soundscape_len - soundstream_len) * raw_phase, dtype=tf.int32)  # int cast stops backprop

    df = upsample_1d(raw_df, nsoundstream, soundstream_len, 'df', dtype)
    da = upsample_1d(raw_da, nsoundstream, soundstream_len, 'da', dtype)
    dazim = upsample_1d(raw_dazim, nsoundstream, soundstream_len, 'dazim', dtype)
    if stream_params['binaural']:
        # compute ITD and ILD
        itd0_roll = tf.cast(ITD2(azim0, f0_in) * fs / 2., dtype=tf.int32)  # how much to roll the soundscape to emulate ITD
        itd = ITD2(dazim, df)
        ild = ILD2(dazim, df)  # >1 means the left is louder, <1 means the right is louder
        ild_sqrt = tf.sqrt(ild)  # to influence both channels equally by ILD, and not favour one channel
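        # e.g. a 20 dB ILD (10x pressure ratio) gives ild_sqrt ≈ 3.16: the left
        # channel is scaled by 3.16 and the right by 1/3.16, keeping their ratio at 10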
        # construct sound: push one channel's phase, pull the other's equally
        sound_stream_l = da * tf.sin(2. * np.pi * df * t + itd / 2.)
        sound_stream_r = da * tf.sin(2. * np.pi * df * t - itd / 2.)

        # construct binaural sound: batch x stream x time x chan(l,r)
        sound_stream_l = sound_stream_l * ild_sqrt
        sound_stream_r = sound_stream_r / ild_sqrt
        two_channel_ramp = tf.stack([ramper, ramper], axis=-1)
        sound_stream = tf.stack([sound_stream_l, sound_stream_r], axis=3) * two_channel_ramp  # l;r binaural stream

        # place soundstream in soundscape: zero-pad to soundscape_len, then roll by the phase offset and ITD
        bin_sound_scape = tf.concat([sound_stream, tf.zeros([batch_size, nsoundstream, soundscape_len - soundstream_len, 2], dtype=dtype)], axis=2)
        bin_sound_scape = tf.map_fn(lambda stream_x: tf.map_fn(lambda x: tf.manip.roll(x[0], x[1], axis=[-2]), [stream_x[0], stream_x[1]],
                                                               dtype=dtype), [bin_sound_scape, stream_phase_offset], dtype=dtype)
        sound_scape_l = tf.map_fn(lambda stream_x: tf.map_fn(lambda x: tf.manip.roll(x[0], x[1], axis=[-1]), [stream_x[0], stream_x[1]],
                                                             dtype=dtype), [bin_sound_scape[:, :, :, 0], -itd0_roll], dtype=dtype)
        sound_scape_r = tf.map_fn(lambda stream_x: tf.map_fn(lambda x: tf.manip.roll(x[0], x[1], axis=[-1]), [stream_x[0], stream_x[1]],
                                                             dtype=dtype), [bin_sound_scape[:, :, :, 1], itd0_roll], dtype=dtype)
        bin_sound_scape = tf.stack([sound_scape_l, sound_scape_r], axis=3)
        bin_sound_scape = tf.reduce_sum(bin_sound_scape, axis=1)  # sum the streams
        bin_sound_scape = bin_sound_scape / tf.reduce_max(bin_sound_scape)  # normalize by the global max
    else:  # single channel, no need for binaural sound
        bin_sound_scape = None

    # mono soundscape, built in both modes
    sound_stream = da * tf.sin(2. * np.pi * df * t) * ramper
    sound_scape = tf.concat([sound_stream, tf.zeros([batch_size, nsoundstream, soundscape_len - soundstream_len], dtype=dtype)], axis=-1)
    sound_scape = tf.map_fn(lambda stream_x: tf.map_fn(lambda x: tf.manip.roll(x[0], x[1], axis=[-1]), [stream_x[0], stream_x[1]], dtype=dtype),
                            [sound_scape, stream_phase_offset], dtype=dtype)
    sound_scape = tf.reduce_sum(sound_scape, axis=1)
    sound_scape = sound_scape / tf.reshape(tf.reduce_max(sound_scape, axis=1) + 1e-6, [batch_size, 1])
    return {'sound_scape': sound_scape, 'sound_stream': sound_stream, 'df': df, 'da': da, 'dazim': dazim, 't': t,
            'raw_dazim': raw_dazim, 'raw_da': raw_da, 'raw_df': raw_df, 'raw_phase': raw_phase, 'bin_sound_scape': bin_sound_scape}
def write_wav_files(data, fs, path, prefix):  # see the generation loop in __main__: 20*40*20*4/batch_size files
    data_norm = data / np.max(np.abs(data))  # normalize to [-1,1]
    for i, w in enumerate(data_norm):
        wavfile.write(os.path.join(path, 'v1_10_' + str(prefix) + '_' + str(i) + '.wav'), fs, w)
def nparams_needed(nsoundstream, nmodulation, varying_delta, const_phase):  # batch size is not incorporated
    return (3 if const_phase else 4) * nsoundstream + 3 * nsoundstream * (nmodulation if varying_delta else 1)
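# e.g. nsoundstream=3, nmodulation=4, varying_delta=True, const_phase=True:
# 3*3 + 3*3*4 = 45 params (f0, a0, azim0 per stream; df, da, dazim per modulation)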
def constant_phase(nsoundstream, batch_size):
    # just place the soundstreams evenly
    return np.array([np.reshape(np.linspace(0, 1, nsoundstream), [nsoundstream, 1])] * batch_size)
def separate_params(inp_params, nsoundstream, nmodulation, varying_delta, logging, const_phase_tens):
    # inp_params: batch_size x nparams_needed
    # split to: f0_in, df_in, a0_in, da_in, azim0_in, dazim_in, phase_in
    d_inp_len = nmodulation if varying_delta else 1
    if const_phase_tens is None:
        split_sizes = [nsoundstream, nsoundstream * d_inp_len, nsoundstream, nsoundstream * d_inp_len, nsoundstream,
                       nsoundstream * d_inp_len, nsoundstream]
        f0_in, df_in, a0_in, da_in, azim0_in, dazim_in, phase_in = tf.split(inp_params, split_sizes, axis=1)
        phase_in = tf.layers.dense(phase_in, phase_in.shape[1], activation=tf.nn.sigmoid)  # FIXME try removing the sigmoid and clipping by value instead
    else:  # const phase
        split_sizes = [nsoundstream, nsoundstream * d_inp_len, nsoundstream, nsoundstream * d_inp_len, nsoundstream,
                       nsoundstream * d_inp_len]
        f0_in, df_in, a0_in, da_in, azim0_in, dazim_in = tf.split(inp_params, split_sizes, axis=1)
        phase_in = const_phase_tens  # no need for constraints, it's already in [0,1]

    # constrain tensors: values might be too large, not fitting the [0,1] and [-1,1] intervals needed for audio_gen
    # f0_in, a0_in, phase_in (already handled): [0,1]
    f0_in = tf.layers.dense(f0_in, f0_in.shape[1], activation=tf.nn.sigmoid)  # FIXME try removing the sigmoid and clipping by value instead
    a0_in = tf.layers.dense(a0_in, a0_in.shape[1], activation=tf.nn.sigmoid)
    # azim0_in, df_in, da_in, dazim_in: [-1,1]
    azim0_in = tf.layers.dense(azim0_in, azim0_in.shape[1], activation=tf.nn.tanh)
    df_in = tf.layers.dense(df_in, df_in.shape[1], activation=tf.nn.tanh)
    da_in = tf.layers.dense(da_in, da_in.shape[1], activation=tf.nn.tanh)
    dazim_in = tf.layers.dense(dazim_in, dazim_in.shape[1], activation=tf.nn.tanh)

    if logging:
        tf.summary.histogram('f0', f0_in, family='SYNTH_PARAMS')
        tf.summary.histogram('a0', a0_in, family='SYNTH_PARAMS')
        tf.summary.histogram('phase', phase_in, family='SYNTH_PARAMS')
        tf.summary.histogram('azim0', azim0_in, family='SYNTH_PARAMS')
        tf.summary.histogram('df', df_in, family='SYNTH_PARAMS')
        tf.summary.histogram('da', da_in, family='SYNTH_PARAMS')
        tf.summary.histogram('dazim', dazim_in, family='SYNTH_PARAMS')

    f0_in = tf.reshape(f0_in, [-1, nsoundstream, 1])
    a0_in = tf.reshape(a0_in, [-1, nsoundstream, 1])
    azim0_in = tf.reshape(azim0_in, [-1, nsoundstream, 1])
    phase_in = tf.reshape(phase_in, [-1, nsoundstream, 1])
    df_in = tf.reshape(df_in, [-1, nsoundstream, d_inp_len])
    da_in = tf.reshape(da_in, [-1, nsoundstream, d_inp_len])
    dazim_in = tf.reshape(dazim_in, [-1, nsoundstream, d_inp_len])
    if not varying_delta:  # repeat delta values for each modulation
        df_in = tf.tile(df_in, [1, 1, nmodulation])
        da_in = tf.tile(da_in, [1, 1, nmodulation])
        dazim_in = tf.tile(dazim_in, [1, 1, nmodulation])
    return f0_in, df_in, a0_in, da_in, azim0_in, dazim_in, phase_in
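# e.g. for nsoundstream=3, nmodulation=4, varying_delta=True and constant phase,
# split_sizes = [3, 12, 3, 12, 3, 12], summing to the 45 of nparams_needed above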
# needed in color draw model init
def soundscape_len(audio_params, fs):
    section_len = int(audio_params['section_len_msec'] / 1000. * fs)
    soundstream_len = int(audio_params['nmodulation'] * section_len)
    return int(soundstream_len * audio_params['soundscape_len_by_stream_len'])
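# e.g. with DEFAULT_STREAM_PARAMS below at fs=44100: section_len = 441 samples,
# soundstream_len = 4 * 441 = 1764, soundscape_len = int(1764 * 2.5) = 4410 samples (100 ms)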
def gen_single_input(f0_in, a0_in, azim0_in, phase_in, df_in, da_in, dazim_in,
                     f0_, a0_, azim0_, phase_, df_, da_, dazim_, batch_size=1):
    # note: relies on the module-level nsoundstream defined in __main__;
    # phase_ is ignored, streams are placed at evenly spaced phases instead
    return {
        f0_in: [[[f0_]] * nsoundstream] * batch_size,
        a0_in: [[[a0_]] * nsoundstream] * batch_size,
        azim0_in: [[[azim0_]] * nsoundstream] * batch_size,
        phase_in: constant_phase(nsoundstream, batch_size),  # equal spacing, phase_ unused
        df_in: [[df_] * nsoundstream] * batch_size,
        da_in: [[da_] * nsoundstream] * batch_size,
        dazim_in: [[dazim_] * nsoundstream] * batch_size
    }
def rand_gen_input(f0_in, a0_in, azim0_in, phase_in, df_in, da_in, dazim_in, batch_size, varying_delta, const_phase):
    # note: relies on the module-level nsoundstream and nmodulation defined in __main__
    if not const_phase:
        phase = np.random.rand(batch_size, nsoundstream, 1)
    else:
        phase = constant_phase(nsoundstream, batch_size)
    return {
        f0_in: np.random.rand(batch_size, nsoundstream, 1),
        a0_in: np.random.rand(batch_size, nsoundstream, 1),
        azim0_in: np.random.rand(batch_size, nsoundstream, 1) * 2. - 1.,
        phase_in: phase,
        df_in: (np.random.rand(batch_size, nsoundstream, nmodulation)
                if varying_delta else np.repeat(np.random.rand(batch_size, nsoundstream, 1), nmodulation, 2)) * 2. - 1.,
        da_in: (np.random.rand(batch_size, nsoundstream, nmodulation)
                if varying_delta else np.repeat(np.random.rand(batch_size, nsoundstream, 1), nmodulation, 2)) * 2. - 1.,
        dazim_in: (np.random.rand(batch_size, nsoundstream, nmodulation)
                   if varying_delta else np.repeat(np.random.rand(batch_size, nsoundstream, 1), nmodulation, 2)) * 2. - 1.
    }
# ! NOTE: ONLY USED WHEN RUNNING THIS SCRIPT
DEFAULT_STREAM_PARAMS = {
    'binaural': True,
    'varying_delta': True,
    'nsoundstream': 3,
    'nmodulation': 4,  # number of modulations = number of sound sections
    'section_len_msec': 10,
    'attack_len_msec': 1,
    'decay_len_msec': 1,
    'soundscape_len_by_stream_len': 2.5,  # should be >= 1
    'const_phase': True
}
if __name__ == '__main__':
    import sounddevice

    # constants
    fs = 44100
    dtype = tf.float32
    batch_size = 1
    nsoundstream = DEFAULT_STREAM_PARAMS['nsoundstream']
    nmodulation = DEFAULT_STREAM_PARAMS['nmodulation']
    const_phase = DEFAULT_STREAM_PARAMS['const_phase']

    scape_len = soundscape_len(DEFAULT_STREAM_PARAMS, fs)
    print('soundscape len:', scape_len, 'samples =', scape_len / fs * 1000, 'ms')
    print('params needed:', nparams_needed(nsoundstream, nmodulation, True, const_phase))
    # placeholders of parameters
    f0_in = tf.placeholder(dtype, (batch_size, nsoundstream, 1), name='f0')  # [0,1]
    a0_in = tf.placeholder(dtype, (batch_size, nsoundstream, 1), name='a0')  # [0,1]
    azim0_in = tf.placeholder(dtype, (batch_size, nsoundstream, 1), name='azim0')  # [-1,1], counterclockwise
    phase_in = tf.placeholder(dtype, (batch_size, nsoundstream, 1), name='phase')  # [0,1]
    df_in = tf.placeholder(dtype, (batch_size, nsoundstream, nmodulation), name='df')  # [-1,1]
    da_in = tf.placeholder(dtype, (batch_size, nsoundstream, nmodulation), name='da')  # [-1,1]
    dazim_in = tf.placeholder(dtype, (batch_size, nsoundstream, nmodulation), name='dazim')  # [-1,1]
    placeholders = [f0_in, a0_in, azim0_in, phase_in, df_in, da_in, dazim_in]

    # build synthetizer
    synth_tensors = build_audio_synthetizer(f0_in, df_in, a0_in, da_in, azim0_in, dazim_in, phase_in, fs, dtype, batch_size,
                                            DEFAULT_STREAM_PARAMS)
    # generate input
    varying_delta = False
    inp1 = gen_single_input(*placeholders, f0_=0.3, a0_=0.7, azim0_=0, phase_=0.,
                            df_=[0] * nmodulation, da_=[0] * nmodulation, dazim_=[-0.4] * nmodulation,
                            batch_size=batch_size)
    rand_inp = rand_gen_input(*placeholders, batch_size, varying_delta, const_phase)
    # change here whether to use a randomly generated input (rand_inp) or a tailored one (inp1)
    input_used = rand_inp
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sound_scape_, t_, df_, da_, dazim_ = sess.run([synth_tensors['bin_sound_scape'], synth_tensors['t'],
                                                       synth_tensors['df'], synth_tensors['da'], synth_tensors['dazim']], input_used)
        print('DF', df_)
        print('DA', da_)
        print('DAZIM', dazim_)

        # binaural debug plot:
        # plt.plot(np.arange(len(sound_scape_[0])), sound_scape_[0, :, 0], np.arange(len(sound_scape_[0])), sound_scape_[0, :, 1]); plt.legend(['l', 'r'])
        # plt.figure(); plt.plot(np.arange(len(df_[0])), df_[0])

        # generate wav files
        # for i in range(20*40*20*4 // batch_size):
        #     write_wav_files(sound_scape_, fs, '/media/viktor/0C22201D22200DF0/audio_data/stream', i)

        plt.plot(np.arange(len(sound_scape_[0, :])) / fs * 1000, sound_scape_[0, :] / np.max(sound_scape_[0, :]))
        plt.legend(['left', 'right'])
        plt.xlabel('t (ms)')
        plt.ylabel('A (sound pressure)')

        repeated_soundscape = np.tile(sound_scape_[0, :], [20, 1])  # loop the soundscape 20 times for playback
        sounddevice.play(repeated_soundscape, fs)
        plt.show()