# Create a valid dataset for the training script
# using wav files provided by the user.
# Example usage:
#   python3 prep_wav.py -f input.wav target.wav -l "RNN-aidadsp-1"
# Each file pair is split 70% / 15% / 15%
# and used to populate the train, test and val sets,
# so that the training, testing and validation phases
# see different data, as described in the paper.
# If the user provides multiple wav file pairs, e.g. guitar_in.wav guitar_tg.wav bass_in.wav bass_tg.wav,
# then 70% of guitar_in.wav is concatenated with 70% of bass_in.wav, and so on.
# If the guitar and bass files have the same length, the same amount
# of guitar and bass material ends up in the training set.
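#
# For example (illustrative durations, not from the repo): with two 10-minute
# pairs, the training input becomes 7 min of guitar followed by 7 min of bass
# (14 min total), while test and val each get 1.5 min + 1.5 min = 3 min.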
import CoreAudioML.miscfuncs as miscfuncs
from CoreAudioML.dataset import audio_converter, audio_splitter
import numpy as np
import argparse
import os
from colab_functions import save_wav, parse_csv, peak, align_target
import librosa


def nonConditionedWavParse(args):
print("Using config file %s" % args.load_config)
file_name = ""
configs = miscfuncs.json_load(args.load_config, args.config_location)
try:
file_name = configs['file_name']
except KeyError:
print("Error: config file doesn't have file_name defined")
exit(1)
    try:
        blip_offset = configs['blip_offset']
    except KeyError:
        print("Warning: config file doesn't have blip_offset defined")
        blip_offset = 0
    try:
        blip_locations = configs['blip_locations']
    except KeyError:
        print("Warning: config file doesn't have blip_locations defined")
        blip_locations = None
    try:
        blip_window = configs['blip_window']
    except KeyError:
        print("Warning: config file doesn't have blip_window defined")
        blip_window = None
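    # The blip_* settings feed align_target() from colab_functions:
    # blip_locations marks where the calibration blips are expected in the
    # target, blip_window bounds the search around each one, and blip_offset
    # applies an extra shift. Their exact semantics and units live in
    # colab_functions (they appear to be sample counts), not in this script.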
if args.denoise:
from colab_functions import denoise
train_in = np.ndarray([0], dtype=np.float32)
train_tg = np.ndarray([0], dtype=np.float32)
test_in = np.ndarray([0], dtype=np.float32)
test_tg = np.ndarray([0], dtype=np.float32)
val_in = np.ndarray([0], dtype=np.float32)
val_tg = np.ndarray([0], dtype=np.float32)
for in_file, tg_file in zip(args.files[::2], args.files[1::2]):
#print("Input file name: %s" % in_file)
in_data, in_rate = librosa.load(in_file, sr=None, mono=True)
#print("Target file name: %s" % tg_file)
tg_data, tg_rate = librosa.load(tg_file, sr=None, mono=True)
#print("Input rate: %d length: %d [samples]" % (in_rate, in_data.size))
#print("Target rate: %d length: %d [samples]" % (tg_rate, tg_data.size))
        if in_rate != tg_rate:
            print("Error: input and target sample rates must match")
            exit(1)
        if in_rate != 48000:
            print("Converting audio sample rate to 48kHz.")
            in_data = librosa.resample(in_data, orig_sr=in_rate, target_sr=48000)
            tg_data = librosa.resample(tg_data, orig_sr=tg_rate, target_sr=48000)
        # After this point both streams are guaranteed to be at 48kHz
        rate = 48000
x_all = audio_converter(in_data)
y_all = audio_converter(tg_data)
        # Auto-align the target to the input using the calibration blips
        if blip_locations and blip_window:
            y_all_aligned = align_target(tg_data=y_all, blip_offset=blip_offset, blip_locations=tuple(blip_locations), blip_window=blip_window)
            if y_all_aligned is not None:
                y_all = y_all_aligned
            else:
                print("Error: could not estimate the alignment delay")
                exit(1)
        else:
            print("Warning: auto-alignment disabled (blip_locations/blip_window not set)")
        if x_all.size != y_all.size:
            min_size = min(x_all.size, y_all.size)
            print("Warning: lengths of %s and %s do not match, truncating both to %d samples" % (in_file, tg_file, min_size))
            # np.resize to the smaller size truncates the longer array
            x_all = np.resize(x_all, min_size)
            y_all = np.resize(y_all, min_size)
# Noise reduction, using CPU
if args.denoise:
y_all = denoise(waveform=y_all)
        # Normalization: scale the target so its peak level matches the input's
        if args.norm:
            in_lvl = peak(x_all)
            y_all = peak(y_all, in_lvl)
# Default to 70% 15% 15% split
if not args.csv_file:
splitted_x = audio_splitter(x_all, [0.70, 0.15, 0.15])
splitted_y = audio_splitter(y_all, [0.70, 0.15, 0.15])
        else:
            # The CSV file must share the input file's base name (e.g. foo.wav -> foo.csv)
            [train_bounds, test_bounds, val_bounds] = parse_csv(os.path.splitext(in_file)[0] + ".csv")
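            # parse_csv is expected to return, per section (train/test/val), a list
            # of [start, end] bounds; audio_splitter is called with unit='s', so the
            # bounds are in seconds. The exact CSV layout is defined by parse_csv
            # in colab_functions.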
splitted_x = [np.ndarray([0], dtype=np.float32), np.ndarray([0], dtype=np.float32), np.ndarray([0], dtype=np.float32)]
splitted_y = [np.ndarray([0], dtype=np.float32), np.ndarray([0], dtype=np.float32), np.ndarray([0], dtype=np.float32)]
for bounds in train_bounds:
splitted_x[0] = np.append(splitted_x[0], audio_splitter(x_all, bounds, unit='s'))
splitted_y[0] = np.append(splitted_y[0], audio_splitter(y_all, bounds, unit='s'))
for bounds in test_bounds:
splitted_x[1] = np.append(splitted_x[1], audio_splitter(x_all, bounds, unit='s'))
splitted_y[1] = np.append(splitted_y[1], audio_splitter(y_all, bounds, unit='s'))
for bounds in val_bounds:
splitted_x[2] = np.append(splitted_x[2], audio_splitter(x_all, bounds, unit='s'))
splitted_y[2] = np.append(splitted_y[2], audio_splitter(y_all, bounds, unit='s'))
train_in = np.append(train_in, splitted_x[0])
train_tg = np.append(train_tg, splitted_y[0])
test_in = np.append(test_in, splitted_x[1])
test_tg = np.append(test_tg, splitted_y[1])
val_in = np.append(val_in, splitted_x[2])
val_tg = np.append(val_tg, splitted_y[2])
print("Saving processed wav files into dataset")
save_wav("Data/train/" + file_name + "-input.wav", rate, train_in)
save_wav("Data/train/" + file_name + "-target.wav", rate, train_tg)
save_wav("Data/test/" + file_name + "-input.wav", rate, test_in)
save_wav("Data/test/" + file_name + "-target.wav", rate, test_tg)
save_wav("Data/val/" + file_name + "-input.wav", rate, val_in)
save_wav("Data/val/" + file_name + "-target.wav", rate, val_tg)
def conditionedWavParse(args):
print("Using config file %s" % args.load_config)
file_name = ""
configs = miscfuncs.json_load(args.load_config, args.config_location)
try:
file_name = configs['file_name']
except KeyError:
print("Error: config file doesn't have file_name defined")
exit(1)
try:
blip_offset = configs['blip_offset']
except KeyError:
print("Warning: config file doesn't have blip_offset defined")
blip_offset = 0
try:
blip_locations = configs['blip_locations']
except KeyError:
print("Warning: config file doesn't have blip_locations defined")
blip_locations = None
try:
blip_window = configs['blip_window']
except KeyError:
print("Warning: config file doesn't have blip_window defined")
blip_window = None
if args.denoise:
from colab_functions import denoise
    params = configs['params']
    all_train_in = np.array([[]]*(1 + params['n']), dtype=np.float32)  # 1 channel of input audio plus n parameter channels
    all_train_tg = np.array([[]], dtype=np.float32)  # 1 channel of target (output) audio
    all_test_in = np.array([[]]*(1 + params['n']), dtype=np.float32)  # 1 channel of input audio plus n parameter channels
    all_test_tg = np.array([[]], dtype=np.float32)  # 1 channel of target (output) audio
    all_val_in = np.array([[]]*(1 + params['n']), dtype=np.float32)  # 1 channel of input audio plus n parameter channels
    all_val_tg = np.array([[]], dtype=np.float32)  # 1 channel of target (output) audio
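    # Assumed config structure for the conditioned case, inferred from the
    # reads below (values are illustrative, not from the repo):
    #   "params": {
    #     "n": 2,
    #     "datasets": [
    #       {"input": "in_a.wav", "target": "tg_a.wav", "params": [0.0, 0.5]},
    #       {"input": "in_b.wav", "target": "tg_b.wav", "params": [1.0, 0.5]}
    #     ]
    #   }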
for entry in params['datasets']:
#print("Input file name: %s" % entry['input'])
in_data, in_rate = librosa.load(entry['input'], sr=None, mono=True)
#print("Target file name: %s" % entry['target'])
tg_data, tg_rate = librosa.load(entry['target'], sr=None, mono=True)
#print("Input rate: %d length: %d [samples]" % (in_rate, in_data.size))
#print("Target rate: %d length: %d [samples]" % (tg_rate, tg_data.size))
        if in_rate != tg_rate:
            print("Error: input and target sample rates must match")
            exit(1)
        if in_rate != 48000:
            print("Converting audio sample rate to 48kHz.")
            in_data = librosa.resample(in_data, orig_sr=in_rate, target_sr=48000)
            tg_data = librosa.resample(tg_data, orig_sr=tg_rate, target_sr=48000)
        # After this point both streams are guaranteed to be at 48kHz
        rate = 48000
x_all = audio_converter(in_data)
y_all = audio_converter(tg_data)
        # Auto-align the target to the input using the calibration blips
        if blip_locations and blip_window:
            y_all_aligned = align_target(tg_data=y_all, blip_offset=blip_offset, blip_locations=tuple(blip_locations), blip_window=blip_window)
            if y_all_aligned is not None:
                y_all = y_all_aligned
            else:
                print("Error: could not estimate the alignment delay")
                exit(1)
        else:
            print("Warning: auto-alignment disabled (blip_locations/blip_window not set)")
        if x_all.size != y_all.size:
            min_size = min(x_all.size, y_all.size)
            print("Warning: lengths of %s and %s do not match, truncating both to %d samples" % (entry['input'], entry['target'], min_size))
            # np.resize to the smaller size truncates the longer array
            x_all = np.resize(x_all, min_size)
            y_all = np.resize(y_all, min_size)
# Noise reduction, using CPU
if args.denoise:
y_all = denoise(waveform=y_all)
        # Normalization: scale the target so its peak level matches the input's
        if args.norm:
            in_lvl = peak(x_all)
            y_all = peak(y_all, in_lvl)
# Default to 70% 15% 15% split
if not args.csv_file:
splitted_x = audio_splitter(x_all, [0.70, 0.15, 0.15])
splitted_y = audio_splitter(y_all, [0.70, 0.15, 0.15])
        else:
            # The CSV file must share the input file's base name (see the note in nonConditionedWavParse)
            [train_bounds, test_bounds, val_bounds] = parse_csv(os.path.splitext(entry['input'])[0] + ".csv")
splitted_x = [np.ndarray([0], dtype=np.float32), np.ndarray([0], dtype=np.float32), np.ndarray([0], dtype=np.float32)]
splitted_y = [np.ndarray([0], dtype=np.float32), np.ndarray([0], dtype=np.float32), np.ndarray([0], dtype=np.float32)]
for bounds in train_bounds:
splitted_x[0] = np.append(splitted_x[0], audio_splitter(x_all, bounds, unit='s'))
splitted_y[0] = np.append(splitted_y[0], audio_splitter(y_all, bounds, unit='s'))
for bounds in test_bounds:
splitted_x[1] = np.append(splitted_x[1], audio_splitter(x_all, bounds, unit='s'))
splitted_y[1] = np.append(splitted_y[1], audio_splitter(y_all, bounds, unit='s'))
for bounds in val_bounds:
splitted_x[2] = np.append(splitted_x[2], audio_splitter(x_all, bounds, unit='s'))
splitted_y[2] = np.append(splitted_y[2], audio_splitter(y_all, bounds, unit='s'))
# Initialize lists to handle the number of parameters
params_train = []
params_val = []
params_test = []
# Create a list of np arrays of the parameter values
for val in entry["params"]:
# Create the parameter arrays
params_train.append(np.array([val]*len(splitted_x[0]), dtype=np.float32))
params_test.append(np.array([val]*len(splitted_x[1]), dtype=np.float32))
params_val.append(np.array([val]*len(splitted_x[2]), dtype=np.float32))
# Convert the lists to numpy arrays
params_train = np.array(params_train, dtype=np.float32)
params_val = np.array(params_val, dtype=np.float32)
params_test = np.array(params_test, dtype=np.float32)
        # Append the audio and parameters to the full data sets
all_train_in = np.append(all_train_in, np.append([splitted_x[0]], params_train, axis=0), axis = 1)
all_train_tg = np.append(all_train_tg, splitted_y[0])
all_test_in = np.append(all_test_in, np.append([splitted_x[1]], params_test, axis=0), axis = 1)
all_test_tg = np.append(all_test_tg, splitted_y[1])
all_val_in = np.append(all_val_in, np.append([splitted_x[2]], params_val, axis=0), axis = 1)
all_val_tg = np.append(all_val_tg, splitted_y[2])
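    # Each all_*_in array now has shape (1 + n, total_samples): row 0 is the
    # audio, rows 1..n carry the per-sample parameter values. save_wav receives
    # the transpose, producing (total_samples, 1 + n) multichannel wav files.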
# Save the wav files
save_wav("Data/train/" + file_name + "-input.wav", rate, all_train_in.T, flatten=False)
save_wav("Data/test/" + file_name + "-input.wav", rate, all_test_in.T, flatten=False)
save_wav("Data/val/" + file_name + "-input.wav", rate, all_val_in.T, flatten=False)
save_wav("Data/train/" + file_name + "-target.wav", rate, all_train_tg)
save_wav("Data/test/" + file_name + "-target.wav", rate, all_test_tg)
save_wav("Data/val/" + file_name + "-target.wav", rate, all_val_tg)


def main(args):
    if args.files:
        if (len(args.files) % 2) and not args.parameterize:
            print("Error: input and target files must be provided in pairs, see --help")
            exit(1)
    if args.parameterize:
        conditionedWavParse(args)
    else:
        nonConditionedWavParse(args)
    print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--files', '-f', nargs='+', help='provide input/target files in pairs, e.g. guitar_in.wav guitar_tg.wav bass_in.wav bass_tg.wav')
    parser.add_argument('--load_config', '-l', default='RNN-aidadsp-1',
                        help="Path to a JSON config file; arguments listed in the config file replace the defaults")
    parser.add_argument('--csv_file', '-csv', action=argparse.BooleanOptionalAction, default=False, help='Use a CSV file for the split bounds')
    parser.add_argument('--config_location', '-cl', default='Configs', help='Location of the "Configs" directory')
    parser.add_argument('--parameterize', '-p', action=argparse.BooleanOptionalAction, default=False, help='Perform parameterized (conditioned) dataset preparation')
    parser.add_argument('--norm', '-n', action=argparse.BooleanOptionalAction, default=False, help='Normalize target tracks so that they match the volume of the input tracks')
    parser.add_argument('--denoise', '-dn', action=argparse.BooleanOptionalAction, default=False, help='Perform noise removal on target tracks using the noisereduce package')
    args = parser.parse_args()
    main(args)
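
# Illustrative invocations (file names are placeholders):
#   python3 prep_wav.py -f input.wav target.wav -l "RNN-aidadsp-1"   # plain 70/15/15 split
#   python3 prep_wav.py -f input.wav target.wav -csv                 # bounds read from input.csv
#   python3 prep_wav.py -p -l "RNN-aidadsp-1"                        # parameterized; pairs come from the config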