forked from ksw0306/FloWaveNet
preprocessing.py
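"""Preprocessing for FloWaveNet training data.

Reads an LJSpeech-style corpus (metadata.csv plus a wavs/ directory) and, for
each utterance, saves the padded raw waveform and a normalized log-mel
spectrogram as .npy files, then writes a train.txt index to the output directory.
"""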
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
import librosa
from multiprocessing import cpu_count
import argparse


def build_from_path(in_dir, out_dir, num_workers=1):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    index = 1
    # metadata.csv holds one utterance per line with '|'-separated fields
    # (file id | raw text | normalized text); the normalized text is used.
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            futures.append(executor.submit(
                partial(_process_utterance, out_dir, index, wav_path, text)))
            index += 1
    return [future.result() for future in futures]


def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array and normalize its peak amplitude:
    wav, sr = librosa.load(wav_path, sr=22050)
    wav = wav / np.abs(wav).max() * 0.999
    out = wav
    constant_values = 0.0
    out_dtype = np.float32

    n_fft = 1024
    hop_length = 256
    reference = 20.0  # reference level in dB
    min_db = -100     # dynamic-range floor in dB

    # Compute a mel-scale spectrogram from the wav, transposed to shape
    # (N frames, 80 mel bins):
    mel_spectrogram = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft, hop_length=hop_length,
                                                     n_mels=80, fmin=125, fmax=7600).T
    # mel_spectrogram = np.round(mel_spectrogram, decimals=2)
    # Map to a log scale relative to `reference` and rescale [min_db, 0] to [0, 1]:
    mel_spectrogram = 20 * np.log10(np.maximum(1e-4, mel_spectrogram)) - reference
    mel_spectrogram = np.clip((mel_spectrogram - min_db) / (-min_db), 0, 1)

    # Zero-pad the waveform so its length becomes a multiple of hop_length:
    pad = (out.shape[0] // hop_length + 1) * hop_length - out.shape[0]
    pad_l = pad // 2
    pad_r = pad // 2 + pad % 2
    out = np.pad(out, (pad_l, pad_r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * hop_length

    # Time resolution adjustment: ensure the length of the raw audio is a
    # multiple of hop_length so that transposed convolution can upsample
    # the mel spectrogram to the waveform length.
    out = out[:N * hop_length]
    assert len(out) % hop_length == 0
    timesteps = len(out)

    # Write the audio and the spectrogram to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return audio_filename, mel_filename, timesteps, text


def preprocess(in_dir, out_dir, num_workers):
    os.makedirs(out_dir, exist_ok=True)
    metadata = build_from_path(in_dir, out_dir, num_workers)
    write_metadata(metadata, out_dir)


def write_metadata(metadata, out_dir):
    # Each entry is (audio_filename, mel_filename, timesteps, text), written as
    # one '|'-separated line of train.txt.
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    frames = sum([m[2] for m in metadata])
    sr = 22050
    hours = frames / sr / 3600
    print('Wrote %d utterances, %d time steps (%.2f hours)' % (len(metadata), frames, hours))
    print('Max input length: %d' % max(len(m[3]) for m in metadata))
    print('Max output length: %d' % max(m[2] for m in metadata))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Preprocessing',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--in_dir', '-i', type=str, default='./', help='In Directory')
    parser.add_argument('--out_dir', '-o', type=str, default='./', help='Out Directory')
    args = parser.parse_args()
    num_workers = cpu_count()
    preprocess(args.in_dir, args.out_dir, num_workers)
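
# A minimal usage sketch (the corpus path below is illustrative and assumes the
# standard LJSpeech-1.1 layout with metadata.csv and a wavs/ folder):
#
#   python preprocessing.py -i ./LJSpeech-1.1 -o ./ljspeech_processed
#
# This writes ljspeech-audio-*.npy, ljspeech-mel-*.npy and train.txt into the
# --out_dir folder, using one worker process per CPU core.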