This repository has been archived by the owner on Aug 10, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathaudio.py
128 lines (88 loc) · 3.45 KB
/
audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# from https://github.com/keithito/tacotron/blob/master/util/audio.py
import librosa
import librosa.filters
import math
import numpy as np
from scipy import signal
from hparams import hparams
from scipy.io import wavfile
n_fft = hparams.fft_size * 2
hop_length = hparams.hop_length
win_length = hparams.win_length
def load_wav(path):
return librosa.core.load(path, sr=hparams.sample_rate)[0]
def save_wav(wav, path):
wav *= hparams.audio_max_value / max(0.01, np.max(np.abs(wav)))
wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))
def trim(quantized):
start, end = start_and_end_indices(quantized, hparams.silence_threshold)
return quantized[start:end]
def adjust_time_resolution(quantized, mel):
"""Adjust time resolution by repeating features
Args:
quantized (ndarray): (T,)
mel (ndarray): (N, D)
Returns:
tuple: Tuple of (T,) and (T, D)
"""
assert len(quantized.shape) == 1
assert len(mel.shape) == 2
upsample_factor = quantized.size // mel.shape[0]
mel = np.repeat(mel, upsample_factor, axis=0)
n_pad = quantized.size - mel.shape[0]
if n_pad != 0:
assert n_pad > 0
mel = np.pad(mel, [(0, n_pad), (0, 0)], mode="constant", constant_values=0)
# trim
start, end = start_and_end_indices(quantized, hparams.silence_threshold)
return quantized[start:end], mel[start:end, :]
adjast_time_resolution = adjust_time_resolution # 'adjust' is correct spelling, this is for compatibility
def start_and_end_indices(quantized, silence_threshold=2):
for start in range(quantized.size):
if abs(quantized[start] - 127) > silence_threshold:
break
for end in range(quantized.size - 1, 1, -1):
if abs(quantized[end] - 127) > silence_threshold:
break
assert abs(quantized[start] - 127) > silence_threshold
assert abs(quantized[end] - 127) > silence_threshold
return start, end
def _stft(y):
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=True)
def melspectrogram(y):
D = _stft(y)
S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db
if not hparams.allow_clipping_in_normalization:
assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
return _normalize(S)
def spectrogram(y):
D = _stft(y)
S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
return _normalize(S)
def get_hop_size():
hop_size = hparams.hop_size
if hop_size is None:
assert hparams.frame_shift_ms is not None
hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
return hop_size
# Conversions:
_mel_basis = None
def _linear_to_mel(spectrogram):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis()
return np.dot(_mel_basis, spectrogram)
def _build_mel_basis():
assert hparams.fmax <= hparams.sample_rate // 2
return librosa.filters.mel(hparams.sample_rate, n_fft,
fmin=hparams.fmin, fmax=hparams.fmax,
n_mels=hparams.num_mels)
def _amp_to_db(x):
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(x):
return np.power(10.0, x * 0.05)
def _normalize(S):
return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
def _denormalize(S):
return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db