Skip to content

Commit 07056db

Browse files
GaryGao99zh794390558
authored andcommitted
add base frontend (#84)
* add base frontend * fix doc * fix syntax error * rename frontend and hparam * add fbank * fix bug * add read wav * fix read_wav * specific interfaces of feature functions * fix fbank * Update read_wav.py add return sample_rate * Update read_wav_test.py fix test_wav * spectrum add spectrum and its test * Update speech_features.png fix fbank op * fbank fix fbank and it's test * change test_wav * fix rely * fix config setting * pitch * fix config setting * pitch * cepstrum * fix pitch * fix call * fix config and call * zcr * plp * framepow * analyfiltbank, synthfiltbank, delta_delta * fix spectrum * fix comment * fix fbank test * fix fbank * fix delta_delta * fix fbank and it's test * fix hparam * fix sample_rate setting * fbank_pitch features * add deepdiff package * fix fbank_pitch features * get fbank_pitch features using kaldi io * make fbank_pitch features using shell * fix sample_rate setting and add doc * fix dtype of params * fix sample_rate && add doc * fix feat_lib dependency and sample rate * remove contrib_audio dependency * Delete compute_pitch_feats.py * Merge branch 'master' into frontend * fix espnet path * delete make_fbank_pitch.sh * fix sample rate and doc * fix doc * Revert "remove contrib_audio dependency" This reverts commit a7735e8. * fix read_wav * fix import error * fix read && write wav * fix assert sample rate * make fbank features * make features * fix assert sample rate * fix write wav * fix write_wav return * fix write_wav test * Update write_wav_test.py
1 parent 04380e7 commit 07056db

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+3760
-6
lines changed

delta/data/frontend/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright (C) 2017 Beijing Didi Infinity Technology and Development Co.,Ltd.
2+
# All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# ==============================================================================
16+
''' init of frontend package'''

delta/data/frontend/analyfiltbank.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Copyright (C) 2017 Beijing Didi Infinity Technology and Development Co.,Ltd.
2+
# All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# ==============================================================================
16+
17+
import tensorflow as tf
18+
19+
from delta.layers.ops import py_x_ops
20+
from delta.utils.hparam import HParams
21+
from delta.data.frontend.base_frontend import BaseFrontend
22+
23+
class Analyfiltbank(BaseFrontend):
    """Frontend that computes power and phase spectra of an audio signal.

    Thin wrapper around the ``py_x_ops.analyfiltbank`` op; the framing
    parameters are taken from the hyperparameters built by :meth:`params`.
    """

    def __init__(self, config: dict):
        super().__init__(config)

    @classmethod
    def params(cls, config=None):
        """
        Set params.
        :param config: contains three optional parameters: window_length(float, default=0.030),
               frame_length(float, default=0.010), sample_rate(float, default=16000.0).
               NOTE(review): the two lengths are presumably in seconds -- confirm against the op.
        :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
        """

        window_length = 0.030
        frame_length = 0.010
        sample_rate = 16000.0

        hparams = HParams(cls=cls)
        hparams.add_hparam('window_length', window_length)
        hparams.add_hparam('frame_length', frame_length)
        hparams.add_hparam('sample_rate', sample_rate)

        # Caller-supplied values win over the defaults registered above.
        if config is not None:
            hparams.override_from_dict(config)

        return hparams

    def call(self, audio_data, sample_rate=None):
        """
        Calculate power spectrum and phase spectrum of audio data.
        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [optional] the sample rate of the signal we are working with.
               When omitted (None) the configured ``sample_rate`` hyperparameter is used.
               When given, it must equal the configured value, otherwise the
               assertion op attached to the computation fails.
        :return: Two returns:
                power spectrum -- A float tensor of size (num_frames, num_frequencies) containing
                    the power spectrum of every frame in speech.
                phase spectrum -- A float tensor of size (num_frames, num_frequencies) containing
                    the phase spectrum of every frame in speech.
        """

        p = self.config
        with tf.name_scope('analyfiltbank'):

            # Identity test: `== None` would invoke the operand's own __eq__,
            # which for array-like inputs may broadcast instead of returning a bool.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=float)

            # Guard against feeding audio whose rate differs from the configured one.
            assert_op = tf.compat.v1.assert_equal(
                tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
            with tf.control_dependencies([assert_op]):

                power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return power_spectrum, phase_spectrum
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# Copyright (C) 2017 Beijing Didi Infinity Technology and Development Co.,Ltd.
2+
# All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# ==============================================================================
16+
17+
import tensorflow as tf
18+
import os
19+
from pathlib import Path
20+
from delta.data.frontend.read_wav import ReadWav
21+
from delta.data.frontend.analyfiltbank import Analyfiltbank
22+
import numpy as np
23+
24+
25+
class Test(tf.test.TestCase):
    """Regression test for the Analyfiltbank frontend."""

    def test_analyfiltbank(self):
        """Check rank and the leading 10x10 corner of both transposed outputs.

        The reference tables below are hard-coded expectations for the sample
        wav file; their provenance is not visible here.
        """
        wav_file = str(
            Path(os.environ['MAIN_ROOT']) / 'delta/layers/ops/data/sm1_cln.wav')

        expected_power = np.array([
            [
                4.2182300e-04, 3.6964193e-04, 3.9906241e-05, 2.8196722e-05,
                3.3976138e-04, 3.7671626e-04, 2.2727624e-04, 7.2495081e-05,
                4.3451786e-05, 3.4654513e-06
            ],
            [
                1.4681223e-05, 2.8831255e-05, 3.5616580e-05, 3.9359711e-05,
                1.2714787e-04, 1.2794189e-04, 3.6509471e-05, 1.7578101e-05,
                5.9672035e-05, 2.9785692e-06
            ],
            [
                8.8715387e-05, 6.0998322e-05, 2.7695101e-05, 1.6866413e-04,
                4.6845453e-05, 3.3532990e-05, 5.7005627e-06, 5.1852752e-05,
                1.8390550e-05, 8.3459439e-05
            ],
            [
                1.1405386e-05, 1.8942148e-06, 1.6338145e-06, 1.8362705e-05,
                8.4106450e-06, 4.4174294e-06, 3.6533682e-05, 5.0541588e-05,
                1.6701326e-06, 1.8736981e-05
            ],
            [
                2.9108920e-05, 1.6862698e-05, 3.3437627e-05, 6.9332527e-05,
                5.0028186e-05, 5.9426224e-05, 2.1895030e-06, 2.3780794e-06,
                4.7786685e-05, 7.3811811e-05
            ],
            [
                1.6433882e-05, 9.5777386e-07, 2.0980822e-06, 4.8990279e-07,
                1.4232077e-05, 1.5986938e-05, 2.9042780e-05, 1.1719906e-05,
                2.4548817e-06, 5.3594176e-06
            ],
            [
                9.1289467e-06, 9.4249899e-06, 7.4781286e-07, 1.8923520e-05,
                6.5740237e-06, 4.3209452e-06, 3.9396346e-06, 1.2287317e-05,
                4.6807354e-06, 5.8512210e-06
            ],
            [
                1.6150383e-05, 2.6649790e-05, 1.8610657e-05, 2.2872716e-06,
                1.4209920e-05, 2.3279742e-06, 6.6038615e-06, 2.6169775e-05,
                2.8335158e-05, 1.7595910e-06
            ],
            [
                6.8095047e-05, 9.1859045e-05, 2.6713702e-05, 3.0580850e-05,
                1.4539381e-05, 4.2510033e-05, 2.2579852e-05, 1.4843822e-05,
                2.0883192e-05, 6.0624756e-05
            ],
            [
                1.6092306e-05, 1.4245335e-05, 2.4250150e-05, 6.0177539e-05,
                6.7926321e-06, 3.4922948e-07, 2.1843030e-06, 8.5554876e-07,
                2.6831965e-06, 2.0012436e-05
            ],
        ])

        expected_phase = np.array([
            [
                3.1415927, 3.1415927, 3.1415927, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                3.1415927
            ],
            [
                0.01752237, 1.6688037, 1.4971976, 1.4470094, 2.0516894,
                -2.3112175, -0.7115377, 2.9614341, -1.2494497, -0.7055688
            ],
            [
                2.614648, 0.63351387, -2.0660093, 1.7626916, -1.1257634,
                3.017448, -2.892095, -1.2209401, 1.7407895, -1.0281658
            ],
            [
                1.02424, -1.8967879, -0.6139833, 2.587602, 3.0070715, 1.5781559,
                -1.899145, -1.1459525, -0.24284656, -0.8106653
            ],
            [
                -0.08220324, 0.5497215, 1.7031444, -2.8960562, -1.3680246,
                0.4349923, 2.0676146, 1.2389332, 2.6312854, -1.7511902
            ],
            [
                0.17763095, 2.7475302, -0.20671827, 1.0719725, -2.388657,
                1.189566, -1.0643665, 2.5955305, -0.69036585, -0.5287417
            ],
            [
                -0.9477449, -2.7059674, 0.53469753, 1.9289348, 0.24833842,
                0.03517391, -1.4778724, -0.16577117, -1.7509687, -0.46875867
            ],
            [
                1.5570146, -2.9596932, -0.7975963, 3.0060582, -1.038453,
                0.14911443, -1.5873562, 0.7229206, 2.679422, -1.1890441
            ],
            [
                -2.2543156, 0.47845784, -2.8412538, -0.5494534, 1.6583048,
                -1.4567885, 1.0724461, -2.70243, -0.2690962, 1.8831034
            ],
            [
                -0.32710192, 0.01503609, 0.29720783, -0.7409194, -2.183623,
                2.3637679, 0.6405145, 1.4975713, 0.18241015, 2.2659144
            ],
        ])

        # Every .eval() below relies on the default session opened here.
        with self.session():
            wav_reader = ReadWav.params().instantiate()
            audio_data, sample_rate = wav_reader(wav_file)

            filterbank = Analyfiltbank.params().instantiate()
            power_spc, phase_spc = filterbank(audio_data.eval(),
                                              sample_rate.eval())

            self.assertEqual(tf.rank(power_spc).eval(), 2)
            self.assertEqual(tf.rank(phase_spc).eval(), 2)

            actual_power = power_spc.eval().transpose()[:10, :10]
            self.assertAllClose(actual_power, expected_power)

            actual_phase = phase_spc.eval().transpose()[:10, :10]
            self.assertAllClose(actual_phase, expected_phase)
138+
if __name__ == '__main__':
    # Script entry point: hand control to the TensorFlow test runner.
    tf.test.main()

delta/data/frontend/base_frontend.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright (C) 2017 Beijing Didi Infinity Technology and Development Co.,Ltd.
2+
# All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# ==============================================================================
16+
''' base interface of Frontend '''
17+
18+
import abc
19+
import tensorflow as tf
20+
21+
from delta.utils.hparam import HParams
22+
23+
class ABCFrontend(metaclass=abc.ABCMeta):
    ''' Abstract interface every frontend has to satisfy. '''

    def __init__(self, config):
        # Not usable as-is: a subclass must provide its own constructor.
        raise NotImplementedError

    @abc.abstractmethod
    def call(self, *args, **kwargs):
        ''' Run the frontend; concrete subclasses must override this. '''
        raise NotImplementedError
32+
33+
34+
class BaseFrontend(ABCFrontend):
    ''' Common base for frontends: stores the config and makes instances callable. '''

    def __init__(self, config: dict):
        # Kept private; read access goes through the `config` property.
        self._config = config

    @classmethod
    def params(cls, config=None):
        ''' Build the hyperparameters; must be provided by each subclass. '''
        raise NotImplementedError

    @property
    def config(self):
        ''' The config object handed to the constructor. '''
        return self._config

    def __call__(self, *args, **kwargs):
        ''' Forward all arguments to `call` and return its result. '''
        return self.call(*args, **kwargs)

delta/data/frontend/cepstrum.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Copyright (C) 2017 Beijing Didi Infinity Technology and Development Co.,Ltd.
2+
# All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# ==============================================================================
16+
17+
import tensorflow as tf
18+
19+
from delta.layers.ops import py_x_ops
20+
from delta.utils.hparam import HParams
21+
from delta.data.frontend.base_frontend import BaseFrontend
22+
23+
class Cepstrum(BaseFrontend):
    """Frontend that computes the cepstrum of an audio signal.

    Thin wrapper around the ``py_x_ops.cepstrum`` op; framing and
    normalization options come from the hyperparameters built by :meth:`params`.
    """

    def __init__(self, config: dict):
        super().__init__(config)

    @classmethod
    def params(cls, config=None):
        """
        Set params.
        :param config: contains five optional parameters: window_length(float, default=0.025),
               frame_length(float, default=0.010), sample_rate(float, default=16000.0),
               ceps_subband_num(int, default=13), tag_ceps_mean_norm(bool, default=True).
               NOTE(review): the two lengths are presumably in seconds -- confirm against the op.
        :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
        """

        window_length = 0.025
        frame_length = 0.010
        ceps_subband_num = 13
        tag_ceps_mean_norm = True
        sample_rate = 16000.0

        hparams = HParams(cls=cls)
        hparams.add_hparam('window_length', window_length)
        hparams.add_hparam('frame_length', frame_length)
        hparams.add_hparam('ceps_subband_num', ceps_subband_num)
        hparams.add_hparam('tag_ceps_mean_norm', tag_ceps_mean_norm)
        hparams.add_hparam('sample_rate', sample_rate)

        # Caller-supplied values win over the defaults registered above.
        if config is not None:
            hparams.override_from_dict(config)

        return hparams

    def call(self, audio_data, sample_rate=None):
        """
        Calculate cepstrum of audio data.
        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [optional] the sample rate of the signal we are working with.
               When omitted (None) the configured ``sample_rate`` hyperparameter is used.
               When given, it must equal the configured value, otherwise the
               assertion op attached to the computation fails.
        :return: A float tensor of size (num_frames, ceps_subband_num) containing normalized cepstrum
                (tag_ceps_mean_norm = True) or cepstrum (tag_ceps_mean_norm = False) of every frame in speech.
        """

        p = self.config

        with tf.name_scope('cepstrum'):

            # Identity test: `== None` would invoke the operand's own __eq__,
            # which for array-like inputs may broadcast instead of returning a bool.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=float)

            # Guard against feeding audio whose rate differs from the configured one.
            assert_op = tf.compat.v1.assert_equal(
                tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
            with tf.control_dependencies([assert_op]):

                cepstrum = py_x_ops.cepstrum(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length,
                    ceps_subband_num=p.ceps_subband_num,
                    tag_ceps_mean_norm=p.tag_ceps_mean_norm)

                return cepstrum

0 commit comments

Comments
 (0)