From 1ee62441e975cbb41c1488bcbdf5037e14c86b16 Mon Sep 17 00:00:00 2001 From: dengchengyun Date: Thu, 19 Dec 2019 21:41:12 +0800 Subject: [PATCH 1/6] fix bugs and docs of FE --- core/ops/.gitignore | 2 + core/ops/kernels/analyfiltbank.cc | 4 +- core/ops/kernels/fbank.cc | 2 + core/ops/kernels/fbank.h | 1 + core/ops/kernels/fbank_op.cc | 7 + core/ops/kernels/framepow.cc | 39 ++++- core/ops/kernels/framepow.h | 8 +- core/ops/kernels/framepow_op.cc | 11 +- core/ops/kernels/mfcc_dct_op.cc | 23 ++- core/ops/kernels/mfcc_mel_filterbank.cc | 6 + core/ops/kernels/mfcc_mel_filterbank.h | 1 + core/ops/kernels/resample.cc | 3 +- core/ops/kernels/resample.h | 4 + core/ops/kernels/spectrum.cc | 133 ++++++++++-------- core/ops/kernels/spectrum.h | 17 ++- core/ops/kernels/spectrum_op.cc | 26 ++-- core/ops/kernels/speed_op.cc | 84 +++++++++++ core/ops/kernels/support_functions.cc | 35 +++-- core/ops/kernels/support_functions.h | 7 +- core/ops/kernels/synthfiltbank.cc | 6 +- core/ops/kernels/x_ops.cc | 22 ++- core/ops/py_x_ops.py | 1 + delta/data/frontend/add_noise_end_to_end.py | 6 +- .../frontend/add_noise_end_to_end_test.py | 5 +- delta/data/frontend/add_rir_noise_aecres.py | 11 +- .../frontend/add_rir_noise_aecres_test.py | 5 +- delta/data/frontend/analyfiltbank.py | 30 ++-- delta/data/frontend/analyfiltbank_test.py | 5 +- delta/data/frontend/cepstrum.py | 30 ++-- delta/data/frontend/cepstrum_test.py | 5 +- delta/data/frontend/cmvn.py | 25 +++- delta/data/frontend/delta_delta.py | 5 +- delta/data/frontend/delta_delta_test.py | 5 +- delta/data/frontend/fbank.py | 71 +++++++--- delta/data/frontend/fbank_pitch.py | 95 +++++++++++-- delta/data/frontend/fbank_pitch_test.py | 5 +- delta/data/frontend/fbank_test.py | 9 +- delta/data/frontend/framepow.py | 49 +++++-- delta/data/frontend/framepow_test.py | 20 +-- delta/data/frontend/mfcc.py | 90 +++++++----- delta/data/frontend/mfcc_test.py | 5 +- delta/data/frontend/pitch.py | 91 ++++++++---- delta/data/frontend/pitch_test.py | 5 +- 
delta/data/frontend/plp.py | 24 +++- delta/data/frontend/plp_test.py | 5 +- delta/data/frontend/read_wav.py | 44 ++++-- delta/data/frontend/read_wav_test.py | 14 +- delta/data/frontend/spectrum.py | 54 ++++--- delta/data/frontend/spectrum_test.py | 8 +- delta/data/frontend/synthfiltbank.py | 2 +- delta/data/frontend/synthfiltbank_test.py | 5 +- delta/data/frontend/write_wav.py | 2 +- delta/data/frontend/write_wav_test.py | 5 +- delta/data/frontend/zcr.py | 24 +++- delta/data/frontend/zcr_test.py | 5 +- utils/speech/compute_fbank_feats.py | 10 +- utils/speech/compute_fbank_pitch.py | 6 + utils/speech/compute_mfcc_feats.py | 6 + utils/speech/compute_spectrum_feats.py | 6 + utils/speech/make_fbank.sh | 5 +- utils/speech/make_fbank_pitch.sh | 3 + utils/speech/make_mfcc.sh | 3 + utils/speech/make_spectrum.sh | 3 + 63 files changed, 906 insertions(+), 342 deletions(-) create mode 100644 core/ops/kernels/speed_op.cc diff --git a/core/ops/.gitignore b/core/ops/.gitignore index e63aa584..3f028b5b 100644 --- a/core/ops/.gitignore +++ b/core/ops/.gitignore @@ -1,4 +1,6 @@ gen/ +cppjieba +*.so !data/sm1_cln.wav *.scp !noiselist.scp diff --git a/core/ops/kernels/analyfiltbank.cc b/core/ops/kernels/analyfiltbank.cc index 9e63e81b..6c57b47b 100644 --- a/core/ops/kernels/analyfiltbank.cc +++ b/core/ops/kernels/analyfiltbank.cc @@ -79,6 +79,7 @@ int Analyfiltbank::proc_afb(const float* mic_buf) { xcomplex* win = static_cast(malloc(sizeof(xcomplex) * i_FFTSiz)); xcomplex* fftwin = static_cast(malloc(sizeof(xcomplex) * i_FFTSiz)); + float* fft_buf = static_cast(malloc(sizeof(float) * 2 * i_FFTSiz)); /* generate window */ gen_window(pf_WINDOW, i_WinLen, s_WinTyp); @@ -96,7 +97,7 @@ int Analyfiltbank::proc_afb(const float* mic_buf) { } /* fft */ - dit_r2_fft(win, fftwin, i_FFTSiz, -1); + dit_r2_fft(win, fftwin, fft_buf, i_FFTSiz, -1); for (k = 0; k < i_NumFrq; k++) { pf_PowSpc[n * i_NumFrq + k] = complex_abs2(fftwin[k]); @@ -106,6 +107,7 @@ int Analyfiltbank::proc_afb(const float* 
mic_buf) { free(win); free(fftwin); + free(fft_buf); return 1; } diff --git a/core/ops/kernels/fbank.cc b/core/ops/kernels/fbank.cc index 4a8a7702..0427443f 100644 --- a/core/ops/kernels/fbank.cc +++ b/core/ops/kernels/fbank.cc @@ -33,6 +33,8 @@ Fbank::Fbank() upper_frequency_limit_(kDefaultUpperFrequencyLimit), filterbank_channel_count_(kDefaultFilterbankChannelCount) {} +Fbank::~Fbank() {} + bool Fbank::Initialize(int input_length, double input_sample_rate) { if (input_length < 1) { LOG(ERROR) << "Input length must be positive."; diff --git a/core/ops/kernels/fbank.h b/core/ops/kernels/fbank.h index 7bc4356d..d286c1af 100644 --- a/core/ops/kernels/fbank.h +++ b/core/ops/kernels/fbank.h @@ -32,6 +32,7 @@ namespace delta { class Fbank { public: Fbank(); + ~Fbank(); bool Initialize(int input_length, double input_sample_rate); // Input is a single squared-magnitude spectrogram frame. The input spectrum // is converted to linear magnitude and weighted into bands using a diff --git a/core/ops/kernels/fbank_op.cc b/core/ops/kernels/fbank_op.cc index 697dfa7e..a766e22b 100644 --- a/core/ops/kernels/fbank_op.cc +++ b/core/ops/kernels/fbank_op.cc @@ -49,6 +49,11 @@ class FbankOp : public OpKernel { sample_rate_tensor.shape().DebugString(), " instead.")); const int32 sample_rate = sample_rate_tensor.scalar()(); + if (upper_frequency_limit_ <= 0) + upper_frequency_limit_ = sample_rate / 2.0 + upper_frequency_limit_; + else if (upper_frequency_limit_ > sample_rate / 2.0 || upper_frequency_limit_ <= lower_frequency_limit_) + upper_frequency_limit_ = sample_rate / 2.0; + // shape [channels, time, bins] const int spectrogram_channels = spectrogram.dim_size(2); const int spectrogram_samples = spectrogram.dim_size(1); @@ -94,6 +99,8 @@ class FbankOp : public OpKernel { for (int i = 0; i < filterbank_channel_count_; ++i) { output_data[i] = fbank_output[i]; } + std::vector().swap(fbank_input); + std::vector().swap(fbank_output); } } } diff --git a/core/ops/kernels/framepow.cc 
b/core/ops/kernels/framepow.cc index 21b66240..4fc13f2f 100644 --- a/core/ops/kernels/framepow.cc +++ b/core/ops/kernels/framepow.cc @@ -27,6 +27,8 @@ const float frame_length_sec = 0.010; FramePow::FramePow() { window_length_sec_ = window_length_sec; frame_length_sec_ = frame_length_sec; + i_snip_edges = 1; + i_remove_dc_offset = true; pf_FrmEng = NULL; } @@ -40,27 +42,54 @@ void FramePow::set_frame_length_sec(float frame_length_sec) { frame_length_sec_ = frame_length_sec; } +void FramePow::set_snip_edges(int snip_edges) { i_snip_edges = snip_edges; } + +void FramePow::set_remove_dc_offset(bool remove_dc_offset) { + i_remove_dc_offset = remove_dc_offset; + } + int FramePow::init_eng(int input_size, float sample_rate) { f_SamRat = sample_rate; i_WinLen = static_cast(window_length_sec_ * f_SamRat); i_FrmLen = static_cast(frame_length_sec_ * f_SamRat); - i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1; + if (i_snip_edges == 1) + i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1; + else + i_NumFrm = (input_size + i_FrmLen / 2) / i_FrmLen; pf_FrmEng = static_cast(malloc(sizeof(float) * i_NumFrm)); return 1; } -int FramePow::proc_eng(const float* mic_buf) { - int n, k; +int FramePow::proc_eng(const float* mic_buf, int input_size) { + int i, n, k; float* win = static_cast(malloc(sizeof(float) * i_WinLen)); for (n = 0; n < i_NumFrm; n++) { pf_FrmEng[n] = 0.0; + float sum = 0.0; + float energy = 0.0; for (k = 0; k < i_WinLen; k++) { - win[k] = mic_buf[n * i_FrmLen + k]; - pf_FrmEng[n] = pf_FrmEng[n] + win[k] * win[k]; + int index = n * i_FrmLen + k; + if (index < input_size) + win[k] = mic_buf[index]; + else + win[k] = 0.0f; + sum += win[k]; + } + + if (i_remove_dc_offset == true) { + float mean = sum / i_WinLen; + for (int l = 0; l < i_WinLen; l++) win[l] -= mean; } + + for (i = 0; i < i_WinLen; i++) { + energy += win[i] * win[i]; + } + + pf_FrmEng[n] = log(energy); + } free(win); diff --git a/core/ops/kernels/framepow.h b/core/ops/kernels/framepow.h index 
3019deda..c756da78 100644 --- a/core/ops/kernels/framepow.h +++ b/core/ops/kernels/framepow.h @@ -27,6 +27,8 @@ class FramePow { private: float window_length_sec_; float frame_length_sec_; + int i_snip_edges; + bool i_remove_dc_offset; float f_SamRat; int i_WinLen; @@ -44,9 +46,13 @@ class FramePow { void set_frame_length_sec(float frame_length_sec); + void set_snip_edges(int snip_edges); + + void set_remove_dc_offset(bool remove_dc_offset); + int init_eng(int input_size, float sample_rate); - int proc_eng(const float* mic_buf); + int proc_eng(const float* mic_buf, int input_size); int get_eng(float* output); diff --git a/core/ops/kernels/framepow_op.cc b/core/ops/kernels/framepow_op.cc index 6707b6bc..55897d5a 100644 --- a/core/ops/kernels/framepow_op.cc +++ b/core/ops/kernels/framepow_op.cc @@ -29,6 +29,9 @@ class FramePowOp : public OpKernel { explicit FramePowOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("window_length", &window_length_)); OP_REQUIRES_OK(context, context->GetAttr("frame_length", &frame_length_)); + OP_REQUIRES_OK(context, context->GetAttr("snip_edges", &snip_edges_)); + OP_REQUIRES_OK(context, + context->GetAttr("remove_dc_offset", &remove_dc_offset_)); } void Compute(OpKernelContext* context) override { @@ -49,6 +52,8 @@ class FramePowOp : public OpKernel { FramePow cls_eng; cls_eng.set_window_length_sec(window_length_); cls_eng.set_frame_length_sec(frame_length_); + cls_eng.set_snip_edges(snip_edges_); + cls_eng.set_remove_dc_offset(remove_dc_offset_); OP_REQUIRES(context, cls_eng.init_eng(L, sample_rate), errors::InvalidArgument( "framepow_class initialization failed for length ", L, @@ -58,6 +63,8 @@ class FramePowOp : public OpKernel { int i_WinLen = static_cast(window_length_ * sample_rate); int i_FrmLen = static_cast(frame_length_ * sample_rate); int i_NumFrm = (L - i_WinLen) / i_FrmLen + 1; + if (snip_edges_ == 2) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen; + if (i_NumFrm < 1) i_NumFrm = 
1; OP_REQUIRES_OK(context, context->allocate_output( 0, TensorShape({1, i_NumFrm}), &output_tensor)); @@ -65,13 +72,15 @@ class FramePowOp : public OpKernel { float* output_flat = output_tensor->flat().data(); int ret; - ret = cls_eng.proc_eng(input_flat); + ret = cls_eng.proc_eng(input_flat, L); ret = cls_eng.get_eng(output_flat); } private: float window_length_; float frame_length_; + int snip_edges_; + bool remove_dc_offset_; }; REGISTER_KERNEL_BUILDER(Name("FramePow").Device(DEVICE_CPU), FramePowOp); diff --git a/core/ops/kernels/mfcc_dct_op.cc b/core/ops/kernels/mfcc_dct_op.cc index c9da6b79..8d94b0db 100644 --- a/core/ops/kernels/mfcc_dct_op.cc +++ b/core/ops/kernels/mfcc_dct_op.cc @@ -41,10 +41,10 @@ class MfccDctOp : public OpKernel { OP_REQUIRES(context, fbank.dims() == 3, errors::InvalidArgument("Fbank must be 3-dimensional", fbank.shape().DebugString())); - const Tensor& spectrum = context->input(1); - OP_REQUIRES(context, spectrum.dims() == 3, - errors::InvalidArgument("Spectrum must be 3-dimensional", - spectrum.shape().DebugString())); + const Tensor& framepow = context->input(1); + OP_REQUIRES(context, framepow.dims() == 1, + errors::InvalidArgument("Framepow must be 1-dimensional", + framepow.shape().DebugString())); const Tensor& sample_rate_tensor = context->input(2); OP_REQUIRES(context, TensorShapeUtils::IsScalar(sample_rate_tensor.shape()), errors::InvalidArgument( @@ -56,8 +56,6 @@ class MfccDctOp : public OpKernel { const int fbank_channels = fbank.dim_size(2); const int fbank_samples = fbank.dim_size(1); const int audio_channels = fbank.dim_size(0); - const int spectrum_samples = spectrum.dim_size(1); - const int spectrum_channels = spectrum.dim_size(2); MfccDct mfcc; mfcc.set_coefficient_count(coefficient_count_); @@ -77,7 +75,7 @@ class MfccDctOp : public OpKernel { &output_tensor)); const float* fbank_flat = fbank.flat().data(); - const float* spectrum_flat = spectrum.flat().data(); + const float* framepow_flat = framepow.flat().data(); 
float* output_flat = output_tensor->flat().data(); for (int audio_channel = 0; audio_channel < audio_channels; @@ -86,13 +84,10 @@ class MfccDctOp : public OpKernel { const float* sample_data = fbank_flat + (audio_channel * fbank_samples * fbank_channels) + (fbank_sample * fbank_channels); - const float* spectrum_data = - spectrum_flat + (audio_channel * fbank_samples * spectrum_channels) + - (fbank_sample * spectrum_channels); + const float* framepow_data = framepow_flat + fbank_sample; std::vector mfcc_input(sample_data, sample_data + fbank_channels); - std::vector spectrum_input(spectrum_data, - spectrum_data + spectrum_channels); + std::vector framepow_input(framepow_data, framepow_data + 1); std::vector mfcc_output; mfcc.Compute(mfcc_input, &mfcc_output); DCHECK_EQ(coefficient_count_, mfcc_output.size()); @@ -103,10 +98,10 @@ class MfccDctOp : public OpKernel { output_data[i] = mfcc_output[i]; } if (use_energy_) - output_data[0] = spectrum_input[0]; + output_data[0] = framepow_input[0]; std::vector().swap(mfcc_input); - std::vector().swap(spectrum_input); + std::vector().swap(framepow_input); std::vector().swap(mfcc_output); } } diff --git a/core/ops/kernels/mfcc_mel_filterbank.cc b/core/ops/kernels/mfcc_mel_filterbank.cc index 76f848e9..097faa6c 100644 --- a/core/ops/kernels/mfcc_mel_filterbank.cc +++ b/core/ops/kernels/mfcc_mel_filterbank.cc @@ -38,6 +38,12 @@ namespace tensorflow { MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {} +MfccMelFilterbank::~MfccMelFilterbank() { + std::vector().swap(center_frequencies_); + std::vector().swap(weights_); + std::vector().swap(band_mapper_); +} + bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate, int output_channel_count, double lower_frequency_limit, diff --git a/core/ops/kernels/mfcc_mel_filterbank.h b/core/ops/kernels/mfcc_mel_filterbank.h index e9bcc6a1..2a745b2d 100644 --- a/core/ops/kernels/mfcc_mel_filterbank.h +++ b/core/ops/kernels/mfcc_mel_filterbank.h @@ -27,6 
+27,7 @@ namespace tensorflow { class MfccMelFilterbank { public: MfccMelFilterbank(); + ~MfccMelFilterbank(); bool Initialize(int input_length, // Number of unique FFT bins fftsize/2+1. double input_sample_rate, int output_channel_count, double lower_frequency_limit, double upper_frequency_limit); diff --git a/core/ops/kernels/resample.cc b/core/ops/kernels/resample.cc index 8b7f7327..ed6d1be5 100644 --- a/core/ops/kernels/resample.cc +++ b/core/ops/kernels/resample.cc @@ -35,7 +35,6 @@ LinearResample::LinearResample(int samp_rate_in_hz, assert(samp_rate_in_hz > 0.0 && samp_rate_out_hz > 0.0 && filter_cutoff_hz > 0.0 && - filter_cutoff_hz*2 <= samp_rate_in_hz && filter_cutoff_hz*2 <= samp_rate_out_hz && num_zeros > 0); @@ -56,7 +55,7 @@ int LinearResample::GetNumOutputSamples(int input_num_samp, // work out the number of ticks in the time interval // [ 0, input_num_samp/samp_rate_in_ ). - int interval_length_in_ticks = input_num_samp * ticks_per_input_period; + long long interval_length_in_ticks = (long long)input_num_samp * (long long)ticks_per_input_period; if (!flush) { BaseFloat window_width = num_zeros_ / (2.0 * filter_cutoff_); int window_width_ticks = floor(window_width * tick_freq); diff --git a/core/ops/kernels/resample.h b/core/ops/kernels/resample.h index 06cef89c..19c07014 100644 --- a/core/ops/kernels/resample.h +++ b/core/ops/kernels/resample.h @@ -25,6 +25,10 @@ limitations under the License. 
#include #include +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/logging.h" +using namespace tensorflow; // NOLINT + using namespace std; #include "kernels/support_functions.h" diff --git a/core/ops/kernels/spectrum.cc b/core/ops/kernels/spectrum.cc index d284437d..eec21b11 100644 --- a/core/ops/kernels/spectrum.cc +++ b/core/ops/kernels/spectrum.cc @@ -30,19 +30,21 @@ Spectrum::Spectrum() { window_length_sec_ = window_length_sec; frame_length_sec_ = frame_length_sec; i_OutTyp = 1; - i_snip_edges = true; + i_snip_edges = 1; i_raw_energy = 1; f_PreEph = 0.97; i_is_fbank = true; i_remove_dc_offset = true; + i_dither = 0.0; snprintf(s_WinTyp, sizeof(s_WinTyp), "povey"); pf_WINDOW = NULL; pf_SPC = NULL; -} - -Spectrum::~Spectrum() { - free(pf_WINDOW); - free(pf_SPC); + win_temp = NULL; + win_buf = NULL; + eph_buf = NULL; + win = NULL; + fftwin = NULL; + fft_buf = NULL; } void Spectrum::set_window_length_sec(float window_length_sec) { @@ -55,35 +57,34 @@ void Spectrum::set_frame_length_sec(float frame_length_sec) { void Spectrum::set_output_type(int output_type) { i_OutTyp = output_type; } -void Spectrum::set_snip_edges(bool snip_edges) { i_snip_edges = snip_edges; } +void Spectrum::set_snip_edges(int snip_edges) { i_snip_edges = snip_edges; } -void Spectrum::set_raw_energy(int raw_energy) { i_raw_energy = raw_energy; } +void Spectrum::set_raw_energy(int raw_energy) {i_raw_energy = raw_energy;} -void Spectrum::set_is_fbank(bool is_fbank) { i_is_fbank = is_fbank; } +void Spectrum::set_is_fbank(bool is_fbank) {i_is_fbank = is_fbank;} -void Spectrum::set_remove_dc_offset(bool remove_dc_offset) { - i_remove_dc_offset = remove_dc_offset; -} +void Spectrum::set_remove_dc_offset(bool remove_dc_offset) {i_remove_dc_offset = remove_dc_offset;} -void Spectrum::set_preEph(float preEph) { f_PreEph = preEph; } +void Spectrum::set_preEph(float preEph) {f_PreEph = preEph;} -void Spectrum::set_window_type(char* window_type) { - snprintf(s_WinTyp, 
sizeof(s_WinTyp), "%s", window_type); +void Spectrum::set_dither(float dither) {i_dither = dither;} + +void Spectrum::set_window_type(char* window_type){ + snprintf(s_WinTyp, sizeof(s_WinTyp), "%s", window_type); } int Spectrum::init_spc(int input_size, float sample_rate) { f_SamRat = sample_rate; i_WinLen = static_cast(window_length_sec_ * f_SamRat); i_FrmLen = static_cast(frame_length_sec_ * f_SamRat); - if (i_snip_edges == true) + if (i_snip_edges == 1) i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1; else i_NumFrm = (input_size + i_FrmLen / 2) / i_FrmLen; + if (i_NumFrm < 1) + i_NumFrm = 1; i_FFTSiz = static_cast(pow(2.0f, ceil(log2(i_WinLen)))); i_NumFrq = i_FFTSiz / 2 + 1; - if (i_NumFrm < 1) i_NumFrm = 1; - pf_WINDOW = static_cast(malloc(sizeof(float) * i_WinLen)); - pf_SPC = static_cast(malloc(sizeof(float) * i_NumFrq * i_NumFrm)); return 1; } @@ -91,36 +92,44 @@ int Spectrum::init_spc(int input_size, float sample_rate) { int Spectrum::proc_spc(const float* mic_buf, int input_size) { int n, k; - /* generate window */ - gen_window(pf_WINDOW, i_WinLen, s_WinTyp); - if (input_size < i_WinLen) - std::cerr << "Wraning: The length of input data is shorter than " - << window_length_sec_ << " s." << std::endl; + std::cerr<<"Wraning: The length of input data is shorter than "<< window_length_sec_ << " s." 
<(malloc(sizeof(xcomplex) * i_FFTSiz)); - float* win_buf = static_cast(malloc(sizeof(float) * i_WinLen)); - float* eph_buf = static_cast(malloc(sizeof(float) * i_WinLen)); - float* win_temp = static_cast(malloc(sizeof(float) * i_WinLen)); - xcomplex* fftwin = - static_cast(malloc(sizeof(xcomplex) * i_FFTSiz)); + //malloc + pf_WINDOW = static_cast(malloc(sizeof(float) * i_WinLen)); + pf_SPC = static_cast(malloc(sizeof(float) * i_NumFrq * i_NumFrm)); + win = static_cast(malloc(sizeof(xcomplex) * i_FFTSiz)); + win_buf = static_cast(malloc(sizeof(float) * i_WinLen)); + eph_buf = static_cast(malloc(sizeof(float) * i_WinLen)); + win_temp = static_cast(malloc(sizeof(float) * i_WinLen)); + fftwin = static_cast(malloc(sizeof(xcomplex) * i_FFTSiz)); + fft_buf = static_cast(malloc(sizeof(float) * 2 * i_FFTSiz)); // c.r&c.i + + /* generate window */ + gen_window(pf_WINDOW, i_WinLen, s_WinTyp); for (n = 0; n < i_NumFrm; n++) { float signal_raw_log_energy = 0.0; float sum = 0.0; - for (int l = 0; l < i_WinLen; l++) { + for (int l = 0; l < i_WinLen; l++){ int index = n * i_FrmLen + l; - if (index < input_size) + if (index < input_size) { win_buf[l] = mic_buf[index]; - else + } else { win_buf[l] = 0.0f; + } sum += win_buf[l]; } - if (i_remove_dc_offset == true) { + if(i_dither != 0.0) { + do_dither(win_buf, i_WinLen, i_dither); + } + + if (i_remove_dc_offset == true){ float mean = sum / i_WinLen; - for (int l = 0; l < i_WinLen; l++) win_buf[l] -= mean; + for (int l = 0; l < i_WinLen; l++) { + win_buf[l] -= mean; + } } /* do pre-emphais */ @@ -129,53 +138,58 @@ int Spectrum::proc_spc(const float* mic_buf, int input_size) { for (k = 0; k < i_WinLen; k++) { win[k].r = eph_buf[k] * pf_WINDOW[k]; win[k].i = 0.0f; - if (i_raw_energy == 1) - win_temp[k] = win_buf[k]; - else - win_temp[k] = win[k].r; } - for (k = i_WinLen; k < i_FFTSiz; k++) { - win[k].r = 0.0f; - win[k].i = 0.0f; + if (i_raw_energy == 1) { + std::memcpy(win_temp, win_buf, i_WinLen * sizeof(float)); } + else { + for (k = 
0; k < i_WinLen; k++) { + win_temp[k] = win[k].r; + } + } + + std::memset((void*)&(win[i_WinLen]), 0, sizeof(float) * 2 * (i_FFTSiz - i_WinLen));; /* raw energy */ signal_raw_log_energy = compute_energy(win_temp, i_WinLen); /* fft */ - dit_r2_fft(win, fftwin, i_FFTSiz, -1); + dit_r2_fft(win, fftwin, fft_buf, i_FFTSiz, -1); - for (k = 0; k < i_NumFrq; k++) { - if (k == 0 && i_is_fbank == false) { - fftwin[k].r = sqrt(signal_raw_log_energy); - fftwin[k].i = 0.0f; - } - if (i_OutTyp == 1) + if (!i_is_fbank) { + fftwin[0].r = sqrt(signal_raw_log_energy); + fftwin[0].i = 0.0f; + } + + if (i_OutTyp == 1) { + for (k = 0; k < i_NumFrq; k++) { pf_SPC[n * i_NumFrq + k] = complex_abs2(fftwin[k]); - else if (i_OutTyp == 2) + } + } else if (i_OutTyp == 2) { + for (k = 0; k < i_NumFrq; k++) { pf_SPC[n * i_NumFrq + k] = log(complex_abs2(fftwin[k])); - else - return -1; + } + } else { + return -1; } } + free(pf_WINDOW); free(win_temp); free(win_buf); free(eph_buf); free(win); free(fftwin); + free(fft_buf); return 1; } int Spectrum::get_spc(float* output) { - int n, m; - for (m = 0; m < i_NumFrq; m++) { - for (n = 0; n < i_NumFrm; n++) { - output[n * i_NumFrq + m] = pf_SPC[n * i_NumFrq + m]; - } - } + std::memcpy((void*)output, (void*)pf_SPC, \ + i_NumFrq * i_NumFrm * sizeof(float)); + free(pf_SPC); return 1; } @@ -192,4 +206,5 @@ int Spectrum::write_spc() { fclose(fp); return 1; } + } // namespace delta diff --git a/core/ops/kernels/spectrum.h b/core/ops/kernels/spectrum.h index 29c152d8..517890ce 100644 --- a/core/ops/kernels/spectrum.h +++ b/core/ops/kernels/spectrum.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "kernels/complex_defines.h" +#include "kernels/support_functions.h" using namespace tensorflow; // NOLINT @@ -40,26 +41,32 @@ class Spectrum { float f_PreEph; char s_WinTyp[40]; int i_OutTyp; // 1: PSD, 2:log(PSD) - bool i_snip_edges; + int i_snip_edges; int i_raw_energy; bool i_remove_dc_offset; bool i_is_fbank; + float i_dither; float* pf_WINDOW; float* pf_SPC; + xcomplex* win; + float* win_buf; + float* eph_buf; + float* win_temp; + xcomplex* fftwin; + float* fft_buf; + public: Spectrum(); - ~Spectrum(); - void set_window_length_sec(float window_length_sec); void set_frame_length_sec(float frame_length_sec); void set_output_type(int output_type); - void set_snip_edges(bool snip_edges); + void set_snip_edges(int snip_edges); void set_raw_energy(int raw_energy); @@ -71,6 +78,8 @@ class Spectrum { void set_remove_dc_offset(bool remove_dc_offset); + void set_dither(float dither); + int init_spc(int input_size, float sample_rate); int proc_spc(const float* mic_buf, int input_size); diff --git a/core/ops/kernels/spectrum_op.cc b/core/ops/kernels/spectrum_op.cc index f36a7593..d6afecea 100644 --- a/core/ops/kernels/spectrum_op.cc +++ b/core/ops/kernels/spectrum_op.cc @@ -14,7 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include "kernels/spectrum.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -22,8 +21,10 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" +#include namespace delta { + class SpecOp : public OpKernel { public: explicit SpecOp(OpKernelConstruction* context) : OpKernel(context) { @@ -34,9 +35,9 @@ class SpecOp : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("raw_energy", &raw_energy_)); OP_REQUIRES_OK(context, context->GetAttr("preEph_coeff", &preEph_coeff_)); OP_REQUIRES_OK(context, context->GetAttr("window_type", &window_type_)); - OP_REQUIRES_OK(context, - context->GetAttr("remove_dc_offset", &remove_dc_offset_)); + OP_REQUIRES_OK(context, context->GetAttr("remove_dc_offset", &remove_dc_offset_)); OP_REQUIRES_OK(context, context->GetAttr("is_fbank", &is_fbank_)); + OP_REQUIRES_OK(context, context->GetAttr("dither", &dither_)); } void Compute(OpKernelContext* context) override { @@ -52,10 +53,8 @@ class SpecOp : public OpKernel { sample_rate_tensor.shape().DebugString(), " instead.")); const float sample_rate = sample_rate_tensor.scalar()(); - // shape - const int L = input_tensor.dim_size(0); - char* window_type = const_cast(window_type_.c_str()); Spectrum cls_spc; + char* window_type = const_cast(window_type_.c_str()); cls_spc.set_window_length_sec(window_length_); cls_spc.set_frame_length_sec(frame_length_); cls_spc.set_output_type(output_type_); @@ -65,6 +64,10 @@ class SpecOp : public OpKernel { cls_spc.set_window_type(window_type); cls_spc.set_remove_dc_offset(remove_dc_offset_); cls_spc.set_is_fbank(is_fbank_); + cls_spc.set_dither(dither_); + + // shape + const int L = input_tensor.dim_size(0); OP_REQUIRES(context, cls_spc.init_spc(L, sample_rate), errors::InvalidArgument( "spectrum_class initialization failed for length ", L, @@ -74,9 +77,11 @@ class SpecOp : public OpKernel { int i_WinLen = static_cast(window_length_ * sample_rate); int i_FrmLen = static_cast(frame_length_ * sample_rate); int i_NumFrm = (L - i_WinLen) / i_FrmLen + 1; - 
bool i_snip_edges = snip_edges_; - if (i_snip_edges == false) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen; - if (i_NumFrm < 1) i_NumFrm = 1; + int i_snip_edges = snip_edges_; + if (i_snip_edges == 2) + i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen; + if (i_NumFrm < 1) + i_NumFrm = 1; int i_FrqNum = static_cast(pow(2.0f, ceil(log2(i_WinLen))) / 2 + 1); OP_REQUIRES_OK( context, context->allocate_output(0, TensorShape({i_NumFrm, i_FrqNum}), @@ -94,12 +99,13 @@ class SpecOp : public OpKernel { float window_length_; float frame_length_; int output_type_; - bool snip_edges_; + int snip_edges_; int raw_energy_; float preEph_coeff_; string window_type_; bool remove_dc_offset_; bool is_fbank_; + float dither_; }; REGISTER_KERNEL_BUILDER(Name("Spectrum").Device(DEVICE_CPU), SpecOp); diff --git a/core/ops/kernels/speed_op.cc b/core/ops/kernels/speed_op.cc new file mode 100644 index 00000000..43452ecd --- /dev/null +++ b/core/ops/kernels/speed_op.cc @@ -0,0 +1,84 @@ +/* Copyright (C) 2017 Beijing Didi Infinity Technology and Development Co.,Ltd. +All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "kernels/resample.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace delta { + +class SpeedOp : public OpKernel { + public: + explicit SpeedOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("lowpass_filter_width", &lowpass_filter_width_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_tensor = context->input(0); + OP_REQUIRES(context, input_tensor.dims() == 1, + errors::InvalidArgument("input signal must be 1-dimensional", + input_tensor.shape().DebugString())); + const Tensor& sample_rate_tensor = context->input(1); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(sample_rate_tensor.shape()), + errors::InvalidArgument( + "Input sample_rate should be a scalar tensor, got ", + sample_rate_tensor.shape().DebugString(), " instead.")); + const Tensor& resample_rate_tensor = context->input(2); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(resample_rate_tensor.shape()), + errors::InvalidArgument( + "Resample sample_rate should be a scalar tensor, got ", + resample_rate_tensor.shape().DebugString(), " instead.")); + const int sample_rate = static_cast(sample_rate_tensor.scalar()()); + const int resample_freq = static_cast(resample_rate_tensor.scalar()()); + const float* input_flat = input_tensor.flat().data(); + const int L = input_tensor.dim_size(0); + + lowpass_cutoff_ = min(resample_freq / 2, sample_rate / 2); + LinearResample cls_resample_(sample_rate, resample_freq, + lowpass_cutoff_, + lowpass_filter_width_); + vector waveform(L); + for (int i = 0; i < L; i++){ + waveform[i] = static_cast(input_flat[i]); + } + vector 
downsampled_wave; + cls_resample_.Resample(waveform, false, &downsampled_wave); + int output_length = downsampled_wave.size(); + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({1, output_length}), + &output_tensor)); + float* output_flat = output_tensor->flat().data(); + for (int j = 0; j < output_length; j++) + output_flat[j] = downsampled_wave[j]; + + std::vector().swap(downsampled_wave); + std::vector().swap(waveform); + cls_resample_.Reset(); + } + + private: + float lowpass_cutoff_; + int lowpass_filter_width_; +}; + +REGISTER_KERNEL_BUILDER(Name("Speed").Device(DEVICE_CPU), SpeedOp); + +} // namespace delta diff --git a/core/ops/kernels/support_functions.cc b/core/ops/kernels/support_functions.cc index 02d92ddc..b0631486 100644 --- a/core/ops/kernels/support_functions.cc +++ b/core/ops/kernels/support_functions.cc @@ -110,7 +110,7 @@ int gen_window(float* w, int L, char* typ) { w[n] = 0.54 - 0.46 * cos(pn[n]); } } else if (strcmp(typ, "povey") == 0) { - for (n = 0; n < L; n++) { + for (n = 0; n < L; n++){ w[n] = pow(0.5 - 0.5 * cos(pn[n]), 0.85); } } else if (strcmp(typ, "blac") == 0) { @@ -121,6 +121,7 @@ int gen_window(float* w, int L, char* typ) { printf("Window type not support!\n"); return -1; } + free(pn); return 0; } @@ -525,12 +526,10 @@ int compute_lpc(int ncep, int nfrm, int pord, float* x, float* y) { /* Radix-2 DIT FFT */ /* isign=-1 ==> FFT, isign=1 ==> IFFT */ -int dit_r2_fft(xcomplex* input, xcomplex* output, int N, int isign) { +int dit_r2_fft(xcomplex* input, xcomplex* output, float* in_buf, int N, int isign) { float wtemp, wr, wpr, wpi, wi, theta; float tempr, tempi; int i = 0, j = 0, n = 0, k = 0, m = 0, istep, mmax; - float* in_buf = - static_cast(malloc(sizeof(float) * 2 * N)); // c.r&c.i float* out_buf; float den; if (isign == -1) @@ -592,29 +591,29 @@ int dit_r2_fft(xcomplex* input, xcomplex* output, int N, int isign) { output[i].r = out_buf[i * 2 + 1] / den; output[i].i = out_buf[i * 
2 + 2] / den; } - free(in_buf); return 0; } /* compute energy of frame */ -float compute_energy(const float* input, int L) { - float energy = 0; - for (int i = 0; i < L; i++) { - energy += input[i] * input[i]; - } - return energy; +float compute_energy(const float* input, int L){ + float energy = 0; + for (int i = 0; i < L; i++){ + energy += input[i] * input[i]; + } + return energy; } /* do pre_emphasis on frame */ -int do_frame_preemphasis(float* input, float* output, int i_size, float coef) { - if (coef == 0.0) { +int do_frame_preemphasis(float* input, float* output, int i_size, float coef){ + if (coef == 0.0){ + memcpy(output, input, sizeof(float) * i_size); + return 0; + } memcpy(output, input, sizeof(float) * i_size); + for (int i = i_size - 1; i > 0; i--) + output[i] -= coef * output[i-1]; + output[0] -= coef * output[0]; return 0; - } - memcpy(output, input, sizeof(float) * i_size); - for (int i = i_size - 1; i > 0; i--) output[i] -= coef * output[i - 1]; - output[0] -= coef * output[0]; - return 0; } /* return subvector */ diff --git a/core/ops/kernels/support_functions.h b/core/ops/kernels/support_functions.h index 105dc9f7..3cf27f2f 100644 --- a/core/ops/kernels/support_functions.h +++ b/core/ops/kernels/support_functions.h @@ -27,8 +27,6 @@ limitations under the License. 
#include #include -using namespace std; - #include "kernels/complex_defines.h" #ifndef M_PI @@ -39,9 +37,10 @@ using namespace std; #define M_2PI 6.283185307179586476925286766559005 #endif +using namespace std; + namespace delta { typedef float BaseFloat; - /* compute mean */ float compute_mean(float* input, int i_size); @@ -105,7 +104,7 @@ int do_levinson(int pord, float* r, float* a); int compute_lpc(int ncep, int nfrm, int pord, float* x, float* y); /* radix-2 DIT FFT */ -int dit_r2_fft(xcomplex* input, xcomplex* output, int N, int isign); +int dit_r2_fft(xcomplex* input, xcomplex* output, float* in_buf, int N, int isign); /* compute energy of frame */ float compute_energy(const float* input, int L); diff --git a/core/ops/kernels/synthfiltbank.cc b/core/ops/kernels/synthfiltbank.cc index e1bb9901..946f69f1 100644 --- a/core/ops/kernels/synthfiltbank.cc +++ b/core/ops/kernels/synthfiltbank.cc @@ -68,7 +68,7 @@ int Synthfiltbank::proc_sfb(const float* powspc, const float* phaspc) { xcomplex* win = static_cast(malloc(sizeof(xcomplex) * i_FFTSiz)); xcomplex* fftwin = static_cast(malloc(sizeof(xcomplex) * i_FFTSiz)); - + float* fft_buf = static_cast(malloc(sizeof(float) * 2 * i_FFTSiz)); /* generate window */ gen_window(pf_WINDOW, i_WinLen, s_WinTyp); @@ -84,7 +84,7 @@ int Synthfiltbank::proc_sfb(const float* powspc, const float* phaspc) { fftwin[k].i = -1.0f * fftwin[i_FFTSiz - k].i; } /* ifft */ - dit_r2_fft(fftwin, win, i_FFTSiz, 1); + dit_r2_fft(fftwin, win, fft_buf, i_FFTSiz, 1); for (k = 0; k < i_WinLen; k++) { pf_wav[n * i_FrmLen + k] += @@ -94,7 +94,7 @@ int Synthfiltbank::proc_sfb(const float* powspc, const float* phaspc) { free(win); free(fftwin); - + free(fft_buf); return 1; } diff --git a/core/ops/kernels/x_ops.cc b/core/ops/kernels/x_ops.cc index 20681190..601124cd 100644 --- a/core/ops/kernels/x_ops.cc +++ b/core/ops/kernels/x_ops.cc @@ -365,6 +365,8 @@ REGISTER_OP("Pitch") REGISTER_OP("FramePow") .Input("input_data: float") .Input("sample_rate: float") 
+ .Attr("snip_edges: int = 1") + .Attr("remove_dc_offset: bool = true") .Attr("window_length: float = 0.025") .Attr("frame_length: float = 0.010") .Output("output: float") @@ -421,11 +423,12 @@ REGISTER_OP("Spectrum") .Attr("frame_length: float = 0.010") .Attr("window_type: string") .Attr("output_type: int = 2") - .Attr("snip_edges: bool = true") + .Attr("snip_edges: int = 1") .Attr("raw_energy: int = 1") .Attr("preEph_coeff: float = 0.97") .Attr("remove_dc_offset: bool = true") .Attr("is_fbank: bool = true") + .Attr("dither: float = 0.0") .Output("output: float") .SetShapeFn(SpectrumShapeFn) .Doc(R"doc( @@ -529,9 +532,24 @@ filterbank_channel_count: int, resolution of the Mel bank used internally. output: float, fbank features, a tensor of shape [audio_channels, spectrogram_length, bank_feat_dim]. )doc"); +REGISTER_OP("Speed") + .Input("input_data: float") + .Input("sample_rate: int32") + .Input("resample_freq: int32") + .Attr("lowpass_filter_width: int = 1") + .Output("output: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c){ + return Status::OK(); + }) + .Doc(R"doc( + Create pitch feature files. + input_data: float, input wave, a tensor of shape [1, data_length]. + sample_rate: float, NB 8000, WB 16000 etc. 
+ )doc"); + REGISTER_OP("MfccDct") .Input("fbank: float") - .Input("spectrum: float") + .Input("framepow: float") .Input("sample_rate: int32") .Attr("coefficient_count: int = 13") .Attr("cepstral_lifter: float = 22") diff --git a/core/ops/py_x_ops.py b/core/ops/py_x_ops.py index 83941e33..681e4587 100644 --- a/core/ops/py_x_ops.py +++ b/core/ops/py_x_ops.py @@ -51,6 +51,7 @@ delta_delta = gen_x_ops.delta_delta mfcc = gen_x_ops.mfcc_dct add_rir_noise_aecres = gen_x_ops.add_rir_noise_aecres +speed = gen_x_ops.speed def jieba_cut(input_sentence, use_file=True, hmm=True): diff --git a/delta/data/frontend/add_noise_end_to_end.py b/delta/data/frontend/add_noise_end_to_end.py index 4820feda..26329fb0 100644 --- a/delta/data/frontend/add_noise_end_to_end.py +++ b/delta/data/frontend/add_noise_end_to_end.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model adds noise/rir to signal and writes it to file.""" import delta.compat as tf from delta.utils.hparam import HParams @@ -23,7 +24,10 @@ class AddNoiseEndToEnd(BaseFrontend): - + """ + Add a random signal-to-noise ratio noise or impulse response to clean speech, and + write it to wavfile. + """ def __init__(self, config: dict): super().__init__(config) self.add_noise = Add_rir_noise_aecres(config) diff --git a/delta/data/frontend/add_noise_end_to_end_test.py b/delta/data/frontend/add_noise_end_to_end_test.py index 4e4dd5d6..7152a822 100644 --- a/delta/data/frontend/add_noise_end_to_end_test.py +++ b/delta/data/frontend/add_noise_end_to_end_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +"""The model tests OP of Add_noise_rir_end_to_end """ import os from pathlib import Path @@ -33,7 +34,9 @@ def change_file_path(scp_path, filetype, newfilePath): class AddNoiseEndToEndTest(tf.test.TestCase): - + """ + AddNoiseEndToEnd OP test. + """ def test_add_noise_end_to_end(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/add_rir_noise_aecres.py b/delta/data/frontend/add_rir_noise_aecres.py index b7728362..35426409 100644 --- a/delta/data/frontend/add_rir_noise_aecres.py +++ b/delta/data/frontend/add_rir_noise_aecres.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model adds noise/rir to signal.""" import delta.compat as tf from delta.utils.hparam import HParams @@ -21,7 +22,9 @@ class Add_rir_noise_aecres(BaseFrontend): - + """ + Add a random signal-to-noise ratio noise or impulse response to clean speech. + """ def __init__(self, config: dict): super().__init__(config) @@ -71,8 +74,10 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ Caculate power spectrum or log power spectrum of audio data. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: [option]the samplerate of the signal we working with, + default is 16kHz. :return: A float tensor of size N containing add-noise audio. 
""" diff --git a/delta/data/frontend/add_rir_noise_aecres_test.py b/delta/data/frontend/add_rir_noise_aecres_test.py index 2b266d42..05939dbb 100644 --- a/delta/data/frontend/add_rir_noise_aecres_test.py +++ b/delta/data/frontend/add_rir_noise_aecres_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests OP of Add_noise_rir """ import os from pathlib import Path @@ -35,7 +36,9 @@ def change_file_path(scp_path, filetype, newfilePath): class AddRirNoiseAecresTest(tf.test.TestCase): - + """ + AddNoiseRIR OP test. + """ def test_add_rir_noise_aecres(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/analyfiltbank.py b/delta/data/frontend/analyfiltbank.py index 6d68be4f..7ce91df2 100644 --- a/delta/data/frontend/analyfiltbank.py +++ b/delta/data/frontend/analyfiltbank.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model extracts power-spectrum && phase-spectrum features per frame.""" import delta.compat as tf from core.ops import py_x_ops @@ -21,7 +22,10 @@ class Analyfiltbank(BaseFrontend): - + """ + Compute power-spectrum && phase-spectrum features of every frame in speech, + return two float tensors with size (num_frames, num_frequencies). + """ def __init__(self, config: dict): super().__init__(config) @@ -29,9 +33,13 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config: contains three optional parameters:window_length(float, default=0.030), - frame_length(float, default=0.010), sample_rate(int, default=16000). - :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. 
+ :param config: contains three optional parameters: + --sample_rate : Waveform data sample frequency (must match the waveform + file, if specified there). (float, default = 16000) + --window_length : Window length in seconds. (float, default = 0.030) + --frame_length : Hop length in seconds. (float, default = 0.010) + :return: An object of class HParams, which is a set of hyperparameters as + name-value pairs. """ window_length = 0.030 @@ -51,13 +59,15 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ Caculate power spectrum and phase spectrum of audio data. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: [option]the samplerate of the signal we working with, + default is 16kHz. :return: Two returns: - power spectrum —— A float tensor of size (num_frames, num_frequencies) containing - power spectrum and of every frame in speech. - phase spectrum —— A float tensor of size (num_frames, num_frequencies) containing - phase spectrum and of every frame in speech. + power spectrum —— A float tensor of size (num_frames, num_frequencies) + containing power spectrum and of every frame in speech. + phase spectrum —— A float tensor of size (num_frames, num_frequencies) + containing phase spectrum and of every frame in speech. """ p = self.config diff --git a/delta/data/frontend/analyfiltbank_test.py b/delta/data/frontend/analyfiltbank_test.py index 73ea0bf6..d8007a8b 100644 --- a/delta/data/frontend/analyfiltbank_test.py +++ b/delta/data/frontend/analyfiltbank_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +"""The model tests Analyfiltbank FE.""" from pathlib import Path import numpy as np @@ -24,7 +25,9 @@ class Test(tf.test.TestCase): - + """ + Analyfiltbank extraction test. + """ def test_analyfiltbank(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/cepstrum.py b/delta/data/frontend/cepstrum.py index 5e98c368..1cf9e011 100644 --- a/delta/data/frontend/cepstrum.py +++ b/delta/data/frontend/cepstrum.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model extracts Cepstrum features per frame.""" import delta.compat as tf @@ -22,7 +23,10 @@ class Cepstrum(BaseFrontend): - + """ + Compute Cepstrum features of every frame in speech, return a float tensor + with size (num_frames, ceps_subband_num). + """ def __init__(self, config: dict): super().__init__(config) @@ -30,10 +34,15 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config: contains five optional parameters:window_length(float, default=0.025), - frame_length(float, default=0.010), sample_rate(int, default=16000), - ceps_subband_num(int, default=13), tag_ceps_mean_norm(bool, default=True). - :return:An object of class HParams, which is a set of hyperparameters as name-value pairs. + :param config: contains five optional parameters: + --sample_rate : Waveform data sample frequency (must match the waveform + file, if specified there). (float, default = 16000) + --window_length : Window length in seconds. (float, default = 0.025) + --frame_length : Hop length in seconds. (float, default = 0.010) + --ceps_subband_num : Number of Ceps_subband. (int, default=13). + --tag_ceps_mean_norm : Flag of tag_ceps_mean_norm. (bool, default=True). 
+ :return:An object of class HParams, which is a set of hyperparameters as + name-value pairs. """ window_length = 0.025 @@ -57,10 +66,13 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ Caculate cepstrum of audio data. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. - :return:A float tensor of size (num_frames, ceps_subband_num) containing normalized cepstrum - (tag_ceps_mean_norm = True) or cepstrum (tag_ceps_mean_norm = False) of every frame in speech. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: [option]the samplerate of the signal we working with, + default is 16kHz. + :return:A float tensor of size (num_frames, ceps_subband_num) containing + normalized cepstrum (tag_ceps_mean_norm = True) or cepstrum + (tag_ceps_mean_norm = False) of every frame in speech. """ p = self.config diff --git a/delta/data/frontend/cepstrum_test.py b/delta/data/frontend/cepstrum_test.py index 550842ed..fcbb4b4d 100644 --- a/delta/data/frontend/cepstrum_test.py +++ b/delta/data/frontend/cepstrum_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests Cepstrum FE.""" import numpy as np from pathlib import Path @@ -24,7 +25,9 @@ class CepstrumTest(tf.test.TestCase): - + """ + Cepstrum extraction test. 
+ """ def test_cepstrum(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/cmvn.py b/delta/data/frontend/cmvn.py index 0cdf7750..7717f9d0 100644 --- a/delta/data/frontend/cmvn.py +++ b/delta/data/frontend/cmvn.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model computes CMVN of features.""" import io import kaldiio @@ -22,13 +23,27 @@ class CMVN(BaseFrontend): - + """ + Compute and apply CMVN to features. + """ def __init__(self, config: dict): super().__init__(config) @classmethod def params(cls, config=None): - + """ + Set params. + :param config: contains seven optional parameters: + --norm_means : Flag of norm_means. (bool, default=True) + --norm_vars : Flag of norm_vars. (bool, default=False) + --utt2spk : Use for speaker CMVN. (string, default=None) + --spk2utt : Rspecifier for speaker to utterance-list map. + (string, default=None) + --reverse : Flag of reverse. (bool, default=False) + --std_floor : Floor to std. (float, default=1.0e-20) + --filetype : Type of input file. (string, default='mat') + :return: + """ norm_means = True norm_vars = False utt2spk = None @@ -52,7 +67,11 @@ def params(cls, config=None): return hparams def call(self, stats): - + """ + Do CMVN. + :param stats: Statistics of features. + :return: Mean and std of features. + """ p = self.config if isinstance(stats, dict): diff --git a/delta/data/frontend/delta_delta.py b/delta/data/frontend/delta_delta.py index 911a5955..430967e8 100644 --- a/delta/data/frontend/delta_delta.py +++ b/delta/data/frontend/delta_delta.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +"""This model does delta_delta to features.""" import delta.compat as tf @@ -22,7 +23,9 @@ class DeltaDelta(BaseFrontend): - + """ + Do Delta_delta to features. + """ def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/delta_delta_test.py b/delta/data/frontend/delta_delta_test.py index 616b6a4e..d8f0cc74 100644 --- a/delta/data/frontend/delta_delta_test.py +++ b/delta/data/frontend/delta_delta_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests Delta_delta FE.""" import delta.compat as tf from delta.data.frontend.delta_delta import DeltaDelta @@ -22,7 +23,9 @@ class Delta_delta_Test(tf.test.TestCase): - + """ + Delta_delta extraction test. + """ def test_delta_delta(self): self.feat_dim = 80 diff --git a/delta/data/frontend/fbank.py b/delta/data/frontend/fbank.py index 06f8fbe4..3bd55ab7 100644 --- a/delta/data/frontend/fbank.py +++ b/delta/data/frontend/fbank.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model extracts Fbank features per frame.""" import tensorflow as tf from core.ops import py_x_ops @@ -22,7 +23,11 @@ class Fbank(BaseFrontend): - + """ + Computing filter banks is applying triangular filters on a Mel-scale to the power + spectrum to extract frequency bands. Return a float tensor with shape + (num_channels, num_frames, num_frequencies). + """ def __init__(self, config: dict): super().__init__(config) self.spect = Spectrum(config) @@ -31,20 +36,34 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config: contains thirteen optional parameters. 
- --sample_rate : Sample frequency of waveform data. (int, default = 16000) - --window_length : Window length in seconds. (float, default = 0.025) - --frame_length : Hop length in seconds. (float, default = 0.010) - --snip_edges : If True, the last frame (shorter than window_length) will be cutoff. If False, 1 // 2 frame_length data will be padded to data. (int, default = True) - ---raw_energy : If 1, compute frame energy before preemphasis and windowing. If 2, compute frame energy after preemphasis and windowing. (int, default = 1) - --preeph_coeff : Coefficient for use in frame-signal preemphasis. (float, default = 0.97) - --window_type : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). (string, default = "povey") - --remove_dc_offset : Subtract mean from waveform on each frame (bool, default = true) - --is_fbank : If true, compute power spetrum without frame energy. If false, using the frame energy instead of the square of the constant component of the signal. (bool, default = true) - --output_type : If 1, return power spectrum. If 2, return log-power spectrum. (int, default = 1) - --upper_frequency_limit : High cutoff frequency for mel bins (if < 0, offset from Nyquist) (float, default = 0) - --lower_frequency_limit : Low cutoff frequency for mel bins (float, default = 20) - --filterbank_channel_count : Number of triangular mel-frequency bins (float, default = 23) + :param config: contains thirteen optional parameters: + --window_length : Window length in seconds. (float, default = 0.025) + --frame_length : Hop length in seconds. (float, default = 0.010) + --snip_edges : If 1, the last frame (shorter than window_length) will be + cutoff. If 2, 1 // 2 frame_length data will be padded + to data. (int, default = 1) + ---raw_energy : If 1, compute frame energy before preemphasis and + windowing. If 2, compute frame energy after + preemphasis and windowing. (int, default = 1) + --preeph_coeff : Coefficient for use in frame-signal preemphasis. 
+ (float, default = 0.97) + --window_type : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). + (string, default = "povey") + --remove_dc_offset : Subtract mean from waveform on each frame. + (bool, default = true) + --is_fbank : If true, compute power spetrum without frame energy. + If false, using the frame energy instead of the + square of the constant component of the signal. + (bool, default = true) + --output_type : If 1, return power spectrum. If 2, return log-power + spectrum. (int, default = 1) + --upper_frequency_limit : High cutoff frequency for mel bins (if <= 0, offset + from Nyquist) (float, default = 0) + --lower_frequency_limit : Low cutoff frequency for mel bins (float, default = 20) + --filterbank_channel_count : Number of triangular mel-frequency bins. + (float, default = 23) + --dither : Dithering constant (0.0 means no dither). + (float, default = 1) [add robust to training] :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. """ @@ -55,12 +74,13 @@ def params(cls, config=None): frame_length = 0.010 output_type = 1 sample_rate = 16000 - snip_edges = True + snip_edges = 1 raw_energy = 1 preeph_coeff = 0.97 window_type = 'povey' remove_dc_offset = True is_fbank = True + dither = 0.0 hparams = HParams(cls=cls) hparams.add_hparam('upper_frequency_limit', upper_frequency_limit) @@ -76,6 +96,7 @@ def params(cls, config=None): hparams.add_hparam('window_type', window_type) hparams.add_hparam('remove_dc_offset', remove_dc_offset) hparams.add_hparam('is_fbank', is_fbank) + hparams.add_hparam('dither', dither) if config is not None: hparams.override_from_dict(config) @@ -84,11 +105,13 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ - Caculate fbank features of audio data. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. 
- :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing - fbank features of every frame in speech. + Caculate fbank features of audio data. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: [option]the samplerate of the signal we working with, + default is 16kHz. + :return: A float tensor of size (num_frames, num_frequencies, num_channels) containing + fbank features of every frame in speech. """ p = self.config with tf.name_scope('fbank'): @@ -116,4 +139,10 @@ def call(self, audio_data, sample_rate=None): lower_frequency_limit=p.lower_frequency_limit, filterbank_channel_count=p.filterbank_channel_count) + fbank = tf.squeeze(fbank, axis=0) + shape = tf.shape(fbank) + nframe = shape[0] + nfbank = shape[1] + fbank = tf.reshape(fbank, (nframe, nfbank, 1)) + return fbank diff --git a/delta/data/frontend/fbank_pitch.py b/delta/data/frontend/fbank_pitch.py index 3cb53445..84e2e240 100644 --- a/delta/data/frontend/fbank_pitch.py +++ b/delta/data/frontend/fbank_pitch.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model extracts Fbank && Pitch features per frame.""" import delta.compat as tf from delta.utils.hparam import HParams @@ -22,7 +23,10 @@ class FbankPitch(BaseFrontend): - + """ + Compute Fbank && Pitch features respectively,and concate them. Return + a tensor with shape (num_frames, dim_features). + """ def __init__(self, config: dict): super().__init__(config) self.fbank = Fbank(config) @@ -32,10 +36,79 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. 
- :param config: contains eight optional parameters:upper_frequency_limit(float, default=4000.0), - lower_frequency_limit(float, default=20.0), filterbank_channel_count(float, default=40.0), - window_length(float, default=0.025), frame_length(float, default=0.010), - thres_autoc(float, default=0.3), output_type(int, default=2), sample_rate(int, default=16000). + :param config: contains twenty-nine optional parameters: + --sample_rate : Samplerate of the signal we working with. + (int, default = 16000) + --window_length : Window length in seconds. (float, default = 0.025) + --frame_length : Hop length in seconds. (float, default = 0.010) + --snip_edges : If 1, the last frame (shorter than window_length) will + be cutoff. If 2, 1 // 2 frame_length data will be padded + to data. (int, default = 1) + ---raw_energy : If 1, compute frame energy before preemphasis and + windowing. If 2, compute frame energy after preemphasis + and windowing. (int, default = 1) + --preEph_coeff : Coefficient for use in frame-signal preemphasis. + (float, default = 0.97) + --window_type : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). + (string, default = "povey") + --remove_dc_offset : Subtract mean from waveform on each frame. + (bool, default = true) + --is_fbank : If true, compute power spetrum without frame + energy. If false, using the frame energy instead + of the square of the constant component of the + signal. (bool, default = true) + --output_type : If 1, return power spectrum. If 2, return + log-power spectrum. (int, default = 1) + --upper_frequency_limit : High cutoff frequency for mel bins. + (if <= 0, offset from Nyquist) (float, default = 0) + --lower_frequency_limit : Low cutoff frequency for mel bins. + (float, default = 20) + --filterbank_channel_count : Number of triangular mel-frequency bins. + (float, default = 23) + --dither : Dithering constant (0.0 means no dither). 
+ (float, default = 1) + [add robust to training] + --delta-pitch : Smallest relative change in pitch that our + algorithm measures. (float, default = 0.005) + --frames-per-chunk : Only relevant for offline pitch extraction. + (e.g. compute-kaldi-pitch-feats), you can set it to a + small nonzero value, such as 10, for better feature + compatibility with online decoding (affects energy + normalization in the algorithm) (int, default = 0) + --lowpass-cutoff : cutoff frequency for LowPass filter (Hz). + (float, default = 1000) + --lowpass-filter-width : Integer that determines filter width of lowpass filter, + more gives sharper filter (int, default = 1) + --max-f0 : max. F0 to search for (Hz) (float, default = 400) + --max-frames-latency : Maximum number of frames of latency that we allow pitch + tracking to introduce into the feature processing + (affects output only if --frames-per-chunk > 0 and + --simulate-first-pass-online=true (int, default = 0) + --min-f0 : min. F0 to search for (Hz) (float, default = 50) + --nccf-ballast : Increasing this factor reduces NCCF for quiet frames. + (float, default = 7000) + --nccf-ballast-online : This is useful mainly for debug; it affects how the + NCCF ballast is computed. (bool, default = false) + --penalty-factor : cost factor for FO change. (float, default = 0.1) + --preemphasis-coefficient : Coefficient for use in signal preemphasis (deprecated) + (float, default = 0) + --recompute-frame : Only relevant for online pitch extraction, or for + compatibility with online pitch extraction. A + non-critical parameter; the frame at which we recompute + some of the forward pointers, after revising our + estimate of the signal energy. Relevant + if--frames-per-chunk > 0. (int, default = 500) + --resample-frequency : Frequency that we down-sample the signal to. 
Must be + more than twice lowpass-cutoff (float, default = 4000) + --simulate-first-pass-online : If true, compute-kaldi-pitch-feats will output features + that correspond to what an online decoder would see in + the first pass of decoding-- not the final version of + the features, which is the default. Relevant if + --frames-per-chunk > 0 (bool, default = false) + --soft-min-f0 : Minimum f0, applied in soft way, must not exceed + min-f0 (float, default = 10) + --upsample-filter-width : Integer that determines filter width when upsampling + NCCF (int, default = 5) :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. """ hparams = HParams(cls=cls) @@ -51,8 +124,9 @@ def params(cls, config=None): remove_dc_offset = True is_fbank = True output_type = 1 + dither = 0.0 sample_rate = 16000 - snip_edges = True + snip_edges = 1 preemph_coeff = 0.0 min_f0 = 50.0 max_f0 = 400.0 @@ -73,6 +147,7 @@ def params(cls, config=None): hparams.add_hparam('sample_rate', sample_rate) hparams.add_hparam('snip_edges', snip_edges) hparams.add_hparam('preemph_coeff', preemph_coeff) + hparams.add_hparam('dither', dither) hparams.add_hparam('min_f0', min_f0) hparams.add_hparam('max_f0', max_f0) hparams.add_hparam('soft_min_f0', soft_min_f0) @@ -108,9 +183,11 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ Caculate fbank && pitch(concat) features of wav. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. - :return: A tensor with shape (num_frames, dim_features), containing fbank && pitch feature of every frame in speech. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: the samplerate of the signal we working with. 
+ :return: A tensor with shape (num_frames, dim_features), containing + fbank && pitch feature of every frame in speech. """ p = self.config diff --git a/delta/data/frontend/fbank_pitch_test.py b/delta/data/frontend/fbank_pitch_test.py index 07c917e0..b778f7e6 100644 --- a/delta/data/frontend/fbank_pitch_test.py +++ b/delta/data/frontend/fbank_pitch_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests Fbank&&Pitch FE.""" import delta.compat as tf import os @@ -23,7 +24,9 @@ class FbankPitchTest(tf.test.TestCase): - + """ + Compare Fbank&&Pitch FE with kaldi. + """ def test_FbankPitch(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/fbank_test.py b/delta/data/frontend/fbank_test.py index 03f4cb2b..c26fe6bb 100644 --- a/delta/data/frontend/fbank_test.py +++ b/delta/data/frontend/fbank_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests Fbank FE.""" import os import numpy as np @@ -25,7 +26,9 @@ class FbankTest(tf.test.TestCase): - + """ + Test Fbank FE using 8k/16k wav files. 
+ """ def test_fbank(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) @@ -36,7 +39,7 @@ def test_fbank(self): 'window_length': 0.025, 'output_type': 1, 'frame_length': 0.010, - 'snip_edges': True + 'snip_edges': 1 } fbank = Fbank.params(config).instantiate() fbank_test = fbank(input_data, sample_rate) @@ -48,7 +51,7 @@ def test_fbank(self): [3.803553, 5.450971, 6.547878, 5.796172, 6.397846, 7.242926]]) self.assertAllClose( - np.squeeze(fbank_test.eval()[0, 0:2, 0:6]), + np.squeeze(fbank_test.eval()[0:2, 0:6, 0]), real_fank_feats, rtol=1e-05, atol=1e-05) diff --git a/delta/data/frontend/framepow.py b/delta/data/frontend/framepow.py index 443c7579..5d54ca95 100644 --- a/delta/data/frontend/framepow.py +++ b/delta/data/frontend/framepow.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +""""This model extracts framepow features per frame.""" import delta.compat as tf @@ -22,7 +23,10 @@ class Framepow(BaseFrontend): - + """ + Compute power of every frame in speech. Return a float tensor with + shape (1 * num_frames). + """ def __init__(self, config: dict): super().__init__(config) @@ -30,18 +34,29 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config: contains three optional parameters:window_length(float, default=0.025), - frame_length(float, default=0.010), sample_rate(int, default=16000). + :param config: contains five optional parameters: + --sample_rate : Waveform data sample frequency (must match the waveform + file, if specified there). (float, default = 16000) + --window_length : Window length in seconds. (float, default = 0.025) + --frame_length : Hop length in seconds. (float, default = 0.010) + --snip_edges : If True, the last frame (shorter than window_length) + will be cutoff. 
If False, 1 // 2 frame_length data will + be padded to data. (int, default = True) + --remove_dc_offset : Subtract mean from waveform on each frame (bool, default = true) :return:An object of class HParams, which is a set of hyperparameters as name-value pairs. """ window_length = 0.025 frame_length = 0.010 + snip_edges = 1 + remove_dc_offset = True sample_rate = 16000 hparams = HParams(cls=cls) hparams.add_hparam('window_length', window_length) hparams.add_hparam('frame_length', frame_length) + hparams.add_hparam('snip_edges', snip_edges) + hparams.add_hparam('remove_dc_offset', remove_dc_offset) hparams.add_hparam('sample_rate', sample_rate) if config is not None: @@ -51,11 +66,14 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ - Caculate power of every frame in speech. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. - :return:A float tensor of size (1, num_frames) containing power of every frame in speech. - """ + Caculate power of every frame in speech. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: [option]the samplerate of the signal we working with, + default is 16kHz. + :return:A float tensor of size (1 * num_frames) containing power of every + frame in speech. 
+ """ p = self.config with tf.name_scope('framepow'): @@ -68,10 +86,11 @@ def call(self, audio_data, sample_rate=None): with tf.control_dependencies([assert_op]): sample_rate = tf.cast(sample_rate, dtype=float) - framepow = py_x_ops.frame_pow( - audio_data, - sample_rate, - window_length=p.window_length, - frame_length=p.frame_length) - - return framepow + framepow = py_x_ops.frame_pow(audio_data, + sample_rate, + snip_edges=p.snip_edges, + remove_dc_offset=p.remove_dc_offset, + window_length=p.window_length, + frame_length=p.frame_length) + + return tf.squeeze(framepow) diff --git a/delta/data/frontend/framepow_test.py b/delta/data/frontend/framepow_test.py index c21a568b..4a8a879f 100644 --- a/delta/data/frontend/framepow_test.py +++ b/delta/data/frontend/framepow_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests framepow FE.""" import os import numpy as np @@ -25,14 +26,15 @@ class FramepowTest(tf.test.TestCase): - + """ + Framepow extraction test. 
+ """ def test_framepow(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) with self.cached_session(use_gpu=False, force_gpu=False): read_wav = ReadWav.params().instantiate() input_data, sample_rate = read_wav(wav_path) - input_data = input_data / 32768 framepow = Framepow.params({ 'window_length': 0.025, @@ -40,19 +42,11 @@ def test_framepow(self): }).instantiate() framepow_test = framepow(input_data, sample_rate) - output_true = np.array([ - 0.000018, 0.000011, 0.000010, 0.000010, 0.000010, 0.000010, 0.000008, - 0.000009, 0.000009, 0.000009, 0.000009, 0.000011, 0.090164, 0.133028, - 0.156547, 0.053551, 0.056670, 0.097706, 0.405659, 2.119505, 4.296845, - 6.139090, 6.623638, 6.136467, 7.595072, 7.904415, 7.655983, 6.771016, - 5.706427, 4.220942, 3.259599, 2.218259, 1.911394, 2.234246, 3.056905, - 2.534153, 0.464354, 0.013493, 0.021231, 0.148362, 0.364829, 0.627266, - 0.494912, 0.366029, 0.315408, 0.312441, 0.323796, 0.267505, 0.152856, - 0.045305 - ]) + real_framepow_feats = np.array( + [9.819611, 9.328745, 9.247337, 9.26451, 9.266059]) self.assertEqual(tf.rank(framepow_test).eval(), 1) - self.assertAllClose(framepow_test.eval().flatten()[:50], output_true) + self.assertAllClose(framepow_test.eval()[0 : 5], real_framepow_feats) if __name__ == '__main__': diff --git a/delta/data/frontend/mfcc.py b/delta/data/frontend/mfcc.py index 9a5de70e..16086b0e 100644 --- a/delta/data/frontend/mfcc.py +++ b/delta/data/frontend/mfcc.py @@ -13,46 +13,62 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +"""This model extracts MFCC features per frame.""" import delta.compat as tf from core.ops import py_x_ops from delta.utils.hparam import HParams from delta.data.frontend.base_frontend import BaseFrontend from delta.data.frontend.fbank import Fbank -from delta.data.frontend.spectrum import Spectrum +from delta.data.frontend.framepow import Framepow import copy class Mfcc(BaseFrontend): - + """ + Compute mfcc features of every frame in speech, return a float tensor + with size (num_channels, num_frames, num_frequencies). + """ def __init__(self, config: dict): super().__init__(config) - config1 = copy.deepcopy(config) - config1['is_fbank'] = False - config1['output_type'] = 2 - self.spect = Spectrum(config1) + self.framepow = Framepow(config) self.fbank = Fbank(config) @classmethod def params(cls, config=None): """ Set params. - :param config: contains fifthteen optional parameters. - --sample_rate : Sample frequency of waveform data. (int, default = 16000) + :param config: contains fourteen optional parameters. --window_length : Window length in seconds. (float, default = 0.025) --frame_length : Hop length in seconds. (float, default = 0.010) - --snip_edges : If 1, the last frame (shorter than window_length) will be cutoff. If 2, 1 // 2 frame_length data will be padded to data. (int, default = 1) - ---raw_energy : If 1, compute frame energy before preemphasis and windowing. If 2, compute frame energy after preemphasis and windowing. (int, default = 1) - --preeph_coeff : Coefficient for use in frame-signal preemphasis. (float, default = 0.97) - --window_type : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). (string, default = "povey") - --remove_dc_offset : Subtract mean from waveform on each frame (bool, default = true) - --is_fbank : If true, compute power spetrum without frame energy. 
If false, using the frame energy instead of the square of the constant component of the signal. (bool, default = true) - --output_type : If 1, return power spectrum. If 2, return log-power spectrum. (int, default = 1) - --upper_frequency_limit : High cutoff frequency for mel bins (if < 0, offset from Nyquist) (float, default = 0) - --lower_frequency_limit : Low cutoff frequency for mel bins (float, default = 20) - --filterbank_channel_count : Number of triangular mel-frequency bins (float, default = 23) - --coefficient_count : Number of cepstra in MFCC computation.(int, default = 13) - --cepstral_lifter : Constant that controls scaling of MFCCs.(float, default = 22) + --snip_edges : If 1, the last frame (shorter than window_length) will + be cutoff. If 2, 1 // 2 frame_length data will be padded + to data. (int, default = 1) + ---raw_energy : If 1, compute frame energy before preemphasis and + windowing. If 2, compute frame energy after + preemphasis and windowing. (int, default = 1) + --preEph_coeff : Coefficient for use in frame-signal preemphasis. + (float, default = 0.97) + --window_type : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). + (string, default = "povey") + --remove_dc_offset : Subtract mean from waveform on each frame + (bool, default = true) + --is_fbank : If true, compute power spetrum without frame energy. If + false, using the frame energy instead of the square of the + constant component of the signal. (bool, default = true) + --output_type : If 1, return power spectrum. If 2, return log-power + spectrum. (int, default = 1) + --upper_frequency_limit : High cutoff frequency for mel bins (if < 0, offset from + Nyquist) (float, default = 0) + --lower_frequency_limit : Low cutoff frequency for mel bins (float, default = 20) + --filterbank_channel_count : Number of triangular mel-frequency bins. + (float, default = 23) + --coefficient_count : Number of cepstra in MFCC computation. 
+ (int, default = 13) + --cepstral_lifter : Constant that controls scaling of MFCCs. + (float, default = 22) + --use_energy :Use energy (not C0) in MFCC computation. + (bool, default = True) :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. """ @@ -63,7 +79,7 @@ def params(cls, config=None): frame_length = 0.010 output_type = 1 sample_rate = 16000 - snip_edges = True + snip_edges = 1 raw_energy = 1 preeph_coeff = 0.97 window_type = 'povey' @@ -72,6 +88,7 @@ def params(cls, config=None): cepstral_lifter = 22.0 coefficient_count = 13 use_energy = True + dither = 0.0 hparams = HParams(cls=cls) hparams.add_hparam('upper_frequency_limit', upper_frequency_limit) @@ -90,6 +107,7 @@ def params(cls, config=None): hparams.add_hparam('cepstral_lifter', cepstral_lifter) hparams.add_hparam('coefficient_count', coefficient_count) hparams.add_hparam('use_energy', use_energy) + hparams.add_hparam('dither', dither) if config is not None: hparams.override_from_dict(config) @@ -99,10 +117,11 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ Caculate mfcc features of audio data. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. - :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing - mfcc features of every frame in speech. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: the samplerate of the signal we working with. + :return: A float tensor of size (num_channels, num_frames, num_frequencies) + containing mfcc features of every frame in speech. 
""" p = self.config with tf.name_scope('mfcc'): @@ -114,14 +133,17 @@ def call(self, audio_data, sample_rate=None): tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32)) with tf.control_dependencies([assert_op]): - spectrum_feats = self.spect(audio_data, sample_rate) - spectrum_feats = tf.expand_dims(spectrum_feats, 0) fbank_feats = self.fbank(audio_data, sample_rate) - mfcc = py_x_ops.mfcc( - fbank_feats, - spectrum_feats, - sample_rate, - use_energy=p.use_energy, - cepstral_lifter=p.cepstral_lifter, - coefficient_count=p.coefficient_count) + sample_rate = tf.cast(sample_rate, dtype=tf.int32) + shape = tf.shape(fbank_feats) + nframe = shape[0] + nfbank = shape[1] + fbank_feats = tf.reshape(fbank_feats, (1, nframe, nfbank)) + framepow_feats = self.framepow(audio_data, sample_rate) + mfcc = py_x_ops.mfcc(fbank_feats, + framepow_feats, + sample_rate, + use_energy=p.use_energy, + cepstral_lifter=p.cepstral_lifter, + coefficient_count=p.coefficient_count) return mfcc diff --git a/delta/data/frontend/mfcc_test.py b/delta/data/frontend/mfcc_test.py index b79a183c..b29d2033 100644 --- a/delta/data/frontend/mfcc_test.py +++ b/delta/data/frontend/mfcc_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests MFCC FE.""" import delta.compat as tf import os @@ -24,7 +25,9 @@ class MfccTest(tf.test.TestCase): - + """ + MFCC extraction test. + """ def test_mfcc(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/pitch.py b/delta/data/frontend/pitch.py index 5c747caf..44097d6b 100644 --- a/delta/data/frontend/pitch.py +++ b/delta/data/frontend/pitch.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +"""This model extracts pitch features per frame.""" import delta.compat as tf from core.ops import py_x_ops @@ -21,7 +22,10 @@ class Pitch(BaseFrontend): - + """ + Compute pitch features of every frame in speech, return a float tensor + with size (num_frames, 2). + """ def __init__(self, config: dict): super().__init__(config) @@ -29,27 +33,57 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config: contains twenty optional parameters: - --delta-pitch : Smallest relative change in pitch that our algorithm measures (float, default = 0.005) - --frame-length : Frame length in milliseconds (float, default = 25) - --frame-shift : Frame shift in milliseconds (float, default = 10) - --frames-per-chunk : Only relevant for offline pitch extraction (e.g. compute-kaldi-pitch-feats), you can set it to a small nonzero value, such as 10, for better feature compatibility with online decoding (affects energy normalization in the algorithm) (int, default = 0) - --lowpass-cutoff : cutoff frequency for LowPass filter (Hz) (float, default = 1000) - --lowpass-filter-width : Integer that determines filter width of lowpass filter, more gives sharper filter (int, default = 1) - --max-f0 : max. F0 to search for (Hz) (float, default = 400) - --max-frames-latency : Maximum number of frames of latency that we allow pitch tracking to introduce into the feature processing (affects output only if --frames-per-chunk > 0 and --simulate-first-pass-online=true (int, default = 0) - --min-f0 : min. F0 to search for (Hz) (float, default = 50) - --nccf-ballast : Increasing this factor reduces NCCF for quiet frames (float, default = 7000) - --nccf-ballast-online : This is useful mainly for debug; it affects how the NCCF ballast is computed. (bool, default = false) - --penalty-factor : cost factor for FO change. 
(float, default = 0.1) - --preemphasis-coefficient : Coefficient for use in signal preemphasis (deprecated) (float, default = 0) - --recompute-frame : Only relevant for online pitch extraction, or for compatibility with online pitch extraction. A non-critical parameter; the frame at which we recompute some of the forward pointers, after revising our estimate of the signal energy. Relevant if--frames-per-chunk > 0 (int, default = 500) - --resample-frequency : Frequency that we down-sample the signal to. Must be more than twice lowpass-cutoff (float, default = 4000) - --sample-frequency : Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000) - --simulate-first-pass-online : If true, compute-kaldi-pitch-feats will output features that correspond to what an online decoder would see in the first pass of decoding-- not the final version of the features, which is the default. Relevant if --frames-per-chunk > 0 (bool, default = false) - --snip-edges : If this is set to false, the incomplete frames near the ending edge won't be snipped, so that the number of frames is the file size divided by the frame-shift. This makes different types of features give the same number of frames. (bool, default = true) - --soft-min-f0 : Minimum f0, applied in soft way, must not exceed min-f0 (float, default = 10) - --upsample-filter-width : Integer that determines filter width when upsampling NCCF (int, default = 5) + :param config: contains nineteen optional parameters: + --sample_rate : Waveform data sample frequency (must match the waveform + file, if specified there). (float, default = 16000) + --delta-pitch : Smallest relative change in pitch that our algorithm + measures (float, default = 0.005) + --window_length : Frame length in seconds (float, default = 0.025) + --frame_length : Frame shift in seconds (float, default = 0.010) + --frames-per-chunk : Only relevant for offline pitch extraction (e.g. 
+ compute-kaldi-pitch-feats), you can set it to a small + nonzero value, such as 10, for better feature + compatibility with online decoding (affects energy + normalization in the algorithm) (int, default = 0) + --lowpass-cutoff : cutoff frequency for LowPass filter (Hz). + (float, default = 1000) + --lowpass-filter-width : Integer that determines filter width of lowpass filter, + more gives sharper filter (int, default = 1) + --max-f0 : max. F0 to search for (Hz) (float, default = 400) + --max-frames-latency : Maximum number of frames of latency that we allow pitch + tracking to introduce into the feature processing + (affects output only if --frames-per-chunk > 0 and + --simulate-first-pass-online=true (int, default = 0) + --min-f0 : min. F0 to search for (Hz) (float, default = 50) + --nccf-ballast : Increasing this factor reduces NCCF for quiet frames. + (float, default = 7000) + --nccf-ballast-online : This is useful mainly for debug; it affects how the NCCF + ballast is computed. (bool, default = false) + --penalty-factor : cost factor for FO change. (float, default = 0.1) + --preemphasis-coefficient : Coefficient for use in signal preemphasis (deprecated). + (float, default = 0) + --recompute-frame : Only relevant for online pitch extraction, or for + compatibility with online pitch extraction. A + non-critical parameter; the frame at which we recompute + some of the forward pointers, after revising our + estimate of the signal energy. Relevant + if--frames-per-chunk > 0. (int, default = 500) + --resample-frequency : Frequency that we down-sample the signal to. Must be + more than twice lowpass-cutoff (float, default = 4000) + --simulate-first-pass-online : If true, compute-kaldi-pitch-feats will output features + that correspond to what an online decoder would see in + the first pass of decoding-- not the final version of + the features, which is the default. 
Relevant if + --frames-per-chunk > 0 (bool, default = false) + --snip-edges : If this is set to false, the incomplete frames near the + ending edge won't be snipped, so that the number of + frames is the file size divided by the frame-shift. + This makes different types of features give the same + number of frames. (bool, default = true) + --soft-min-f0 : Minimum f0, applied in soft way, must not exceed min-f0. + (float, default = 10) + --upsample-filter-width : Integer that determines filter width when upsampling + NCCF. (int, default = 5) :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. """ @@ -103,12 +137,13 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ - Caculate picth features of audio data. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. - :return: A float tensor of size (num_frames, 2) containing - pitch && POV features of every frame in speech. - """ + Caculate picth features of audio data. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: the samplerate of the signal we working with. + :return: A float tensor of size (num_frames, 2) containing + pitch && POV features of every frame in speech. + """ p = self.config with tf.name_scope('pitch'): diff --git a/delta/data/frontend/pitch_test.py b/delta/data/frontend/pitch_test.py index f9ed3c0b..7564522c 100644 --- a/delta/data/frontend/pitch_test.py +++ b/delta/data/frontend/pitch_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +"""The model tests pitch FE.""" import delta.compat as tf import os @@ -24,7 +25,9 @@ class SpectrumTest(tf.test.TestCase): - + """ + Pitch extraction test. + """ def test_spectrum(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/plp.py b/delta/data/frontend/plp.py index 74b3e584..e07bc376 100644 --- a/delta/data/frontend/plp.py +++ b/delta/data/frontend/plp.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model extracts PLP features per frame.""" import delta.compat as tf @@ -22,7 +23,10 @@ class Plp(BaseFrontend): - + """ + Compute PLP features of every frame in speech, return a float tensor + with size (num_frames, plp_order + 1). + """ def __init__(self, config: dict): super().__init__(config) @@ -30,9 +34,12 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config: contains four optional parameters:window_length(float, default=0.025), - frame_length(float, default=0.010), sample_rate(float, default=16000), - plp_order(int, default=12). + :param config: contains four optional parameters: + --sample_rate : Waveform data sample frequency (must match the waveform + file, if specified there). (float, default = 16000) + --window_length : Window length in seconds. (float, default = 0.025) + --frame_length : Hop length in seconds. (float, default = 0.010) + --plp_order : Plp order. (int, default=12). :return:An object of class HParams, which is a set of hyperparameters as name-value pairs. """ @@ -55,9 +62,12 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ Caculate plp features of audio data. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. 
- :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. - :return:A float tensor of size (num_frames, (plp_order + 1)) containing plp features of every frame in speech. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: [option]the samplerate of the signal we working + with, default is 16kHz. + :return:A float tensor of size (num_frames, (plp_order + 1)) containing plp + features of every frame in speech. """ p = self.config diff --git a/delta/data/frontend/plp_test.py b/delta/data/frontend/plp_test.py index eecc343f..6dc54269 100644 --- a/delta/data/frontend/plp_test.py +++ b/delta/data/frontend/plp_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests PLP FE.""" import delta.compat as tf import os @@ -24,7 +25,9 @@ class PlpTest(tf.test.TestCase): - + """ + Plp extraction test. + """ def test_plp(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/read_wav.py b/delta/data/frontend/read_wav.py index 38585d9d..17ff3b3f 100644 --- a/delta/data/frontend/read_wav.py +++ b/delta/data/frontend/read_wav.py @@ -13,32 +13,40 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model reads audio sample from wav file.""" import delta.compat as tf - from delta.utils.hparam import HParams from delta.data.frontend.base_frontend import BaseFrontend - +from core.ops import py_x_ops class ReadWav(BaseFrontend): - + """ + Read audio sample from wav file, return sample data and sample rate. 
+ """ def __init__(self, config: dict): super().__init__(config) @classmethod def params(cls, config=None): """ - Set params. - :param config: contains two optional parameters: audio_channels(int, default=1), - sample_rate(int, default=16000). - :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. - """ + Set params. + :param config: contains three optional parameters: + --sample_rate : Waveform data sample frequency (must match the waveform + file, if specified there). (float, default = 16000) + --speed : Speed of sample channels wanted. (float, default=1.0) + --audio_channels :(int, default=1). + :return: An object of class HParams, which is a set of hyperparameters as + name-value pairs. + """ audio_channels = 1 sample_rate = 16000 + speed = 1.0 hparams = HParams(cls=cls) hparams.add_hparam('audio_channels', audio_channels) hparams.add_hparam('sample_rate', sample_rate) + hparams.add_hparam('speed', speed) if config is not None: hparams.override_from_dict(config) @@ -48,8 +56,9 @@ def params(cls, config=None): def call(self, wavfile): """ Get audio data and sample rate from a wavfile. - :param wavfile: filepath of wav - :return: 2 values. The first is a Tensor of audio data. The second return value is the sample rate of the input wav + :param wavfile: filepath of wav. + :return: 2 values. The first is a Tensor of audio data. + The second return value isthe sample rate of the input wav file, which is a tensor with float dtype. 
""" p = self.config @@ -58,7 +67,16 @@ def call(self, wavfile): contents, desired_channels=p.audio_channels) assert_op = tf.assert_equal( tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32)) + with tf.control_dependencies([assert_op]): - return tf.squeeze( - audio_data * 32768, axis=-1), tf.cast( - sample_rate, dtype=tf.int32) + + if p.speed == 1.0: + return tf.squeeze(audio_data * 32768, axis=-1), tf.cast(sample_rate, dtype=tf.int32) + else: + resample_rate = tf.cast(sample_rate, dtype=tf.float32) * tf.cast( + 1.0 / p.speed, dtype=tf.float32) + speed_data = py_x_ops.speed(tf.squeeze(audio_data * 32768, axis=-1), + tf.cast(sample_rate, dtype=tf.int32), + tf.cast(resample_rate, dtype=tf.int32), + lowpass_filter_width=5) + return tf.squeeze(speed_data), tf.cast(sample_rate, dtype=tf.int32) diff --git a/delta/data/frontend/read_wav_test.py b/delta/data/frontend/read_wav_test.py index d1a2eeb6..2c3f099a 100644 --- a/delta/data/frontend/read_wav_test.py +++ b/delta/data/frontend/read_wav_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests OP of read_wav """ import delta.compat as tf from pathlib import Path @@ -22,17 +23,20 @@ class ReadWavTest(tf.test.TestCase): - + """ + ReadWav OP test. 
+ """ def test_read_wav(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) with self.cached_session(use_gpu=False, force_gpu=False): - read_wav = ReadWav.params({'sample_rate': 16000}).instantiate() + config = {'speed': 1.0} + read_wav = ReadWav.params(config).instantiate() audio_data, sample_rate = read_wav(wav_path) audio_data_true, sample_rate_true = librosa.load(wav_path, sr=16000) - self.assertAllClose(audio_data.eval() / 32768, audio_data_true) - self.assertAllClose(sample_rate.eval(), sample_rate_true) - + if (config['speed'] == 1.0): + self.assertAllClose(audio_data.eval() / 32768, audio_data_true) + self.assertAllClose(sample_rate.eval(), sample_rate_true) if __name__ == '__main__': tf.test.main() diff --git a/delta/data/frontend/spectrum.py b/delta/data/frontend/spectrum.py index 4bd82a1e..f8a350ec 100644 --- a/delta/data/frontend/spectrum.py +++ b/delta/data/frontend/spectrum.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model extracts spetrum features per frame.""" import tensorflow as tf from core.ops import py_x_ops @@ -21,7 +22,10 @@ class Spectrum(BaseFrontend): - + """ + Compute spectrum features of every frame in speech, return a float tensor + with size (num_frames, num_frequencies). + """ def __init__(self, config: dict): super().__init__(config) @@ -29,17 +33,30 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config: contains ten optional parameters. - --sample_rate : Sample frequency of waveform data. (int, default = 16000) + :param config: contains nine optional parameters: + --sample_rate : Waveform data sample frequency (must match the waveform + file, if specified there). (float, default = 16000) --window_length : Window length in seconds. (float, default = 0.025) - --frame_length : Hop length in seconds. 
(float, default = 0.010) - --snip_edges : If True, the last frame (shorter than window_length) will be cutoff. If False, 1 // 2 frame_length data will be padded to data. (int, default = True) - ---raw_energy : If 1, compute frame energy before preemphasis and windowing. If 2, compute frame energy after preemphasis and windowing. (int, default = 1) - --preeph_coeff : Coefficient for use in frame-signal preemphasis. (float, default = 0.97) - --window_type : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). (string, default = "povey") - --remove_dc_offset : Subtract mean from waveform on each frame (bool, default = true) - --is_fbank : If true, compute power spetrum without frame energy. If false, using the frame energy instead of the square of the constant component of the signal. (bool, default = false) - --output_type : If 1, return power spectrum. If 2, return log-power spectrum. (int, default = 2) + --frame_length : Hop length in seconds. (float, default = 0.010) + --snip_edges : If 1, the last frame (shorter than window_length) + will be cutoff. If 2, 1 // 2 frame_length data will + be padded to data. (int, default = 1) + ---raw_energy : If 1, compute frame energy before preemphasis and windowing. + If 2, compute frame energy after preemphasis and windowing. + (int, default = 1) + --preeph_coeff : Coefficient for use in frame-signal preemphasis. + (float, default = 0.97) + --window_type : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). + (string, default = "povey") + --remove_dc_offset : Subtract mean from waveform on each frame. + (bool, default = true) + --is_fbank : If true, compute power spetrum without frame energy. + If false, using the frame energy instead of the square of the + constant component of the signal. (bool, default = false) + --output_type : If 1, return power spectrum. If 2, return log-power spectrum. + (int, default = 2) + --dither : Dithering constant (0.0 means no dither). 
+ (float, default = 1) [add robust to training] :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. """ @@ -47,12 +64,13 @@ def params(cls, config=None): frame_length = 0.010 output_type = 2 sample_rate = 16000 - snip_edges = True + snip_edges = 1 raw_energy = 1 preeph_coeff = 0.97 window_type = 'povey' remove_dc_offset = True is_fbank = False + dither = 0.0 hparams = HParams(cls=cls) hparams.add_hparam('window_length', window_length) @@ -65,6 +83,7 @@ def params(cls, config=None): hparams.add_hparam('window_type', window_type) hparams.add_hparam('remove_dc_offset', remove_dc_offset) hparams.add_hparam('is_fbank', is_fbank) + hparams.add_hparam('dither', dither) if config is not None: hparams.override_from_dict(config) @@ -74,10 +93,12 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ Caculate power spectrum or log power spectrum of audio data. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. - :return: A float tensor of size (num_frames, num_frequencies) containing power spectrum (output_type=1) - or log power spectrum (output_type=2) of every frame in speech. + :return: A float tensor of size (num_frames, num_frequencies) containing power + spectrum (output_type=1) or log power spectrum (output_type=2) + of every frame in speech. 
""" p = self.config @@ -102,6 +123,7 @@ def call(self, audio_data, sample_rate=None): preEph_coeff=p.preeph_coeff, window_type=p.window_type, remove_dc_offset=p.remove_dc_offset, - is_fbank=p.is_fbank) + is_fbank=p.is_fbank, + dither=p.dither) return spectrum diff --git a/delta/data/frontend/spectrum_test.py b/delta/data/frontend/spectrum_test.py index 0d095c71..dd7bc11f 100644 --- a/delta/data/frontend/spectrum_test.py +++ b/delta/data/frontend/spectrum_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests spectrum FE.""" import os import numpy as np @@ -24,7 +25,9 @@ class SpectrumTest(tf.test.TestCase): - + ''' + Spectum extraction test. + ''' def test_spectrum(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) @@ -34,7 +37,8 @@ def test_spectrum(self): spectrum = Spectrum.params({ 'window_length': 0.025, - 'snip_edges': True + 'snip_edges': 1, + 'dither':0.0 }).instantiate() spectrum_test = spectrum(input_data, sample_rate) diff --git a/delta/data/frontend/synthfiltbank.py b/delta/data/frontend/synthfiltbank.py index 9c2d7c48..4ab34ba1 100644 --- a/delta/data/frontend/synthfiltbank.py +++ b/delta/data/frontend/synthfiltbank.py @@ -14,8 +14,8 @@ # limitations under the License. 
# ============================================================================== -import delta.compat as tf +import delta.compat as tf from core.ops import py_x_ops from delta.utils.hparam import HParams from delta.data.frontend.base_frontend import BaseFrontend diff --git a/delta/data/frontend/synthfiltbank_test.py b/delta/data/frontend/synthfiltbank_test.py index 2742ba0f..2208d494 100644 --- a/delta/data/frontend/synthfiltbank_test.py +++ b/delta/data/frontend/synthfiltbank_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests Synthfiltbank FE.""" import os from pathlib import Path @@ -25,7 +26,9 @@ class Test(tf.test.TestCase): - + """ + Synthfiltbank extraction test. + """ def test_synthfiltbank(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/write_wav.py b/delta/data/frontend/write_wav.py index 2b74ba4d..aec5d4ce 100644 --- a/delta/data/frontend/write_wav.py +++ b/delta/data/frontend/write_wav.py @@ -60,7 +60,7 @@ def call(self, filename, audio_data, sample_rate=None): assert_op = tf.assert_equal( tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32)) with tf.control_dependencies([assert_op]): - audio_data = tf.cast(audio_data, dtype=tf.float32) + audio_data = tf.cast(audio_data / 32768, dtype=tf.float32) contents = tf.audio.encode_wav( tf.expand_dims(audio_data, 1), tf.cast(sample_rate, dtype=tf.int32)) w = tf.io.write_file(filename, contents) diff --git a/delta/data/frontend/write_wav_test.py b/delta/data/frontend/write_wav_test.py index 39518cc7..470389f4 100644 --- a/delta/data/frontend/write_wav_test.py +++ b/delta/data/frontend/write_wav_test.py @@ -28,15 +28,14 @@ def test_write_wav(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) with self.cached_session(use_gpu=False, force_gpu=False) 
as sess: - read_wav = ReadWav.params().instantiate() + read_wav = ReadWav.params({'speed': 1.1}).instantiate() input_data, sample_rate = read_wav(wav_path) input_data = input_data / 32768 write_wav = WriteWav.params().instantiate() - new_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln_new.wav')) + new_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln_speed.wav')) writewav_op = write_wav(new_path, input_data, sample_rate) sess.run(writewav_op) test_data, test_sample_rate = read_wav(new_path) - test_data = test_data / 32768 self.assertAllEqual(input_data.eval(), test_data.eval()) self.assertAllEqual(sample_rate.eval(), test_sample_rate.eval()) diff --git a/delta/data/frontend/zcr.py b/delta/data/frontend/zcr.py index bda72423..5ffe76f3 100644 --- a/delta/data/frontend/zcr.py +++ b/delta/data/frontend/zcr.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""This model extracts zcr features per frame.""" import delta.compat as tf @@ -22,6 +23,10 @@ class Zcr(BaseFrontend): + """ + Compute ZCR features respectively,and concate them. Return + a tensor with shape (1, num_frames). + """ def __init__(self, config: dict): super().__init__(config) @@ -30,9 +35,13 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config:contains three optional parameters: window_length(float, default=0.025s), - frame_length(float, default=0.010s), and sample_rate(int, default=16000). - :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. + :param config:contains three optional parameters: + --sample_rate : Waveform data sample frequency (must match the waveform + file, if specified there). (float, default = 16000) + --window_length : Window length in seconds. (float, default = 0.025) + --frame_length : Hop length in seconds. 
(float, default = 0.010) + :return: An object of class HParams, which is a set of hyperparameters as + name-value pairs. """ window_length = 0.025 @@ -52,9 +61,12 @@ def params(cls, config=None): def call(self, audio_data, sample_rate=None): """ Calculate the zero-crossing rate of speech. - :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor. - :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz. - :return: A tensor with shape (1, num_frames), containing zero-crossing rate of every frame in speech. + :param audio_data: the audio signal from which to compute spectrum. + Should be an (1, N) tensor. + :param sample_rate: [option]the samplerate of the signal we working with, + default is 16kHz. + :return: A tensor with shape (1, num_frames), containing zero-crossing rate of + every frame in speech. """ p = self.config diff --git a/delta/data/frontend/zcr_test.py b/delta/data/frontend/zcr_test.py index f7d9808a..f670ceee 100644 --- a/delta/data/frontend/zcr_test.py +++ b/delta/data/frontend/zcr_test.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""The model tests ZCR FE.""" import os from pathlib import Path @@ -25,7 +26,9 @@ class ZcrTest(tf.test.TestCase): - + """ + ZCR extraction test.
+ """ def test_zcr(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/utils/speech/compute_fbank_feats.py b/utils/speech/compute_fbank_feats.py index cff4d5db..b1f715c1 100755 --- a/utils/speech/compute_fbank_feats.py +++ b/utils/speech/compute_fbank_feats.py @@ -64,9 +64,14 @@ def get_parser(): help='Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").') parser.add_argument( '--snip_edges', - type=bool, - default=True, + type=int, + default=1, help='The last frame (shorter than window_length) will not be cutoff.') + parser.add_argument( + '--dither', + type=float, + default=0.0, + help='Dithering constant (0.0 means no dither).') parser.add_argument( '--raw_energy', type=int, @@ -131,6 +136,7 @@ def compute_fbank(): config['preeph_coeff'] = args.preeph_coeff config['remove_dc_offset'] = args.remove_dc_offset config['is_fbank'] = args.is_fbank + config['dither'] = args.dither fbank = Fbank.params(config).instantiate() diff --git a/utils/speech/compute_fbank_pitch.py b/utils/speech/compute_fbank_pitch.py index 43f908b3..6d73f68d 100755 --- a/utils/speech/compute_fbank_pitch.py +++ b/utils/speech/compute_fbank_pitch.py @@ -47,6 +47,11 @@ def get_parser(): type=float, default=40, help='Order of fbank') + parser.add_argument( + '--dither', + type=float, + default=0.0, + help='Dithering constant (0.0 means no dither).') parser.add_argument( '--window_length', type=float, default=0.025, help='Length of a frame') parser.add_argument( @@ -133,6 +138,7 @@ def compute_fbank_pitch(): config['remove_dc_offset'] = args.remove_dc_offset config['is_fbank'] = args.is_fbank config['thres_autoc'] = args.thres_autoc + config['dither'] = args.dither fbank_pitch = FbankPitch.params(config).instantiate() diff --git a/utils/speech/compute_mfcc_feats.py b/utils/speech/compute_mfcc_feats.py index 101cf2eb..60f87de0 100755 --- a/utils/speech/compute_mfcc_feats.py +++ b/utils/speech/compute_mfcc_feats.py @@ -87,6 +87,11 @@ def get_parser(): 
type=bool, default=True, help='Compute power spetrum without frame energy.') + parser.add_argument( + '--dither', + type=float, + default=0.0, + help='Dithering constant (0.0 means no dither).') parser.add_argument( '--cepstral_lifter', type=float, @@ -149,6 +154,7 @@ def compute_mfcc(): config['cepstral_lifter'] = args.cepstral_lifter config['coefficient_count'] = args.coefficient_count config['use_energy'] = args.use_energy + config['dither'] = args.dither mfcc = Mfcc.params(config).instantiate() diff --git a/utils/speech/compute_spectrum_feats.py b/utils/speech/compute_spectrum_feats.py index 800229e1..d873d91f 100755 --- a/utils/speech/compute_spectrum_feats.py +++ b/utils/speech/compute_spectrum_feats.py @@ -72,6 +72,11 @@ def get_parser(): type=bool, default=False, help='Compute power spetrum without frame energy') + parser.add_argument( + '--dither', + type=float, + default=0.0, + help='Dithering constant (0.0 means no dither).') parser.add_argument( '--write_num_frames', type=str, @@ -114,6 +119,7 @@ def compute_spectrum(): config['preeph_coeff'] = args.preeph_coeff config['remove_dc_offset'] = args.remove_dc_offset config['is_fbank'] = args.is_fbank + config['dither'] = args.dither spectrum = Spectrum.params(config).instantiate() diff --git a/utils/speech/make_fbank.sh b/utils/speech/make_fbank.sh index 34808dee..91c43ff2 100755 --- a/utils/speech/make_fbank.sh +++ b/utils/speech/make_fbank.sh @@ -26,8 +26,9 @@ filterbank_channel_count=23 window_length=0.025 frame_length=0.010 output_type=1 -snip_edges=true +snip_edges=1 raw_energy=1 +dither=0.0 preeph_coeff=0.97 window_type='povey' remove_dc_offset=true @@ -123,6 +124,7 @@ if [ -f ${data}/segments ]; then --window_type ${window_type} \ --remove_dc_offset ${remove_dc_offset} \ --is_fbank ${is_fbank} \ + --dither ${dither} \ ${write_num_frames_opt} \ --compress ${compress} \ --compression_method ${compression_method} \ @@ -153,6 +155,7 @@ else --window_type ${window_type} \ --remove_dc_offset 
${remove_dc_offset} \ --is_fbank ${is_fbank} \ + --dither ${dither} \ ${write_num_frames_opt} \ --compress ${compress} \ --compression_method ${compression_method} \ diff --git a/utils/speech/make_fbank_pitch.sh b/utils/speech/make_fbank_pitch.sh index a3522f12..2570ae75 100755 --- a/utils/speech/make_fbank_pitch.sh +++ b/utils/speech/make_fbank_pitch.sh @@ -32,6 +32,7 @@ preeph_coeff=0.97 window_type='povey' remove_dc_offset=true is_fbank=true +dither=0.0 thres_autoc=0.3 write_utt2num_frames=true compress=false @@ -125,6 +126,7 @@ if [ -f ${data}/segments ]; then --window_type ${window_type} \ --remove_dc_offset ${remove_dc_offset} \ --is_fbank ${is_fbank} \ + --dither ${dither} \ ${write_num_frames_opt} \ --compress ${compress} \ --compression_method ${compression_method} \ @@ -156,6 +158,7 @@ else --window_type ${window_type} \ --remove_dc_offset ${remove_dc_offset} \ --is_fbank ${is_fbank} \ + --dither ${dither} \ ${write_num_frames_opt} \ --compress ${compress} \ --compression_method ${compression_method} \ diff --git a/utils/speech/make_mfcc.sh b/utils/speech/make_mfcc.sh index 9f0e69bf..e6813fc2 100755 --- a/utils/speech/make_mfcc.sh +++ b/utils/speech/make_mfcc.sh @@ -38,6 +38,7 @@ write_utt2num_frames=true compress=false compression_method=2 use_energy=true +dither=0.0 if [ -f path.sh ]; then . ./path.sh; fi . 
parse_options.sh || exit 1; @@ -126,6 +127,7 @@ if [ -f ${data}/segments ]; then --window_type ${window_type} \ --remove_dc_offset ${remove_dc_offset} \ --is_fbank ${is_fbank} \ + --dither ${dither} \ --cepstral_lifter ${cepstral_lifter} \ --coefficient_count ${coefficient_count} \ --use_energy ${use_energy} \ @@ -159,6 +161,7 @@ else --window_type ${window_type} \ --remove_dc_offset ${remove_dc_offset} \ --is_fbank ${is_fbank} \ + --dither ${dither} \ --cepstral_lifter ${cepstral_lifter} \ --coefficient_count ${coefficient_count} \ --use_energy ${use_energy} \ diff --git a/utils/speech/make_spectrum.sh b/utils/speech/make_spectrum.sh index dce6fb5c..7d6de82d 100755 --- a/utils/speech/make_spectrum.sh +++ b/utils/speech/make_spectrum.sh @@ -29,6 +29,7 @@ preeph_coeff=0.97 window_type='povey' remove_dc_offset=true is_fbank=false +dither=0.0 output_type=2 write_utt2num_frames=true compress=false @@ -119,6 +120,7 @@ if [ -f ${data}/segments ]; then --window_type ${window_type} \ --remove_dc_offset ${remove_dc_offset} \ --is_fbank ${is_fbank} \ + --dither ${dither} \ ${write_num_frames_opt} \ --compress ${compress} \ --compression_method ${compression_method} \ @@ -147,6 +149,7 @@ else --window_type ${window_type} \ --remove_dc_offset ${remove_dc_offset} \ --is_fbank ${is_fbank} \ + --dither ${dither} \ ${write_num_frames_opt} \ --compress ${compress} \ --compression_method ${compression_method} \ From 715f57591c88c58b117da95054ab2f0d6ec9b6bc Mon Sep 17 00:00:00 2001 From: dengchengyun Date: Fri, 20 Dec 2019 10:55:51 +0800 Subject: [PATCH 2/6] fix snip_edges setting --- core/ops/kernels/framepow.cc | 6 +++--- core/ops/kernels/framepow.h | 4 ++-- core/ops/kernels/framepow_op.cc | 4 ++-- core/ops/kernels/spectrum.cc | 6 +++--- core/ops/kernels/spectrum.h | 4 ++-- core/ops/kernels/spectrum_op.cc | 6 +++--- core/ops/kernels/x_ops.cc | 4 ++-- delta/data/frontend/fbank.py | 8 ++++---- delta/data/frontend/fbank_pitch.py | 8 ++++---- delta/data/frontend/fbank_pitch_test.py | 2 
+- delta/data/frontend/fbank_test.py | 2 +- delta/data/frontend/framepow.py | 2 +- delta/data/frontend/mfcc.py | 8 ++++---- delta/data/frontend/spectrum.py | 8 ++++---- delta/data/frontend/spectrum_test.py | 2 +- utils/speech/compute_fbank_feats.py | 5 +++-- utils/speech/compute_fbank_pitch.py | 5 +++-- utils/speech/compute_mfcc_feats.py | 1 + utils/speech/compute_pitch_feats.py | 1 + utils/speech/compute_spectrum_feats.py | 5 +++-- utils/speech/make_fbank.sh | 2 +- utils/speech/make_fbank_pitch.sh | 2 +- utils/speech/make_spectrum.sh | 2 +- 23 files changed, 51 insertions(+), 46 deletions(-) diff --git a/core/ops/kernels/framepow.cc b/core/ops/kernels/framepow.cc index 4fc13f2f..a682fdc9 100644 --- a/core/ops/kernels/framepow.cc +++ b/core/ops/kernels/framepow.cc @@ -27,7 +27,7 @@ const float frame_length_sec = 0.010; FramePow::FramePow() { window_length_sec_ = window_length_sec; frame_length_sec_ = frame_length_sec; - i_snip_edges = 1; + i_snip_edges = true; i_remove_dc_offset = true; pf_FrmEng = NULL; } @@ -42,7 +42,7 @@ void FramePow::set_frame_length_sec(float frame_length_sec) { frame_length_sec_ = frame_length_sec; } -void FramePow::set_snip_edges(int snip_edges) { i_snip_edges = snip_edges; } +void FramePow::set_snip_edges(bool snip_edges) { i_snip_edges = snip_edges; } void FramePow::set_remove_dc_offset(bool remove_dc_offset) { i_remove_dc_offset = remove_dc_offset; @@ -52,7 +52,7 @@ int FramePow::init_eng(int input_size, float sample_rate) { f_SamRat = sample_rate; i_WinLen = static_cast<int>(window_length_sec_ * f_SamRat); i_FrmLen = static_cast<int>(frame_length_sec_ * f_SamRat); - if (i_snip_edges == 1) + if (i_snip_edges == true) i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1; else i_NumFrm = (input_size + i_FrmLen / 2) / i_FrmLen; diff --git a/core/ops/kernels/framepow.h b/core/ops/kernels/framepow.h index c756da78..47f1c8ad 100644 --- a/core/ops/kernels/framepow.h +++ b/core/ops/kernels/framepow.h @@ -27,7 +27,7 @@ class FramePow { private: float
window_length_sec_; float frame_length_sec_; - int i_snip_edges; + bool i_snip_edges; bool i_remove_dc_offset; float f_SamRat; @@ -46,7 +46,7 @@ class FramePow { void set_frame_length_sec(float frame_length_sec); - void set_snip_edges(int snip_edges); + void set_snip_edges(bool snip_edges); void set_remove_dc_offset(bool remove_dc_offset); diff --git a/core/ops/kernels/framepow_op.cc b/core/ops/kernels/framepow_op.cc index 55897d5a..0d7e3dd3 100644 --- a/core/ops/kernels/framepow_op.cc +++ b/core/ops/kernels/framepow_op.cc @@ -63,7 +63,7 @@ class FramePowOp : public OpKernel { int i_WinLen = static_cast<int>(window_length_ * sample_rate); int i_FrmLen = static_cast<int>(frame_length_ * sample_rate); int i_NumFrm = (L - i_WinLen) / i_FrmLen + 1; - if (snip_edges_ == 2) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen; + if (snip_edges_ == false) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen; if (i_NumFrm < 1) i_NumFrm = 1; OP_REQUIRES_OK(context, context->allocate_output( 0, TensorShape({1, i_NumFrm}), &output_tensor)); @@ -79,7 +79,7 @@ class FramePowOp : public OpKernel { private: float window_length_; float frame_length_; - int snip_edges_; + bool snip_edges_; bool remove_dc_offset_; }; diff --git a/core/ops/kernels/spectrum.cc b/core/ops/kernels/spectrum.cc index eec21b11..1e8e513c 100644 --- a/core/ops/kernels/spectrum.cc +++ b/core/ops/kernels/spectrum.cc @@ -30,7 +30,7 @@ Spectrum::Spectrum() { window_length_sec_ = window_length_sec; frame_length_sec_ = frame_length_sec; i_OutTyp = 1; - i_snip_edges = 1; + i_snip_edges = true; i_raw_energy = 1; f_PreEph = 0.97; i_is_fbank = true; @@ -57,7 +57,7 @@ void Spectrum::set_frame_length_sec(float frame_length_sec) { void Spectrum::set_output_type(int output_type) { i_OutTyp = output_type; } -void Spectrum::set_snip_edges(int snip_edges) { i_snip_edges = snip_edges; } +void Spectrum::set_snip_edges(bool snip_edges) { i_snip_edges = snip_edges; } void Spectrum::set_raw_energy(int raw_energy) {i_raw_energy = raw_energy;} @@ -77,7 +77,7 @@ int
Spectrum::init_spc(int input_size, float sample_rate) { f_SamRat = sample_rate; i_WinLen = static_cast<int>(window_length_sec_ * f_SamRat); i_FrmLen = static_cast<int>(frame_length_sec_ * f_SamRat); - if (i_snip_edges == 1) + if (i_snip_edges == true) i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1; else i_NumFrm = (input_size + i_FrmLen / 2) / i_FrmLen; diff --git a/core/ops/kernels/spectrum.h b/core/ops/kernels/spectrum.h index 517890ce..e9b93e16 100644 --- a/core/ops/kernels/spectrum.h +++ b/core/ops/kernels/spectrum.h @@ -41,7 +41,7 @@ class Spectrum { float f_PreEph; char s_WinTyp[40]; int i_OutTyp; // 1: PSD, 2:log(PSD) - int i_snip_edges; + bool i_snip_edges; int i_raw_energy; bool i_remove_dc_offset; bool i_is_fbank; @@ -66,7 +66,7 @@ class Spectrum { void set_output_type(int output_type); - void set_snip_edges(int snip_edges); + void set_snip_edges(bool snip_edges); void set_raw_energy(int raw_energy); diff --git a/core/ops/kernels/spectrum_op.cc b/core/ops/kernels/spectrum_op.cc index d6afecea..7e88275e 100644 --- a/core/ops/kernels/spectrum_op.cc +++ b/core/ops/kernels/spectrum_op.cc @@ -77,8 +77,8 @@ class SpecOp : public OpKernel { int i_WinLen = static_cast<int>(window_length_ * sample_rate); int i_FrmLen = static_cast<int>(frame_length_ * sample_rate); int i_NumFrm = (L - i_WinLen) / i_FrmLen + 1; - int i_snip_edges = snip_edges_; - if (i_snip_edges == 2) + bool i_snip_edges = snip_edges_; + if (i_snip_edges == false) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen; if (i_NumFrm < 1) i_NumFrm = 1; @@ -99,7 +99,7 @@ class SpecOp : public OpKernel { float window_length_; float frame_length_; int output_type_; - int snip_edges_; + bool snip_edges_; int raw_energy_; float preEph_coeff_; string window_type_; diff --git a/core/ops/kernels/x_ops.cc b/core/ops/kernels/x_ops.cc index 601124cd..a36e5b5b 100644 --- a/core/ops/kernels/x_ops.cc +++ b/core/ops/kernels/x_ops.cc @@ -365,7 +365,7 @@ REGISTER_OP("Pitch") REGISTER_OP("FramePow") .Input("input_data: float") .Input("sample_rate:
float") - .Attr("snip_edges: int = 1") + .Attr("snip_edges: bool = true") .Attr("remove_dc_offset: bool = true") .Attr("window_length: float = 0.025") .Attr("frame_length: float = 0.010") @@ -423,7 +423,7 @@ REGISTER_OP("Spectrum") .Attr("frame_length: float = 0.010") .Attr("window_type: string") .Attr("output_type: int = 2") - .Attr("snip_edges: int = 1") + .Attr("snip_edges: bool = true") .Attr("raw_energy: int = 1") .Attr("preEph_coeff: float = 0.97") .Attr("remove_dc_offset: bool = true") diff --git a/delta/data/frontend/fbank.py b/delta/data/frontend/fbank.py index 3bd55ab7..fb288ab7 100644 --- a/delta/data/frontend/fbank.py +++ b/delta/data/frontend/fbank.py @@ -39,9 +39,9 @@ def params(cls, config=None): :param config: contains thirteen optional parameters: --window_length : Window length in seconds. (float, default = 0.025) --frame_length : Hop length in seconds. (float, default = 0.010) - --snip_edges : If 1, the last frame (shorter than window_length) will be - cutoff. If 2, 1 // 2 frame_length data will be padded - to data. (int, default = 1) + --snip_edges : If true, the last frame (shorter than window_length) will be + cutoff. If false, 1 // 2 frame_length data will be padded + to data. (bool, default = true) ---raw_energy : If 1, compute frame energy before preemphasis and windowing. If 2, compute frame energy after preemphasis and windowing. (int, default = 1) @@ -74,7 +74,7 @@ def params(cls, config=None): frame_length = 0.010 output_type = 1 sample_rate = 16000 - snip_edges = 1 + snip_edges = True raw_energy = 1 preeph_coeff = 0.97 window_type = 'povey' diff --git a/delta/data/frontend/fbank_pitch.py b/delta/data/frontend/fbank_pitch.py index 84e2e240..1af93b84 100644 --- a/delta/data/frontend/fbank_pitch.py +++ b/delta/data/frontend/fbank_pitch.py @@ -41,9 +41,9 @@ def params(cls, config=None): (int, default = 16000) --window_length : Window length in seconds. (float, default = 0.025) --frame_length : Hop length in seconds.
(float, default = 0.010) - --snip_edges : If 1, the last frame (shorter than window_length) will - be cutoff. If 2, 1 // 2 frame_length data will be padded - to data. (int, default = 1) + --snip_edges : If true, the last frame (shorter than window_length) will + be cutoff. If false, 1 // 2 frame_length data will be padded + to data. (bool, default = true) ---raw_energy : If 1, compute frame energy before preemphasis and windowing. If 2, compute frame energy after preemphasis and windowing. (int, default = 1) @@ -126,7 +126,7 @@ def params(cls, config=None): output_type = 1 dither = 0.0 sample_rate = 16000 - snip_edges = 1 + snip_edges = True preemph_coeff = 0.0 min_f0 = 50.0 max_f0 = 400.0 diff --git a/delta/data/frontend/fbank_pitch_test.py b/delta/data/frontend/fbank_pitch_test.py index b778f7e6..f7829990 100644 --- a/delta/data/frontend/fbank_pitch_test.py +++ b/delta/data/frontend/fbank_pitch_test.py @@ -35,7 +35,7 @@ def test_FbankPitch(self): input_data, sample_rate = read_wav(wav_path) config = {'window_length': 0.025, 'output_type': 1, 'frame_length': 0.010} fbank_pitch = FbankPitch.params(config).instantiate() - fbank_pitch_test = fbank_pitch(input_data) + fbank_pitch_test = fbank_pitch(input_data, sample_rate) self.assertEqual(tf.rank(fbank_pitch_test).eval(), 2) print(fbank_pitch_test.eval()[0:2]) diff --git a/delta/data/frontend/fbank_test.py b/delta/data/frontend/fbank_test.py index c26fe6bb..3228ad4f 100644 --- a/delta/data/frontend/fbank_test.py +++ b/delta/data/frontend/fbank_test.py @@ -39,7 +39,7 @@ def test_fbank(self): 'window_length': 0.025, 'output_type': 1, 'frame_length': 0.010, - 'snip_edges': 1 + 'snip_edges': True } fbank = Fbank.params(config).instantiate() fbank_test = fbank(input_data, sample_rate) diff --git a/delta/data/frontend/framepow.py b/delta/data/frontend/framepow.py index 5d54ca95..ca048179 100644 --- a/delta/data/frontend/framepow.py +++ b/delta/data/frontend/framepow.py @@ -48,7 +48,7 @@ def params(cls, config=None): 
window_length = 0.025 frame_length = 0.010 - snip_edges = 1 + snip_edges = True remove_dc_offset = True sample_rate = 16000 diff --git a/delta/data/frontend/mfcc.py b/delta/data/frontend/mfcc.py index 16086b0e..a43a57df 100644 --- a/delta/data/frontend/mfcc.py +++ b/delta/data/frontend/mfcc.py @@ -41,9 +41,9 @@ def params(cls, config=None): :param config: contains fourteen optional parameters. --window_length : Window length in seconds. (float, default = 0.025) --frame_length : Hop length in seconds. (float, default = 0.010) - --snip_edges : If 1, the last frame (shorter than window_length) will - be cutoff. If 2, 1 // 2 frame_length data will be padded - to data. (int, default = 1) + --snip_edges : If True, the last frame (shorter than window_length) will + be cutoff. If False, 1 // 2 frame_length data will be padded + to data. (bool, default = True) ---raw_energy : If 1, compute frame energy before preemphasis and windowing. If 2, compute frame energy after preemphasis and windowing. (int, default = 1) @@ -79,7 +79,7 @@ def params(cls, config=None): frame_length = 0.010 output_type = 1 sample_rate = 16000 - snip_edges = 1 + snip_edges = True raw_energy = 1 preeph_coeff = 0.97 window_type = 'povey' diff --git a/delta/data/frontend/spectrum.py b/delta/data/frontend/spectrum.py index f8a350ec..323f17b7 100644 --- a/delta/data/frontend/spectrum.py +++ b/delta/data/frontend/spectrum.py @@ -38,9 +38,9 @@ def params(cls, config=None): file, if specified there). (float, default = 16000) --window_length : Window length in seconds. (float, default = 0.025) --frame_length : Hop length in seconds. (float, default = 0.010) - --snip_edges : If 1, the last frame (shorter than window_length) - will be cutoff. If 2, 1 // 2 frame_length data will - be padded to data. (int, default = 1) + --snip_edges : If True, the last frame (shorter than window_length) + will be cutoff. If False, 1 // 2 frame_length data will + be padded to data. 
(bool, default = True) ---raw_energy : If 1, compute frame energy before preemphasis and windowing. If 2, compute frame energy after preemphasis and windowing. (int, default = 1) @@ -64,7 +64,7 @@ def params(cls, config=None): frame_length = 0.010 output_type = 2 sample_rate = 16000 - snip_edges = 1 + snip_edges = True raw_energy = 1 preeph_coeff = 0.97 window_type = 'povey' diff --git a/delta/data/frontend/spectrum_test.py b/delta/data/frontend/spectrum_test.py index dd7bc11f..4574d1a8 100644 --- a/delta/data/frontend/spectrum_test.py +++ b/delta/data/frontend/spectrum_test.py @@ -37,7 +37,7 @@ def test_spectrum(self): spectrum = Spectrum.params({ 'window_length': 0.025, - 'snip_edges': 1, + 'snip_edges': True, 'dither':0.0 }).instantiate() spectrum_test = spectrum(input_data, sample_rate) diff --git a/utils/speech/compute_fbank_feats.py b/utils/speech/compute_fbank_feats.py index b1f715c1..11cbdf07 100755 --- a/utils/speech/compute_fbank_feats.py +++ b/utils/speech/compute_fbank_feats.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""Create Fbank feature files.""" import delta.compat as tf import argparse @@ -64,8 +65,8 @@ def get_parser(): help='Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").') parser.add_argument( '--snip_edges', - type=int, - default=1, + type=bool, + default=True, help='The last frame (shorter than window_length) will not be cutoff.') parser.add_argument( '--dither', diff --git a/utils/speech/compute_fbank_pitch.py b/utils/speech/compute_fbank_pitch.py index 6d73f68d..d7b0d0aa 100755 --- a/utils/speech/compute_fbank_pitch.py +++ b/utils/speech/compute_fbank_pitch.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +"""Create fbank_pitch feature files.""" import delta.compat as tf import argparse @@ -68,8 +69,8 @@ def get_parser(): help='Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").') parser.add_argument( '--snip_edges', - type=int, - default=1, + type=bool, + default=True, help='The last frame (shorter than window_length) will not be cutoff.') parser.add_argument( '--raw_energy', diff --git a/utils/speech/compute_mfcc_feats.py b/utils/speech/compute_mfcc_feats.py index 60f87de0..7d525581 100755 --- a/utils/speech/compute_mfcc_feats.py +++ b/utils/speech/compute_mfcc_feats.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""Create MFCC feature files.""" import delta.compat as tf import argparse diff --git a/utils/speech/compute_pitch_feats.py b/utils/speech/compute_pitch_feats.py index 13b266b4..69e535a0 100755 --- a/utils/speech/compute_pitch_feats.py +++ b/utils/speech/compute_pitch_feats.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""Create Pitch feature files.""" import delta.compat as tf import argparse diff --git a/utils/speech/compute_spectrum_feats.py b/utils/speech/compute_spectrum_feats.py index d873d91f..e6db813b 100755 --- a/utils/speech/compute_spectrum_feats.py +++ b/utils/speech/compute_spectrum_feats.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
# ============================================================================== +"""Create spectrogram feature files.""" import delta.compat as tf import argparse @@ -49,8 +50,8 @@ def get_parser(): help='Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").') parser.add_argument( '--snip_edges', - type=int, - default=1, + type=bool, + default=True, help='The last frame (shorter than window_length) will not be cutoff.') parser.add_argument( '--raw_energy', diff --git a/utils/speech/make_fbank.sh b/utils/speech/make_fbank.sh index 91c43ff2..5b8bcc2e 100755 --- a/utils/speech/make_fbank.sh +++ b/utils/speech/make_fbank.sh @@ -26,7 +26,7 @@ filterbank_channel_count=23 window_length=0.025 frame_length=0.010 output_type=1 -snip_edges=1 +snip_edges=true raw_energy=1 dither=0.0 preeph_coeff=0.97 diff --git a/utils/speech/make_fbank_pitch.sh b/utils/speech/make_fbank_pitch.sh index 2570ae75..756a5c2a 100755 --- a/utils/speech/make_fbank_pitch.sh +++ b/utils/speech/make_fbank_pitch.sh @@ -26,7 +26,7 @@ filterbank_channel_count=40 window_length=0.025 frame_length=0.010 output_type=1 -snip_edges=1 +snip_edges=true raw_energy=1 preeph_coeff=0.97 window_type='povey' diff --git a/utils/speech/make_spectrum.sh b/utils/speech/make_spectrum.sh index 7d6de82d..4150eff0 100755 --- a/utils/speech/make_spectrum.sh +++ b/utils/speech/make_spectrum.sh @@ -23,7 +23,7 @@ sample_rate=16000 window_length=0.025 frame_length=0.010 output_type=2 -snip_edges=1 +snip_edges=true raw_energy=1 preeph_coeff=0.97 window_type='povey' From 4726324a894e3130a19779b67a5e24841e5ddf6d Mon Sep 17 00:00:00 2001 From: dengchengyun Date: Fri, 20 Dec 2019 12:15:03 +0800 Subject: [PATCH 3/6] fix params --- delta/data/frontend/add_noise_end_to_end.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/delta/data/frontend/add_noise_end_to_end.py b/delta/data/frontend/add_noise_end_to_end.py index 26329fb0..93dcf83b 100644 --- a/delta/data/frontend/add_noise_end_to_end.py +++
b/delta/data/frontend/add_noise_end_to_end.py @@ -38,7 +38,7 @@ def __init__(self, config: dict): def params(cls, config=None): """ Set params. - :param config: contains nine optional parameters: + :param config: contains ten optional parameters: --sample_rate : Sample frequency of waveform data. (int, default = 16000) --if_add_rir : If true, add rir to audio data. (bool, default = False) --rir_filelist : FileList path of rir.(string, default = 'rirlist.scp') @@ -48,6 +48,7 @@ def params(cls, config=None): --noise_filelist : FileList path of noise.(string, default = 'noiselist.scp') --if_add_aecres : If true, add aecres to audio data. (bool, default = False) --aecres_filelist : FileList path of aecres.(string, default = 'aecreslist.scp') + --speed : Speed of sample channels wanted. (float, default=1.0) :return: An object of class HParams, which is a set of hyperparameters as name-value pairs. """ @@ -61,9 +62,11 @@ def params(cls, config=None): if_add_aecres = False aecres_filelist = 'aecreslist.scp' audio_channels = 1 + speed = 1.0 hparams = HParams(cls=cls) hparams.add_hparam('sample_rate', sample_rate) + hparams.add_hparam('speed', speed) hparams.add_hparam('if_add_rir', if_add_rir) hparams.add_hparam('if_add_noise', if_add_noise) hparams.add_hparam('rir_filelist', rir_filelist) From 6b65c22fe84b20ce1a21d5077dc93cf8d7e3045d Mon Sep 17 00:00:00 2001 From: dengchengyun Date: Fri, 20 Dec 2019 12:25:39 +0800 Subject: [PATCH 4/6] fix test of write_wav --- delta/data/frontend/write_wav_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delta/data/frontend/write_wav_test.py b/delta/data/frontend/write_wav_test.py index 470389f4..de92b37b 100644 --- a/delta/data/frontend/write_wav_test.py +++ b/delta/data/frontend/write_wav_test.py @@ -28,7 +28,7 @@ def test_write_wav(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) with self.cached_session(use_gpu=False, force_gpu=False) as sess: - read_wav = ReadWav.params({'speed': 
1.1}).instantiate() + read_wav = ReadWav.params({'speed': 1.0}).instantiate() input_data, sample_rate = read_wav(wav_path) input_data = input_data / 32768 write_wav = WriteWav.params().instantiate() From db961df9094237713cf2d939419228979878740a Mon Sep 17 00:00:00 2001 From: dengchengyun Date: Fri, 20 Dec 2019 12:43:16 +0800 Subject: [PATCH 5/6] Update write_wav_test.py --- delta/data/frontend/write_wav_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delta/data/frontend/write_wav_test.py b/delta/data/frontend/write_wav_test.py index de92b37b..cf18ccba 100644 --- a/delta/data/frontend/write_wav_test.py +++ b/delta/data/frontend/write_wav_test.py @@ -30,7 +30,7 @@ def test_write_wav(self): with self.cached_session(use_gpu=False, force_gpu=False) as sess: read_wav = ReadWav.params({'speed': 1.0}).instantiate() input_data, sample_rate = read_wav(wav_path) - input_data = input_data / 32768 + input_data = input_data write_wav = WriteWav.params().instantiate() new_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln_speed.wav')) writewav_op = write_wav(new_path, input_data, sample_rate) From 8e4bfa52f35fe7a55857f2ab259abb0733bb15f5 Mon Sep 17 00:00:00 2001 From: dengchengyun Date: Fri, 20 Dec 2019 13:41:40 +0800 Subject: [PATCH 6/6] do format.sh --- delta/data/frontend/add_noise_end_to_end.py | 1 + .../frontend/add_noise_end_to_end_test.py | 1 + delta/data/frontend/add_rir_noise_aecres.py | 1 + .../frontend/add_rir_noise_aecres_test.py | 1 + delta/data/frontend/analyfiltbank.py | 1 + delta/data/frontend/analyfiltbank_test.py | 1 + delta/data/frontend/cepstrum.py | 1 + delta/data/frontend/cepstrum_test.py | 1 + delta/data/frontend/cmvn.py | 1 + delta/data/frontend/delta_delta.py | 1 + delta/data/frontend/delta_delta_test.py | 1 + delta/data/frontend/fbank.py | 1 + delta/data/frontend/fbank_pitch.py | 1 + delta/data/frontend/fbank_pitch_test.py | 1 + delta/data/frontend/fbank_test.py | 1 + delta/data/frontend/framepow.py | 14 +++++++------ 
delta/data/frontend/framepow_test.py | 5 +++-- delta/data/frontend/mfcc.py | 14 +++++++------ delta/data/frontend/mfcc_test.py | 1 + delta/data/frontend/pitch.py | 1 + delta/data/frontend/pitch_test.py | 1 + delta/data/frontend/plp.py | 1 + delta/data/frontend/plp_test.py | 1 + delta/data/frontend/read_wav.py | 20 ++++++++++++------- delta/data/frontend/read_wav_test.py | 2 ++ delta/data/frontend/spectrum.py | 1 + delta/data/frontend/spectrum_test.py | 3 ++- delta/data/frontend/synthfiltbank.py | 1 - delta/data/frontend/synthfiltbank_test.py | 1 + delta/data/frontend/zcr_test.py | 1 + utils/speech/compute_fbank_feats.py | 8 ++++---- utils/speech/compute_fbank_pitch.py | 8 ++++---- utils/speech/compute_mfcc_feats.py | 8 ++++---- utils/speech/compute_spectrum_feats.py | 8 ++++---- 34 files changed, 75 insertions(+), 39 deletions(-) diff --git a/delta/data/frontend/add_noise_end_to_end.py b/delta/data/frontend/add_noise_end_to_end.py index 93dcf83b..d35a652a 100644 --- a/delta/data/frontend/add_noise_end_to_end.py +++ b/delta/data/frontend/add_noise_end_to_end.py @@ -28,6 +28,7 @@ class AddNoiseEndToEnd(BaseFrontend): Add a random signal-to-noise ratio noise or impulse response to clean speech, and write it to wavfile. """ + def __init__(self, config: dict): super().__init__(config) self.add_noise = Add_rir_noise_aecres(config) diff --git a/delta/data/frontend/add_noise_end_to_end_test.py b/delta/data/frontend/add_noise_end_to_end_test.py index 7152a822..806fe1fe 100644 --- a/delta/data/frontend/add_noise_end_to_end_test.py +++ b/delta/data/frontend/add_noise_end_to_end_test.py @@ -37,6 +37,7 @@ class AddNoiseEndToEndTest(tf.test.TestCase): """ AddNoiseEndToEnd OP test. 
""" + def test_add_noise_end_to_end(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/add_rir_noise_aecres.py b/delta/data/frontend/add_rir_noise_aecres.py index 35426409..9516c9b1 100644 --- a/delta/data/frontend/add_rir_noise_aecres.py +++ b/delta/data/frontend/add_rir_noise_aecres.py @@ -25,6 +25,7 @@ class Add_rir_noise_aecres(BaseFrontend): """ Add a random signal-to-noise ratio noise or impulse response to clean speech. """ + def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/add_rir_noise_aecres_test.py b/delta/data/frontend/add_rir_noise_aecres_test.py index 05939dbb..f17f8f62 100644 --- a/delta/data/frontend/add_rir_noise_aecres_test.py +++ b/delta/data/frontend/add_rir_noise_aecres_test.py @@ -39,6 +39,7 @@ class AddRirNoiseAecresTest(tf.test.TestCase): """ AddNoiseRIR OP test. """ + def test_add_rir_noise_aecres(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/analyfiltbank.py b/delta/data/frontend/analyfiltbank.py index 7ce91df2..39707695 100644 --- a/delta/data/frontend/analyfiltbank.py +++ b/delta/data/frontend/analyfiltbank.py @@ -26,6 +26,7 @@ class Analyfiltbank(BaseFrontend): Compute power-spectrum && phase-spectrum features of every frame in speech, return two float tensors with size (num_frames, num_frequencies). """ + def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/analyfiltbank_test.py b/delta/data/frontend/analyfiltbank_test.py index d8007a8b..e7f3e783 100644 --- a/delta/data/frontend/analyfiltbank_test.py +++ b/delta/data/frontend/analyfiltbank_test.py @@ -28,6 +28,7 @@ class Test(tf.test.TestCase): """ Analyfiltbank extraction test. 
""" + def test_analyfiltbank(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/cepstrum.py b/delta/data/frontend/cepstrum.py index 1cf9e011..5f352a19 100644 --- a/delta/data/frontend/cepstrum.py +++ b/delta/data/frontend/cepstrum.py @@ -27,6 +27,7 @@ class Cepstrum(BaseFrontend): Compute Cepstrum features of every frame in speech, return a float tensor with size (num_frames, ceps_subband_num). """ + def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/cepstrum_test.py b/delta/data/frontend/cepstrum_test.py index fcbb4b4d..401a6648 100644 --- a/delta/data/frontend/cepstrum_test.py +++ b/delta/data/frontend/cepstrum_test.py @@ -28,6 +28,7 @@ class CepstrumTest(tf.test.TestCase): """ Cepstrum extraction test. """ + def test_cepstrum(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/cmvn.py b/delta/data/frontend/cmvn.py index 7717f9d0..4634a837 100644 --- a/delta/data/frontend/cmvn.py +++ b/delta/data/frontend/cmvn.py @@ -26,6 +26,7 @@ class CMVN(BaseFrontend): """ Compute and apply CMVN to features. """ + def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/delta_delta.py b/delta/data/frontend/delta_delta.py index 430967e8..ef3edd6e 100644 --- a/delta/data/frontend/delta_delta.py +++ b/delta/data/frontend/delta_delta.py @@ -26,6 +26,7 @@ class DeltaDelta(BaseFrontend): """ Do Delta_delta to features. """ + def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/delta_delta_test.py b/delta/data/frontend/delta_delta_test.py index d8f0cc74..26e5a760 100644 --- a/delta/data/frontend/delta_delta_test.py +++ b/delta/data/frontend/delta_delta_test.py @@ -26,6 +26,7 @@ class Delta_delta_Test(tf.test.TestCase): """ Delta_delta extraction test. 
""" + def test_delta_delta(self): self.feat_dim = 80 diff --git a/delta/data/frontend/fbank.py b/delta/data/frontend/fbank.py index fb288ab7..7c1e9436 100644 --- a/delta/data/frontend/fbank.py +++ b/delta/data/frontend/fbank.py @@ -28,6 +28,7 @@ class Fbank(BaseFrontend): spectrum to extract frequency bands. Return a float tensor with shape (num_channels, num_frames, num_frequencies). """ + def __init__(self, config: dict): super().__init__(config) self.spect = Spectrum(config) diff --git a/delta/data/frontend/fbank_pitch.py b/delta/data/frontend/fbank_pitch.py index 1af93b84..f53b135f 100644 --- a/delta/data/frontend/fbank_pitch.py +++ b/delta/data/frontend/fbank_pitch.py @@ -27,6 +27,7 @@ class FbankPitch(BaseFrontend): Compute Fbank && Pitch features respectively,and concate them. Return a tensor with shape (num_frames, dim_features). """ + def __init__(self, config: dict): super().__init__(config) self.fbank = Fbank(config) diff --git a/delta/data/frontend/fbank_pitch_test.py b/delta/data/frontend/fbank_pitch_test.py index f7829990..8cfe57b8 100644 --- a/delta/data/frontend/fbank_pitch_test.py +++ b/delta/data/frontend/fbank_pitch_test.py @@ -27,6 +27,7 @@ class FbankPitchTest(tf.test.TestCase): """ Compare Fbank&&Pitch FE with kaldi. """ + def test_FbankPitch(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/fbank_test.py b/delta/data/frontend/fbank_test.py index 3228ad4f..b60a2dd7 100644 --- a/delta/data/frontend/fbank_test.py +++ b/delta/data/frontend/fbank_test.py @@ -29,6 +29,7 @@ class FbankTest(tf.test.TestCase): """ Test Fbank FE using 8k/16k wav files. 
""" + def test_fbank(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/framepow.py b/delta/data/frontend/framepow.py index ca048179..16480ffd 100644 --- a/delta/data/frontend/framepow.py +++ b/delta/data/frontend/framepow.py @@ -27,6 +27,7 @@ class Framepow(BaseFrontend): Compute power of every frame in speech. Return a float tensor with shape (1 * num_frames). """ + def __init__(self, config: dict): super().__init__(config) @@ -86,11 +87,12 @@ def call(self, audio_data, sample_rate=None): with tf.control_dependencies([assert_op]): sample_rate = tf.cast(sample_rate, dtype=float) - framepow = py_x_ops.frame_pow(audio_data, - sample_rate, - snip_edges=p.snip_edges, - remove_dc_offset=p.remove_dc_offset, - window_length=p.window_length, - frame_length=p.frame_length) + framepow = py_x_ops.frame_pow( + audio_data, + sample_rate, + snip_edges=p.snip_edges, + remove_dc_offset=p.remove_dc_offset, + window_length=p.window_length, + frame_length=p.frame_length) return tf.squeeze(framepow) diff --git a/delta/data/frontend/framepow_test.py b/delta/data/frontend/framepow_test.py index 4a8a879f..db10b4a3 100644 --- a/delta/data/frontend/framepow_test.py +++ b/delta/data/frontend/framepow_test.py @@ -29,6 +29,7 @@ class FramepowTest(tf.test.TestCase): """ Framepow extraction test. 
""" + def test_framepow(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) @@ -43,10 +44,10 @@ def test_framepow(self): framepow_test = framepow(input_data, sample_rate) real_framepow_feats = np.array( - [9.819611, 9.328745, 9.247337, 9.26451, 9.266059]) + [9.819611, 9.328745, 9.247337, 9.26451, 9.266059]) self.assertEqual(tf.rank(framepow_test).eval(), 1) - self.assertAllClose(framepow_test.eval()[0 : 5], real_framepow_feats) + self.assertAllClose(framepow_test.eval()[0:5], real_framepow_feats) if __name__ == '__main__': diff --git a/delta/data/frontend/mfcc.py b/delta/data/frontend/mfcc.py index a43a57df..92f04053 100644 --- a/delta/data/frontend/mfcc.py +++ b/delta/data/frontend/mfcc.py @@ -29,6 +29,7 @@ class Mfcc(BaseFrontend): Compute mfcc features of every frame in speech, return a float tensor with size (num_channels, num_frames, num_frequencies). """ + def __init__(self, config: dict): super().__init__(config) self.framepow = Framepow(config) @@ -140,10 +141,11 @@ def call(self, audio_data, sample_rate=None): nfbank = shape[1] fbank_feats = tf.reshape(fbank_feats, (1, nframe, nfbank)) framepow_feats = self.framepow(audio_data, sample_rate) - mfcc = py_x_ops.mfcc(fbank_feats, - framepow_feats, - sample_rate, - use_energy=p.use_energy, - cepstral_lifter=p.cepstral_lifter, - coefficient_count=p.coefficient_count) + mfcc = py_x_ops.mfcc( + fbank_feats, + framepow_feats, + sample_rate, + use_energy=p.use_energy, + cepstral_lifter=p.cepstral_lifter, + coefficient_count=p.coefficient_count) return mfcc diff --git a/delta/data/frontend/mfcc_test.py b/delta/data/frontend/mfcc_test.py index b29d2033..5e401566 100644 --- a/delta/data/frontend/mfcc_test.py +++ b/delta/data/frontend/mfcc_test.py @@ -28,6 +28,7 @@ class MfccTest(tf.test.TestCase): """ MFCC extraction test. 
""" + def test_mfcc(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/pitch.py b/delta/data/frontend/pitch.py index 44097d6b..1a5668e7 100644 --- a/delta/data/frontend/pitch.py +++ b/delta/data/frontend/pitch.py @@ -26,6 +26,7 @@ class Pitch(BaseFrontend): Compute pitch features of every frame in speech, return a float tensor with size (num_frames, 2). """ + def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/pitch_test.py b/delta/data/frontend/pitch_test.py index 7564522c..97c7b04e 100644 --- a/delta/data/frontend/pitch_test.py +++ b/delta/data/frontend/pitch_test.py @@ -28,6 +28,7 @@ class SpectrumTest(tf.test.TestCase): """ Pitch extraction test. """ + def test_spectrum(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/plp.py b/delta/data/frontend/plp.py index e07bc376..81c7485c 100644 --- a/delta/data/frontend/plp.py +++ b/delta/data/frontend/plp.py @@ -27,6 +27,7 @@ class Plp(BaseFrontend): Compute PLP features of every frame in speech, return a float tensor with size (num_frames, plp_order + 1). """ + def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/plp_test.py b/delta/data/frontend/plp_test.py index 6dc54269..2549cf4c 100644 --- a/delta/data/frontend/plp_test.py +++ b/delta/data/frontend/plp_test.py @@ -28,6 +28,7 @@ class PlpTest(tf.test.TestCase): """ Plp extraction test. """ + def test_plp(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/read_wav.py b/delta/data/frontend/read_wav.py index 17ff3b3f..0809d9d7 100644 --- a/delta/data/frontend/read_wav.py +++ b/delta/data/frontend/read_wav.py @@ -20,10 +20,12 @@ from delta.data.frontend.base_frontend import BaseFrontend from core.ops import py_x_ops + class ReadWav(BaseFrontend): """ Read audio sample from wav file, return sample data and sample rate. 
""" + def __init__(self, config: dict): super().__init__(config) @@ -71,12 +73,16 @@ def call(self, wavfile): with tf.control_dependencies([assert_op]): if p.speed == 1.0: - return tf.squeeze(audio_data * 32768, axis=-1), tf.cast(sample_rate, dtype=tf.int32) + return tf.squeeze( + audio_data * 32768, axis=-1), tf.cast( + sample_rate, dtype=tf.int32) else: - resample_rate = tf.cast(sample_rate, dtype=tf.float32) * tf.cast( - 1.0 / p.speed, dtype=tf.float32) - speed_data = py_x_ops.speed(tf.squeeze(audio_data * 32768, axis=-1), - tf.cast(sample_rate, dtype=tf.int32), - tf.cast(resample_rate, dtype=tf.int32), - lowpass_filter_width=5) + resample_rate = tf.cast( + sample_rate, dtype=tf.float32) * tf.cast( + 1.0 / p.speed, dtype=tf.float32) + speed_data = py_x_ops.speed( + tf.squeeze(audio_data * 32768, axis=-1), + tf.cast(sample_rate, dtype=tf.int32), + tf.cast(resample_rate, dtype=tf.int32), + lowpass_filter_width=5) return tf.squeeze(speed_data), tf.cast(sample_rate, dtype=tf.int32) diff --git a/delta/data/frontend/read_wav_test.py b/delta/data/frontend/read_wav_test.py index 2c3f099a..5f2a9d20 100644 --- a/delta/data/frontend/read_wav_test.py +++ b/delta/data/frontend/read_wav_test.py @@ -26,6 +26,7 @@ class ReadWavTest(tf.test.TestCase): """ ReadWav OP test. """ + def test_read_wav(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) @@ -38,5 +39,6 @@ def test_read_wav(self): self.assertAllClose(audio_data.eval() / 32768, audio_data_true) self.assertAllClose(sample_rate.eval(), sample_rate_true) + if __name__ == '__main__': tf.test.main() diff --git a/delta/data/frontend/spectrum.py b/delta/data/frontend/spectrum.py index 323f17b7..e006d233 100644 --- a/delta/data/frontend/spectrum.py +++ b/delta/data/frontend/spectrum.py @@ -26,6 +26,7 @@ class Spectrum(BaseFrontend): Compute spectrum features of every frame in speech, return a float tensor with size (num_frames, num_frequencies). 
""" + def __init__(self, config: dict): super().__init__(config) diff --git a/delta/data/frontend/spectrum_test.py b/delta/data/frontend/spectrum_test.py index 4574d1a8..8488487e 100644 --- a/delta/data/frontend/spectrum_test.py +++ b/delta/data/frontend/spectrum_test.py @@ -28,6 +28,7 @@ class SpectrumTest(tf.test.TestCase): ''' Spectum extraction test. ''' + def test_spectrum(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) @@ -38,7 +39,7 @@ def test_spectrum(self): spectrum = Spectrum.params({ 'window_length': 0.025, 'snip_edges': True, - 'dither':0.0 + 'dither': 0.0 }).instantiate() spectrum_test = spectrum(input_data, sample_rate) diff --git a/delta/data/frontend/synthfiltbank.py b/delta/data/frontend/synthfiltbank.py index 4ab34ba1..9ca370c3 100644 --- a/delta/data/frontend/synthfiltbank.py +++ b/delta/data/frontend/synthfiltbank.py @@ -14,7 +14,6 @@ # limitations under the License. # ============================================================================== - import delta.compat as tf from core.ops import py_x_ops from delta.utils.hparam import HParams diff --git a/delta/data/frontend/synthfiltbank_test.py b/delta/data/frontend/synthfiltbank_test.py index 2208d494..35f857ce 100644 --- a/delta/data/frontend/synthfiltbank_test.py +++ b/delta/data/frontend/synthfiltbank_test.py @@ -29,6 +29,7 @@ class Test(tf.test.TestCase): """ Synthfiltbank extraction test. """ + def test_synthfiltbank(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/delta/data/frontend/zcr_test.py b/delta/data/frontend/zcr_test.py index f670ceee..c66a44a1 100644 --- a/delta/data/frontend/zcr_test.py +++ b/delta/data/frontend/zcr_test.py @@ -29,6 +29,7 @@ class ZcrTest(tf.test.TestCase): """ Test Fbank FE using 8k/16k wav files. 
""" + def test_zcr(self): wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')) diff --git a/utils/speech/compute_fbank_feats.py b/utils/speech/compute_fbank_feats.py index 11cbdf07..9d05c076 100755 --- a/utils/speech/compute_fbank_feats.py +++ b/utils/speech/compute_fbank_feats.py @@ -69,10 +69,10 @@ def get_parser(): default=True, help='The last frame (shorter than window_length) will not be cutoff.') parser.add_argument( - '--dither', - type=float, - default=0.0, - help='Dithering constant (0.0 means no dither).') + '--dither', + type=float, + default=0.0, + help='Dithering constant (0.0 means no dither).') parser.add_argument( '--raw_energy', type=int, diff --git a/utils/speech/compute_fbank_pitch.py b/utils/speech/compute_fbank_pitch.py index d7b0d0aa..95684557 100755 --- a/utils/speech/compute_fbank_pitch.py +++ b/utils/speech/compute_fbank_pitch.py @@ -49,10 +49,10 @@ def get_parser(): default=40, help='Order of fbank') parser.add_argument( - '--dither', - type=float, - default=0.0, - help='Dithering constant (0.0 means no dither).') + '--dither', + type=float, + default=0.0, + help='Dithering constant (0.0 means no dither).') parser.add_argument( '--window_length', type=float, default=0.025, help='Length of a frame') parser.add_argument( diff --git a/utils/speech/compute_mfcc_feats.py b/utils/speech/compute_mfcc_feats.py index 7d525581..1e68ec10 100755 --- a/utils/speech/compute_mfcc_feats.py +++ b/utils/speech/compute_mfcc_feats.py @@ -89,10 +89,10 @@ def get_parser(): default=True, help='Compute power spetrum without frame energy.') parser.add_argument( - '--dither', - type=float, - default=0.0, - help='Dithering constant (0.0 means no dither).') + '--dither', + type=float, + default=0.0, + help='Dithering constant (0.0 means no dither).') parser.add_argument( '--cepstral_lifter', type=float, diff --git a/utils/speech/compute_spectrum_feats.py b/utils/speech/compute_spectrum_feats.py index e6db813b..d5bd2740 100755 --- 
a/utils/speech/compute_spectrum_feats.py +++ b/utils/speech/compute_spectrum_feats.py @@ -74,10 +74,10 @@ def get_parser(): default=False, help='Compute power spetrum without frame energy') parser.add_argument( - '--dither', - type=float, - default=0.0, - help='Dithering constant (0.0 means no dither).') + '--dither', + type=float, + default=0.0, + help='Dithering constant (0.0 means no dither).') parser.add_argument( '--write_num_frames', type=str,