From 1ee62441e975cbb41c1488bcbdf5037e14c86b16 Mon Sep 17 00:00:00 2001
From: dengchengyun <dengchengyun@didiglobal.com>
Date: Thu, 19 Dec 2019 21:41:12 +0800
Subject: [PATCH 1/6] Fix bugs and docs of the frontend (FE)

---
 core/ops/.gitignore                           |   2 +
 core/ops/kernels/analyfiltbank.cc             |   4 +-
 core/ops/kernels/fbank.cc                     |   2 +
 core/ops/kernels/fbank.h                      |   1 +
 core/ops/kernels/fbank_op.cc                  |   7 +
 core/ops/kernels/framepow.cc                  |  39 ++++-
 core/ops/kernels/framepow.h                   |   8 +-
 core/ops/kernels/framepow_op.cc               |  11 +-
 core/ops/kernels/mfcc_dct_op.cc               |  23 ++-
 core/ops/kernels/mfcc_mel_filterbank.cc       |   6 +
 core/ops/kernels/mfcc_mel_filterbank.h        |   1 +
 core/ops/kernels/resample.cc                  |   3 +-
 core/ops/kernels/resample.h                   |   4 +
 core/ops/kernels/spectrum.cc                  | 133 ++++++++++--------
 core/ops/kernels/spectrum.h                   |  17 ++-
 core/ops/kernels/spectrum_op.cc               |  26 ++--
 core/ops/kernels/speed_op.cc                  |  84 +++++++++++
 core/ops/kernels/support_functions.cc         |  35 +++--
 core/ops/kernels/support_functions.h          |   7 +-
 core/ops/kernels/synthfiltbank.cc             |   6 +-
 core/ops/kernels/x_ops.cc                     |  22 ++-
 core/ops/py_x_ops.py                          |   1 +
 delta/data/frontend/add_noise_end_to_end.py   |   6 +-
 .../frontend/add_noise_end_to_end_test.py     |   5 +-
 delta/data/frontend/add_rir_noise_aecres.py   |  11 +-
 .../frontend/add_rir_noise_aecres_test.py     |   5 +-
 delta/data/frontend/analyfiltbank.py          |  30 ++--
 delta/data/frontend/analyfiltbank_test.py     |   5 +-
 delta/data/frontend/cepstrum.py               |  30 ++--
 delta/data/frontend/cepstrum_test.py          |   5 +-
 delta/data/frontend/cmvn.py                   |  25 +++-
 delta/data/frontend/delta_delta.py            |   5 +-
 delta/data/frontend/delta_delta_test.py       |   5 +-
 delta/data/frontend/fbank.py                  |  71 +++++++---
 delta/data/frontend/fbank_pitch.py            |  95 +++++++++++--
 delta/data/frontend/fbank_pitch_test.py       |   5 +-
 delta/data/frontend/fbank_test.py             |   9 +-
 delta/data/frontend/framepow.py               |  49 +++++--
 delta/data/frontend/framepow_test.py          |  20 +--
 delta/data/frontend/mfcc.py                   |  90 +++++++-----
 delta/data/frontend/mfcc_test.py              |   5 +-
 delta/data/frontend/pitch.py                  |  91 ++++++++----
 delta/data/frontend/pitch_test.py             |   5 +-
 delta/data/frontend/plp.py                    |  24 +++-
 delta/data/frontend/plp_test.py               |   5 +-
 delta/data/frontend/read_wav.py               |  44 ++++--
 delta/data/frontend/read_wav_test.py          |  14 +-
 delta/data/frontend/spectrum.py               |  54 ++++---
 delta/data/frontend/spectrum_test.py          |   8 +-
 delta/data/frontend/synthfiltbank.py          |   2 +-
 delta/data/frontend/synthfiltbank_test.py     |   5 +-
 delta/data/frontend/write_wav.py              |   2 +-
 delta/data/frontend/write_wav_test.py         |   5 +-
 delta/data/frontend/zcr.py                    |  24 +++-
 delta/data/frontend/zcr_test.py               |   5 +-
 utils/speech/compute_fbank_feats.py           |  10 +-
 utils/speech/compute_fbank_pitch.py           |   6 +
 utils/speech/compute_mfcc_feats.py            |   6 +
 utils/speech/compute_spectrum_feats.py        |   6 +
 utils/speech/make_fbank.sh                    |   5 +-
 utils/speech/make_fbank_pitch.sh              |   3 +
 utils/speech/make_mfcc.sh                     |   3 +
 utils/speech/make_spectrum.sh                 |   3 +
 63 files changed, 906 insertions(+), 342 deletions(-)
 create mode 100644 core/ops/kernels/speed_op.cc

diff --git a/core/ops/.gitignore b/core/ops/.gitignore
index e63aa584..3f028b5b 100644
--- a/core/ops/.gitignore
+++ b/core/ops/.gitignore
@@ -1,4 +1,6 @@
 gen/
+cppjieba
+*.so
 !data/sm1_cln.wav
 *.scp
 !noiselist.scp
diff --git a/core/ops/kernels/analyfiltbank.cc b/core/ops/kernels/analyfiltbank.cc
index 9e63e81b..6c57b47b 100644
--- a/core/ops/kernels/analyfiltbank.cc
+++ b/core/ops/kernels/analyfiltbank.cc
@@ -79,6 +79,7 @@ int Analyfiltbank::proc_afb(const float* mic_buf) {
   xcomplex* win = static_cast<xcomplex*>(malloc(sizeof(xcomplex) * i_FFTSiz));
   xcomplex* fftwin =
       static_cast<xcomplex*>(malloc(sizeof(xcomplex) * i_FFTSiz));
+  float* fft_buf = static_cast<float*>(malloc(sizeof(float) * 2 * i_FFTSiz));
 
   /* generate window */
   gen_window(pf_WINDOW, i_WinLen, s_WinTyp);
@@ -96,7 +97,7 @@ int Analyfiltbank::proc_afb(const float* mic_buf) {
     }
 
     /* fft */
-    dit_r2_fft(win, fftwin, i_FFTSiz, -1);
+    dit_r2_fft(win, fftwin, fft_buf, i_FFTSiz, -1);
 
     for (k = 0; k < i_NumFrq; k++) {
       pf_PowSpc[n * i_NumFrq + k] = complex_abs2(fftwin[k]);
@@ -106,6 +107,7 @@ int Analyfiltbank::proc_afb(const float* mic_buf) {
 
   free(win);
   free(fftwin);
+  free(fft_buf);
 
   return 1;
 }
diff --git a/core/ops/kernels/fbank.cc b/core/ops/kernels/fbank.cc
index 4a8a7702..0427443f 100644
--- a/core/ops/kernels/fbank.cc
+++ b/core/ops/kernels/fbank.cc
@@ -33,6 +33,8 @@ Fbank::Fbank()
       upper_frequency_limit_(kDefaultUpperFrequencyLimit),
       filterbank_channel_count_(kDefaultFilterbankChannelCount) {}
 
+Fbank::~Fbank() {}
+
 bool Fbank::Initialize(int input_length, double input_sample_rate) {
   if (input_length < 1) {
     LOG(ERROR) << "Input length must be positive.";
diff --git a/core/ops/kernels/fbank.h b/core/ops/kernels/fbank.h
index 7bc4356d..d286c1af 100644
--- a/core/ops/kernels/fbank.h
+++ b/core/ops/kernels/fbank.h
@@ -32,6 +32,7 @@ namespace delta {
 class Fbank {
  public:
   Fbank();
+  ~Fbank();
   bool Initialize(int input_length, double input_sample_rate);
   // Input is a single squared-magnitude spectrogram frame. The input spectrum
   // is converted to linear magnitude and weighted into bands using a
diff --git a/core/ops/kernels/fbank_op.cc b/core/ops/kernels/fbank_op.cc
index 697dfa7e..a766e22b 100644
--- a/core/ops/kernels/fbank_op.cc
+++ b/core/ops/kernels/fbank_op.cc
@@ -49,6 +49,11 @@ class FbankOp : public OpKernel {
                     sample_rate_tensor.shape().DebugString(), " instead."));
     const int32 sample_rate = sample_rate_tensor.scalar<int32>()();
 
+    if (upper_frequency_limit_ <= 0)
+        upper_frequency_limit_ = sample_rate / 2.0 + upper_frequency_limit_;
+    else if (upper_frequency_limit_ > sample_rate / 2.0 || upper_frequency_limit_ <= lower_frequency_limit_)
+        upper_frequency_limit_ = sample_rate / 2.0;
+
     // shape [channels, time, bins]
     const int spectrogram_channels = spectrogram.dim_size(2);
     const int spectrogram_samples = spectrogram.dim_size(1);
@@ -94,6 +99,8 @@ class FbankOp : public OpKernel {
         for (int i = 0; i < filterbank_channel_count_; ++i) {
           output_data[i] = fbank_output[i];
         }
+        std::vector<double>().swap(fbank_input);
+        std::vector<double>().swap(fbank_output);
       }
     }
   }
diff --git a/core/ops/kernels/framepow.cc b/core/ops/kernels/framepow.cc
index 21b66240..4fc13f2f 100644
--- a/core/ops/kernels/framepow.cc
+++ b/core/ops/kernels/framepow.cc
@@ -27,6 +27,8 @@ const float frame_length_sec = 0.010;
 FramePow::FramePow() {
   window_length_sec_ = window_length_sec;
   frame_length_sec_ = frame_length_sec;
+  i_snip_edges = 1;
+  i_remove_dc_offset = true;
   pf_FrmEng = NULL;
 }
 
@@ -40,27 +42,54 @@ void FramePow::set_frame_length_sec(float frame_length_sec) {
   frame_length_sec_ = frame_length_sec;
 }
 
+void FramePow::set_snip_edges(int snip_edges) { i_snip_edges = snip_edges; }
+
+void FramePow::set_remove_dc_offset(bool remove_dc_offset) {
+  i_remove_dc_offset = remove_dc_offset;
+ }
+
 int FramePow::init_eng(int input_size, float sample_rate) {
   f_SamRat = sample_rate;
   i_WinLen = static_cast<int>(window_length_sec_ * f_SamRat);
   i_FrmLen = static_cast<int>(frame_length_sec_ * f_SamRat);
-  i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1;
+  if (i_snip_edges == 1)
+    i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1;
+  else
+    i_NumFrm = (input_size + i_FrmLen / 2) / i_FrmLen;
 
   pf_FrmEng = static_cast<float*>(malloc(sizeof(float) * i_NumFrm));
 
   return 1;
 }
 
-int FramePow::proc_eng(const float* mic_buf) {
-  int n, k;
+int FramePow::proc_eng(const float* mic_buf, int input_size) {
+  int i, n, k;
   float* win = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
 
   for (n = 0; n < i_NumFrm; n++) {
     pf_FrmEng[n] = 0.0;
+    float sum = 0.0;
+    float energy = 0.0;
     for (k = 0; k < i_WinLen; k++) {
-      win[k] = mic_buf[n * i_FrmLen + k];
-      pf_FrmEng[n] = pf_FrmEng[n] + win[k] * win[k];
+      int index = n * i_FrmLen + k;
+      if (index < input_size)
+        win[k] = mic_buf[index];
+      else
+        win[k] = 0.0f;
+      sum += win[k];
+    }
+
+    if (i_remove_dc_offset == true) {
+      float mean = sum / i_WinLen;
+      for (int l = 0; l < i_WinLen; l++) win[l] -= mean;
     }
+
+    for (i = 0; i < i_WinLen; i++) {
+        energy += win[i] * win[i];
+    }
+
+    pf_FrmEng[n] = log(energy);
+
   }
 
   free(win);
diff --git a/core/ops/kernels/framepow.h b/core/ops/kernels/framepow.h
index 3019deda..c756da78 100644
--- a/core/ops/kernels/framepow.h
+++ b/core/ops/kernels/framepow.h
@@ -27,6 +27,8 @@ class FramePow {
  private:
   float window_length_sec_;
   float frame_length_sec_;
+  int i_snip_edges;
+  bool i_remove_dc_offset;
 
   float f_SamRat;
   int i_WinLen;
@@ -44,9 +46,13 @@ class FramePow {
 
   void set_frame_length_sec(float frame_length_sec);
 
+  void set_snip_edges(int snip_edges);
+
+  void set_remove_dc_offset(bool remove_dc_offset);
+
   int init_eng(int input_size, float sample_rate);
 
-  int proc_eng(const float* mic_buf);
+  int proc_eng(const float* mic_buf, int input_size);
 
   int get_eng(float* output);
 
diff --git a/core/ops/kernels/framepow_op.cc b/core/ops/kernels/framepow_op.cc
index 6707b6bc..55897d5a 100644
--- a/core/ops/kernels/framepow_op.cc
+++ b/core/ops/kernels/framepow_op.cc
@@ -29,6 +29,9 @@ class FramePowOp : public OpKernel {
   explicit FramePowOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("window_length", &window_length_));
     OP_REQUIRES_OK(context, context->GetAttr("frame_length", &frame_length_));
+    OP_REQUIRES_OK(context, context->GetAttr("snip_edges", &snip_edges_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("remove_dc_offset", &remove_dc_offset_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -49,6 +52,8 @@ class FramePowOp : public OpKernel {
     FramePow cls_eng;
     cls_eng.set_window_length_sec(window_length_);
     cls_eng.set_frame_length_sec(frame_length_);
+    cls_eng.set_snip_edges(snip_edges_);
+    cls_eng.set_remove_dc_offset(remove_dc_offset_);
     OP_REQUIRES(context, cls_eng.init_eng(L, sample_rate),
                 errors::InvalidArgument(
                     "framepow_class initialization failed for length ", L,
@@ -58,6 +63,8 @@ class FramePowOp : public OpKernel {
     int i_WinLen = static_cast<int>(window_length_ * sample_rate);
     int i_FrmLen = static_cast<int>(frame_length_ * sample_rate);
     int i_NumFrm = (L - i_WinLen) / i_FrmLen + 1;
+    if (snip_edges_ == 2) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen;
+    if (i_NumFrm < 1) i_NumFrm = 1;
     OP_REQUIRES_OK(context, context->allocate_output(
                                 0, TensorShape({1, i_NumFrm}), &output_tensor));
 
@@ -65,13 +72,15 @@ class FramePowOp : public OpKernel {
     float* output_flat = output_tensor->flat<float>().data();
 
     int ret;
-    ret = cls_eng.proc_eng(input_flat);
+    ret = cls_eng.proc_eng(input_flat, L);
     ret = cls_eng.get_eng(output_flat);
   }
 
  private:
   float window_length_;
   float frame_length_;
+  int snip_edges_;
+  bool remove_dc_offset_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FramePow").Device(DEVICE_CPU), FramePowOp);
diff --git a/core/ops/kernels/mfcc_dct_op.cc b/core/ops/kernels/mfcc_dct_op.cc
index c9da6b79..8d94b0db 100644
--- a/core/ops/kernels/mfcc_dct_op.cc
+++ b/core/ops/kernels/mfcc_dct_op.cc
@@ -41,10 +41,10 @@ class MfccDctOp : public OpKernel {
     OP_REQUIRES(context, fbank.dims() == 3,
                 errors::InvalidArgument("Fbank must be 3-dimensional",
                                         fbank.shape().DebugString()));
-    const Tensor& spectrum = context->input(1);
-    OP_REQUIRES(context, spectrum.dims() == 3,
-                errors::InvalidArgument("Spectrum must be 3-dimensional",
-                                        spectrum.shape().DebugString()));
+    const Tensor& framepow = context->input(1);
+    OP_REQUIRES(context, framepow.dims() == 1,
+                errors::InvalidArgument("Framepow must be 1-dimensional",
+                                        framepow.shape().DebugString()));
     const Tensor& sample_rate_tensor = context->input(2);
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(sample_rate_tensor.shape()),
                 errors::InvalidArgument(
@@ -56,8 +56,6 @@ class MfccDctOp : public OpKernel {
     const int fbank_channels = fbank.dim_size(2);
     const int fbank_samples = fbank.dim_size(1);
     const int audio_channels = fbank.dim_size(0);
-    const int spectrum_samples = spectrum.dim_size(1);
-    const int spectrum_channels = spectrum.dim_size(2);
 
     MfccDct mfcc;
     mfcc.set_coefficient_count(coefficient_count_);
@@ -77,7 +75,7 @@ class MfccDctOp : public OpKernel {
             &output_tensor));
 
     const float* fbank_flat = fbank.flat<float>().data();
-    const float* spectrum_flat = spectrum.flat<float>().data();
+    const float* framepow_flat = framepow.flat<float>().data();
     float* output_flat = output_tensor->flat<float>().data();
 
     for (int audio_channel = 0; audio_channel < audio_channels;
@@ -86,13 +84,10 @@ class MfccDctOp : public OpKernel {
         const float* sample_data =
             fbank_flat + (audio_channel * fbank_samples * fbank_channels) +
             (fbank_sample * fbank_channels);
-        const float* spectrum_data =
-            spectrum_flat + (audio_channel * fbank_samples * spectrum_channels) +
-            (fbank_sample * spectrum_channels);
+        const float* framepow_data = framepow_flat + fbank_sample;
         std::vector<double> mfcc_input(sample_data,
                                        sample_data + fbank_channels);
-        std::vector<double> spectrum_input(spectrum_data,
-                                           spectrum_data + spectrum_channels);
+        std::vector<double> framepow_input(framepow_data, framepow_data + 1);
         std::vector<double> mfcc_output;
         mfcc.Compute(mfcc_input, &mfcc_output);
         DCHECK_EQ(coefficient_count_, mfcc_output.size());
@@ -103,10 +98,10 @@ class MfccDctOp : public OpKernel {
           output_data[i] = mfcc_output[i];
         }
         if (use_energy_)
-            output_data[0] = spectrum_input[0];
+            output_data[0] = framepow_input[0];
 
         std::vector<double>().swap(mfcc_input);
-        std::vector<double>().swap(spectrum_input);
+        std::vector<double>().swap(framepow_input);
         std::vector<double>().swap(mfcc_output);
       }
     }
diff --git a/core/ops/kernels/mfcc_mel_filterbank.cc b/core/ops/kernels/mfcc_mel_filterbank.cc
index 76f848e9..097faa6c 100644
--- a/core/ops/kernels/mfcc_mel_filterbank.cc
+++ b/core/ops/kernels/mfcc_mel_filterbank.cc
@@ -38,6 +38,12 @@ namespace tensorflow {
 
 MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
 
+MfccMelFilterbank::~MfccMelFilterbank() {
+    std::vector<double>().swap(center_frequencies_);
+    std::vector<double>().swap(weights_);
+    std::vector<int>().swap(band_mapper_);
+}
+
 bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate,
                                    int output_channel_count,
                                    double lower_frequency_limit,
diff --git a/core/ops/kernels/mfcc_mel_filterbank.h b/core/ops/kernels/mfcc_mel_filterbank.h
index e9bcc6a1..2a745b2d 100644
--- a/core/ops/kernels/mfcc_mel_filterbank.h
+++ b/core/ops/kernels/mfcc_mel_filterbank.h
@@ -27,6 +27,7 @@ namespace tensorflow {
 class MfccMelFilterbank {
  public:
   MfccMelFilterbank();
+  ~MfccMelFilterbank();
   bool Initialize(int input_length,  // Number of unique FFT bins fftsize/2+1.
                   double input_sample_rate, int output_channel_count,
                   double lower_frequency_limit, double upper_frequency_limit);
diff --git a/core/ops/kernels/resample.cc b/core/ops/kernels/resample.cc
index 8b7f7327..ed6d1be5 100644
--- a/core/ops/kernels/resample.cc
+++ b/core/ops/kernels/resample.cc
@@ -35,7 +35,6 @@ LinearResample::LinearResample(int samp_rate_in_hz,
   assert(samp_rate_in_hz > 0.0 &&
                samp_rate_out_hz > 0.0 &&
                filter_cutoff_hz > 0.0 &&
-               filter_cutoff_hz*2 <= samp_rate_in_hz &&
                filter_cutoff_hz*2 <= samp_rate_out_hz &&
                num_zeros > 0);
 
@@ -56,7 +55,7 @@ int LinearResample::GetNumOutputSamples(int input_num_samp,
 
   // work out the number of ticks in the time interval
   // [ 0, input_num_samp/samp_rate_in_ ).
-  int interval_length_in_ticks = input_num_samp * ticks_per_input_period;
+  long long interval_length_in_ticks = (long long)input_num_samp * (long long)ticks_per_input_period;
   if (!flush) {
     BaseFloat window_width = num_zeros_ / (2.0 * filter_cutoff_);
     int window_width_ticks = floor(window_width * tick_freq);
diff --git a/core/ops/kernels/resample.h b/core/ops/kernels/resample.h
index 06cef89c..19c07014 100644
--- a/core/ops/kernels/resample.h
+++ b/core/ops/kernels/resample.h
@@ -25,6 +25,10 @@ limitations under the License.
 #include <vector>
 #include <assert.h>
 
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+using namespace tensorflow;  // NOLINT
+
 using namespace std;
 #include "kernels/support_functions.h"
 
diff --git a/core/ops/kernels/spectrum.cc b/core/ops/kernels/spectrum.cc
index d284437d..eec21b11 100644
--- a/core/ops/kernels/spectrum.cc
+++ b/core/ops/kernels/spectrum.cc
@@ -30,19 +30,21 @@ Spectrum::Spectrum() {
   window_length_sec_ = window_length_sec;
   frame_length_sec_ = frame_length_sec;
   i_OutTyp = 1;
-  i_snip_edges = true;
+  i_snip_edges = 1;
   i_raw_energy = 1;
   f_PreEph = 0.97;
   i_is_fbank = true;
   i_remove_dc_offset = true;
+  i_dither = 0.0;
   snprintf(s_WinTyp, sizeof(s_WinTyp), "povey");
   pf_WINDOW = NULL;
   pf_SPC = NULL;
-}
-
-Spectrum::~Spectrum() {
-  free(pf_WINDOW);
-  free(pf_SPC);
+  win_temp = NULL;
+  win_buf = NULL;
+  eph_buf = NULL;
+  win = NULL;
+  fftwin = NULL;
+  fft_buf = NULL;
 }
 
 void Spectrum::set_window_length_sec(float window_length_sec) {
@@ -55,35 +57,34 @@ void Spectrum::set_frame_length_sec(float frame_length_sec) {
 
 void Spectrum::set_output_type(int output_type) { i_OutTyp = output_type; }
 
-void Spectrum::set_snip_edges(bool snip_edges) { i_snip_edges = snip_edges; }
+void Spectrum::set_snip_edges(int snip_edges) { i_snip_edges = snip_edges; }
 
-void Spectrum::set_raw_energy(int raw_energy) { i_raw_energy = raw_energy; }
+void Spectrum::set_raw_energy(int raw_energy) {i_raw_energy = raw_energy;}
 
-void Spectrum::set_is_fbank(bool is_fbank) { i_is_fbank = is_fbank; }
+void Spectrum::set_is_fbank(bool is_fbank) {i_is_fbank = is_fbank;}
 
-void Spectrum::set_remove_dc_offset(bool remove_dc_offset) {
-  i_remove_dc_offset = remove_dc_offset;
-}
+void Spectrum::set_remove_dc_offset(bool remove_dc_offset) {i_remove_dc_offset = remove_dc_offset;}
 
-void Spectrum::set_preEph(float preEph) { f_PreEph = preEph; }
+void Spectrum::set_preEph(float preEph) {f_PreEph = preEph;}
 
-void Spectrum::set_window_type(char* window_type) {
-  snprintf(s_WinTyp, sizeof(s_WinTyp), "%s", window_type);
+void Spectrum::set_dither(float dither) {i_dither = dither;}
+
+void Spectrum::set_window_type(char* window_type){
+    snprintf(s_WinTyp, sizeof(s_WinTyp), "%s", window_type);
 }
 
 int Spectrum::init_spc(int input_size, float sample_rate) {
   f_SamRat = sample_rate;
   i_WinLen = static_cast<int>(window_length_sec_ * f_SamRat);
   i_FrmLen = static_cast<int>(frame_length_sec_ * f_SamRat);
-  if (i_snip_edges == true)
+  if (i_snip_edges == 1)
     i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1;
   else
     i_NumFrm = (input_size + i_FrmLen / 2) / i_FrmLen;
+  if (i_NumFrm < 1)
+    i_NumFrm = 1;
   i_FFTSiz = static_cast<int>(pow(2.0f, ceil(log2(i_WinLen))));
   i_NumFrq = i_FFTSiz / 2 + 1;
-  if (i_NumFrm < 1) i_NumFrm = 1;
-  pf_WINDOW = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
-  pf_SPC = static_cast<float*>(malloc(sizeof(float) * i_NumFrq * i_NumFrm));
 
   return 1;
 }
@@ -91,36 +92,44 @@ int Spectrum::init_spc(int input_size, float sample_rate) {
 int Spectrum::proc_spc(const float* mic_buf, int input_size) {
   int n, k;
 
-  /* generate window */
-  gen_window(pf_WINDOW, i_WinLen, s_WinTyp);
-
   if (input_size < i_WinLen)
-    std::cerr << "Wraning: The length of input data is shorter than "
-              << window_length_sec_ << " s." << std::endl;
+    std::cerr<<"Wraning: The length of input data is shorter than "<< window_length_sec_ << " s." <<std::endl;
 
-  float tmp;
-  xcomplex* win = static_cast<xcomplex*>(malloc(sizeof(xcomplex) * i_FFTSiz));
-  float* win_buf = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
-  float* eph_buf = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
-  float* win_temp = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
-  xcomplex* fftwin =
-      static_cast<xcomplex*>(malloc(sizeof(xcomplex) * i_FFTSiz));
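+  // Per-call buffers: freed before the normal return (pf_SPC by get_spc()). NOTE(review): the 'return -1' path appears to skip these frees.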
+  pf_WINDOW = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
+  pf_SPC = static_cast<float*>(malloc(sizeof(float) * i_NumFrq * i_NumFrm));
+  win = static_cast<xcomplex*>(malloc(sizeof(xcomplex) * i_FFTSiz));
+  win_buf = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
+  eph_buf = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
+  win_temp = static_cast<float*>(malloc(sizeof(float) * i_WinLen));
+  fftwin = static_cast<xcomplex*>(malloc(sizeof(xcomplex) * i_FFTSiz));
+  fft_buf = static_cast<float*>(malloc(sizeof(float) * 2 * i_FFTSiz));  // 2 floats (real, imag) per FFT point
+
+  /* generate window */
+  gen_window(pf_WINDOW, i_WinLen, s_WinTyp);
 
   for (n = 0; n < i_NumFrm; n++) {
     float signal_raw_log_energy = 0.0;
     float sum = 0.0;
-    for (int l = 0; l < i_WinLen; l++) {
+    for (int l = 0; l < i_WinLen; l++){
       int index = n * i_FrmLen + l;
-      if (index < input_size)
+      if (index < input_size) {
         win_buf[l] = mic_buf[index];
-      else
+      } else {
         win_buf[l] = 0.0f;
+      }
       sum += win_buf[l];
     }
 
-    if (i_remove_dc_offset == true) {
+    if(i_dither != 0.0) {
+      do_dither(win_buf, i_WinLen, i_dither);
+    }
+
+    if (i_remove_dc_offset == true){
       float mean = sum / i_WinLen;
-      for (int l = 0; l < i_WinLen; l++) win_buf[l] -= mean;
+      for (int l = 0; l < i_WinLen; l++) {
+        win_buf[l] -= mean;
+      }
     }
 
     /* do pre-emphais */
@@ -129,53 +138,58 @@ int Spectrum::proc_spc(const float* mic_buf, int input_size) {
     for (k = 0; k < i_WinLen; k++) {
       win[k].r = eph_buf[k] * pf_WINDOW[k];
       win[k].i = 0.0f;
-      if (i_raw_energy == 1)
-        win_temp[k] = win_buf[k];
-      else
-        win_temp[k] = win[k].r;
     }
 
-    for (k = i_WinLen; k < i_FFTSiz; k++) {
-      win[k].r = 0.0f;
-      win[k].i = 0.0f;
+    if (i_raw_energy == 1) {
+      std::memcpy(win_temp, win_buf, i_WinLen * sizeof(float));
     }
+    else {
+      for (k = 0; k < i_WinLen; k++) {
+        win_temp[k] = win[k].r;
+      }
+    }
+
+    std::memset((void*)&(win[i_WinLen]), 0, sizeof(float) * 2 * (i_FFTSiz - i_WinLen));;
 
     /* raw energy */
     signal_raw_log_energy = compute_energy(win_temp, i_WinLen);
 
     /* fft */
-    dit_r2_fft(win, fftwin, i_FFTSiz, -1);
+    dit_r2_fft(win, fftwin, fft_buf, i_FFTSiz, -1);
 
-    for (k = 0; k < i_NumFrq; k++) {
-      if (k == 0 && i_is_fbank == false) {
-        fftwin[k].r = sqrt(signal_raw_log_energy);
-        fftwin[k].i = 0.0f;
-      }
-      if (i_OutTyp == 1)
+    if (!i_is_fbank) {
+      fftwin[0].r = sqrt(signal_raw_log_energy);
+      fftwin[0].i = 0.0f;
+    }
+
+    if (i_OutTyp == 1) {
+      for (k = 0; k < i_NumFrq; k++) {
         pf_SPC[n * i_NumFrq + k] = complex_abs2(fftwin[k]);
-      else if (i_OutTyp == 2)
+      }
+    } else if (i_OutTyp == 2) {
+      for (k = 0; k < i_NumFrq; k++) {
         pf_SPC[n * i_NumFrq + k] = log(complex_abs2(fftwin[k]));
-      else
-        return -1;
+      }
+    } else {
+      return -1;
     }
   }
 
+  free(pf_WINDOW);
   free(win_temp);
   free(win_buf);
   free(eph_buf);
   free(win);
   free(fftwin);
+  free(fft_buf);
 
   return 1;
 }
 
 int Spectrum::get_spc(float* output) {
-  int n, m;
-  for (m = 0; m < i_NumFrq; m++) {
-    for (n = 0; n < i_NumFrm; n++) {
-      output[n * i_NumFrq + m] = pf_SPC[n * i_NumFrq + m];
-    }
-  }
+  std::memcpy((void*)output, (void*)pf_SPC, \
+		i_NumFrq * i_NumFrm * sizeof(float));
+  free(pf_SPC);
   return 1;
 }
 
@@ -192,4 +206,5 @@ int Spectrum::write_spc() {
   fclose(fp);
   return 1;
 }
+
 }  // namespace delta
diff --git a/core/ops/kernels/spectrum.h b/core/ops/kernels/spectrum.h
index 29c152d8..517890ce 100644
--- a/core/ops/kernels/spectrum.h
+++ b/core/ops/kernels/spectrum.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 #include "kernels/complex_defines.h"
+#include "kernels/support_functions.h"
 
 using namespace tensorflow;  // NOLINT
 
@@ -40,26 +41,32 @@ class Spectrum {
   float f_PreEph;
   char s_WinTyp[40];
   int i_OutTyp;  // 1: PSD, 2:log(PSD)
-  bool i_snip_edges;
+  int i_snip_edges;
   int i_raw_energy;
   bool i_remove_dc_offset;
   bool i_is_fbank;
+  float i_dither;
 
   float* pf_WINDOW;
   float* pf_SPC;
 
+  xcomplex* win;
+  float* win_buf;
+  float* eph_buf;
+  float* win_temp;
+  xcomplex* fftwin;
+  float* fft_buf;
+
  public:
   Spectrum();
 
-  ~Spectrum();
-
   void set_window_length_sec(float window_length_sec);
 
   void set_frame_length_sec(float frame_length_sec);
 
   void set_output_type(int output_type);
 
-  void set_snip_edges(bool snip_edges);
+  void set_snip_edges(int snip_edges);
 
   void set_raw_energy(int raw_energy);
 
@@ -71,6 +78,8 @@ class Spectrum {
 
   void set_remove_dc_offset(bool remove_dc_offset);
 
+  void set_dither(float dither);
+
   int init_spc(int input_size, float sample_rate);
 
   int proc_spc(const float* mic_buf, int input_size);
diff --git a/core/ops/kernels/spectrum_op.cc b/core/ops/kernels/spectrum_op.cc
index f36a7593..d6afecea 100644
--- a/core/ops/kernels/spectrum_op.cc
+++ b/core/ops/kernels/spectrum_op.cc
@@ -14,7 +14,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <string.h>
 #include "kernels/spectrum.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -22,8 +21,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include <string.h>
 
 namespace delta {
+
 class SpecOp : public OpKernel {
  public:
   explicit SpecOp(OpKernelConstruction* context) : OpKernel(context) {
@@ -34,9 +35,9 @@ class SpecOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("raw_energy", &raw_energy_));
     OP_REQUIRES_OK(context, context->GetAttr("preEph_coeff", &preEph_coeff_));
     OP_REQUIRES_OK(context, context->GetAttr("window_type", &window_type_));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("remove_dc_offset", &remove_dc_offset_));
+    OP_REQUIRES_OK(context, context->GetAttr("remove_dc_offset", &remove_dc_offset_));
     OP_REQUIRES_OK(context, context->GetAttr("is_fbank", &is_fbank_));
+    OP_REQUIRES_OK(context, context->GetAttr("dither", &dither_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -52,10 +53,8 @@ class SpecOp : public OpKernel {
                     sample_rate_tensor.shape().DebugString(), " instead."));
     const float sample_rate = sample_rate_tensor.scalar<float>()();
 
-    // shape
-    const int L = input_tensor.dim_size(0);
-    char* window_type = const_cast<char*>(window_type_.c_str());
     Spectrum cls_spc;
+    char* window_type = const_cast<char *>(window_type_.c_str());
     cls_spc.set_window_length_sec(window_length_);
     cls_spc.set_frame_length_sec(frame_length_);
     cls_spc.set_output_type(output_type_);
@@ -65,6 +64,10 @@ class SpecOp : public OpKernel {
     cls_spc.set_window_type(window_type);
     cls_spc.set_remove_dc_offset(remove_dc_offset_);
     cls_spc.set_is_fbank(is_fbank_);
+    cls_spc.set_dither(dither_);
+
+    // L = size of the input tensor along dimension 0, passed to init_spc() as the input length.
+    const int L = input_tensor.dim_size(0);
     OP_REQUIRES(context, cls_spc.init_spc(L, sample_rate),
                 errors::InvalidArgument(
                     "spectrum_class initialization failed for length ", L,
@@ -74,9 +77,11 @@ class SpecOp : public OpKernel {
     int i_WinLen = static_cast<int>(window_length_ * sample_rate);
     int i_FrmLen = static_cast<int>(frame_length_ * sample_rate);
     int i_NumFrm = (L - i_WinLen) / i_FrmLen + 1;
-    bool i_snip_edges = snip_edges_;
-    if (i_snip_edges == false) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen;
-    if (i_NumFrm < 1) i_NumFrm = 1;
+    int i_snip_edges = snip_edges_;
+    if (i_snip_edges == 2)
+        i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen;
+    if (i_NumFrm < 1)
+        i_NumFrm = 1;
     int i_FrqNum = static_cast<int>(pow(2.0f, ceil(log2(i_WinLen))) / 2 + 1);
     OP_REQUIRES_OK(
         context, context->allocate_output(0, TensorShape({i_NumFrm, i_FrqNum}),
@@ -94,12 +99,13 @@ class SpecOp : public OpKernel {
   float window_length_;
   float frame_length_;
   int output_type_;
-  bool snip_edges_;
+  int snip_edges_;
   int raw_energy_;
   float preEph_coeff_;
   string window_type_;
   bool remove_dc_offset_;
   bool is_fbank_;
+  float dither_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("Spectrum").Device(DEVICE_CPU), SpecOp);
diff --git a/core/ops/kernels/speed_op.cc b/core/ops/kernels/speed_op.cc
new file mode 100644
index 00000000..43452ecd
--- /dev/null
+++ b/core/ops/kernels/speed_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (C) 2017 Beijing Didi Infinity Technology and Development Co.,Ltd.
+All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "kernels/resample.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace delta {
+
+class SpeedOp : public OpKernel {
+ public:
+  explicit SpeedOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("lowpass_filter_width", &lowpass_filter_width_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor = context->input(0);
+    OP_REQUIRES(context, input_tensor.dims() == 1,
+                errors::InvalidArgument("input signal must be 1-dimensional",
+                                        input_tensor.shape().DebugString()));
+    const Tensor& sample_rate_tensor = context->input(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(sample_rate_tensor.shape()),
+                errors::InvalidArgument(
+                    "Input sample_rate should be a scalar tensor, got ",
+                    sample_rate_tensor.shape().DebugString(), " instead."));
+    const Tensor& resample_rate_tensor = context->input(2);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(resample_rate_tensor.shape()),
+                errors::InvalidArgument(
+                    "Resample sample_rate should be a scalar tensor, got ",
+                    resample_rate_tensor.shape().DebugString(), " instead."));
+    const int sample_rate = static_cast<int>(sample_rate_tensor.scalar<int32>()());
+    const int resample_freq = static_cast<int>(resample_rate_tensor.scalar<int32>()());
+    const float* input_flat = input_tensor.flat<float>().data();
+    const int L = input_tensor.dim_size(0);
+
+    lowpass_cutoff_ = min(resample_freq / 2, sample_rate / 2);
+    LinearResample cls_resample_(sample_rate, resample_freq,
+                                     lowpass_cutoff_,
+                                     lowpass_filter_width_);
+    vector<float> waveform(L);
+    for (int i = 0; i < L; i++){
+        waveform[i] = static_cast<float>(input_flat[i]);
+    }
+    vector<float> downsampled_wave;
+    cls_resample_.Resample(waveform, false, &downsampled_wave);
+    int output_length = downsampled_wave.size();
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({1, output_length}),
+                                                     &output_tensor));
+    float* output_flat = output_tensor->flat<float>().data();
+    for (int j = 0; j < output_length; j++)
+        output_flat[j] = downsampled_wave[j];
+
+    std::vector<float>().swap(downsampled_wave);
+    std::vector<float>().swap(waveform);
+    cls_resample_.Reset();
+   }
+
+ private:
+  float lowpass_cutoff_;
+  int lowpass_filter_width_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Speed").Device(DEVICE_CPU), SpeedOp);
+
+}  // namespace delta
diff --git a/core/ops/kernels/support_functions.cc b/core/ops/kernels/support_functions.cc
index 02d92ddc..b0631486 100644
--- a/core/ops/kernels/support_functions.cc
+++ b/core/ops/kernels/support_functions.cc
@@ -110,7 +110,7 @@ int gen_window(float* w, int L, char* typ) {
       w[n] = 0.54 - 0.46 * cos(pn[n]);
     }
   } else if (strcmp(typ, "povey") == 0) {
-    for (n = 0; n < L; n++) {
+    for (n = 0; n < L; n++){
       w[n] = pow(0.5 - 0.5 * cos(pn[n]), 0.85);
     }
   } else if (strcmp(typ, "blac") == 0) {
@@ -121,6 +121,7 @@ int gen_window(float* w, int L, char* typ) {
     printf("Window type not support!\n");
     return -1;
   }
+  free(pn);
   return 0;
 }
 
@@ -525,12 +526,10 @@ int compute_lpc(int ncep, int nfrm, int pord, float* x, float* y) {
 
 /* Radix-2 DIT FFT */
 /* isign=-1 ==> FFT, isign=1 ==> IFFT */
-int dit_r2_fft(xcomplex* input, xcomplex* output, int N, int isign) {
+int dit_r2_fft(xcomplex* input, xcomplex* output, float* in_buf, int N, int isign) {
   float wtemp, wr, wpr, wpi, wi, theta;
   float tempr, tempi;
   int i = 0, j = 0, n = 0, k = 0, m = 0, istep, mmax;
-  float* in_buf =
-      static_cast<float*>(malloc(sizeof(float) * 2 * N));  // c.r&c.i
   float* out_buf;
   float den;
   if (isign == -1)
@@ -592,29 +591,29 @@ int dit_r2_fft(xcomplex* input, xcomplex* output, int N, int isign) {
     output[i].r = out_buf[i * 2 + 1] / den;
     output[i].i = out_buf[i * 2 + 2] / den;
   }
-  free(in_buf);
   return 0;
 }
 
 /* compute energy of frame */
-float compute_energy(const float* input, int L) {
-  float energy = 0;
-  for (int i = 0; i < L; i++) {
-    energy += input[i] * input[i];
-  }
-  return energy;
+float compute_energy(const float* input, int L){
+    float energy = 0;
+    for (int i = 0; i < L; i++){
+        energy += input[i] * input[i];
+    }
+    return energy;
 }
 
 /* do pre_emphasis on frame */
-int do_frame_preemphasis(float* input, float* output, int i_size, float coef) {
-  if (coef == 0.0) {
+int do_frame_preemphasis(float* input, float* output, int i_size, float coef){
+    if (coef == 0.0){
+        memcpy(output, input, sizeof(float) * i_size);
+        return 0;
+    }
     memcpy(output, input, sizeof(float) * i_size);
+    for (int i = i_size - 1; i > 0; i--)
+        output[i] -= coef * output[i-1];
+    output[0] -= coef * output[0];
     return 0;
-  }
-  memcpy(output, input, sizeof(float) * i_size);
-  for (int i = i_size - 1; i > 0; i--) output[i] -= coef * output[i - 1];
-  output[0] -= coef * output[0];
-  return 0;
 }
 
 /* return subvector */
diff --git a/core/ops/kernels/support_functions.h b/core/ops/kernels/support_functions.h
index 105dc9f7..3cf27f2f 100644
--- a/core/ops/kernels/support_functions.h
+++ b/core/ops/kernels/support_functions.h
@@ -27,8 +27,6 @@ limitations under the License.
 #include <memory>
 #include <chrono>
 
-using namespace std;
-
 #include "kernels/complex_defines.h"
 
 #ifndef M_PI
@@ -39,9 +37,10 @@ using namespace std;
 #define M_2PI 6.283185307179586476925286766559005
 #endif
 
+using namespace std;
+
 namespace delta {
 typedef float  BaseFloat;
-
 /* compute mean */
 float compute_mean(float* input, int i_size);
 
@@ -105,7 +104,7 @@ int do_levinson(int pord, float* r, float* a);
 int compute_lpc(int ncep, int nfrm, int pord, float* x, float* y);
 
 /* radix-2 DIT FFT */
-int dit_r2_fft(xcomplex* input, xcomplex* output, int N, int isign);
+int dit_r2_fft(xcomplex* input, xcomplex* output, float* in_buf, int N, int isign);
 
 /* compute energy of frame */
 float compute_energy(const float* input, int L);
diff --git a/core/ops/kernels/synthfiltbank.cc b/core/ops/kernels/synthfiltbank.cc
index e1bb9901..946f69f1 100644
--- a/core/ops/kernels/synthfiltbank.cc
+++ b/core/ops/kernels/synthfiltbank.cc
@@ -68,7 +68,7 @@ int Synthfiltbank::proc_sfb(const float* powspc, const float* phaspc) {
   xcomplex* win = static_cast<xcomplex*>(malloc(sizeof(xcomplex) * i_FFTSiz));
   xcomplex* fftwin =
       static_cast<xcomplex*>(malloc(sizeof(xcomplex) * i_FFTSiz));
-
+  float* fft_buf = static_cast<float*>(malloc(sizeof(float) * 2 * i_FFTSiz));
   /* generate window */
   gen_window(pf_WINDOW, i_WinLen, s_WinTyp);
 
@@ -84,7 +84,7 @@ int Synthfiltbank::proc_sfb(const float* powspc, const float* phaspc) {
       fftwin[k].i = -1.0f * fftwin[i_FFTSiz - k].i;
     }
     /* ifft */
-    dit_r2_fft(fftwin, win, i_FFTSiz, 1);
+    dit_r2_fft(fftwin, win, fft_buf, i_FFTSiz, 1);
 
     for (k = 0; k < i_WinLen; k++) {
       pf_wav[n * i_FrmLen + k] +=
@@ -94,7 +94,7 @@ int Synthfiltbank::proc_sfb(const float* powspc, const float* phaspc) {
 
   free(win);
   free(fftwin);
-
+  free(fft_buf);
   return 1;
 }
 
diff --git a/core/ops/kernels/x_ops.cc b/core/ops/kernels/x_ops.cc
index 20681190..601124cd 100644
--- a/core/ops/kernels/x_ops.cc
+++ b/core/ops/kernels/x_ops.cc
@@ -365,6 +365,8 @@ REGISTER_OP("Pitch")
 REGISTER_OP("FramePow")
     .Input("input_data: float")
     .Input("sample_rate: float")
+    .Attr("snip_edges: int = 1")
+    .Attr("remove_dc_offset: bool = true")
     .Attr("window_length: float = 0.025")
     .Attr("frame_length: float = 0.010")
     .Output("output: float")
@@ -421,11 +423,12 @@ REGISTER_OP("Spectrum")
     .Attr("frame_length: float = 0.010")
     .Attr("window_type: string")
     .Attr("output_type: int = 2")
-    .Attr("snip_edges: bool = true")
+    .Attr("snip_edges: int = 1")
     .Attr("raw_energy: int = 1")
     .Attr("preEph_coeff: float = 0.97")
     .Attr("remove_dc_offset: bool = true")
     .Attr("is_fbank: bool = true")
+    .Attr("dither: float = 0.0")
     .Output("output: float")
     .SetShapeFn(SpectrumShapeFn)
     .Doc(R"doc(
@@ -529,9 +532,24 @@ filterbank_channel_count: int, resolution of the Mel bank used internally.
 output: float, fbank features, a tensor of shape [audio_channels, spectrogram_length, bank_feat_dim].
 )doc");
 
+REGISTER_OP("Speed")
+    .Input("input_data: float")
+    .Input("sample_rate: int32")
+    .Input("resample_freq: int32")
+    .Attr("lowpass_filter_width: int = 1")
+    .Output("output: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c){
+        return Status::OK();
+    })
+    .Doc(R"doc(
+    Resample the input wave from sample_rate to resample_freq.
+    input_data: float, input wave, a 1-D tensor of shape [data_length].
+    sample_rate: int32, NB 8000, WB 16000 etc.
+    )doc");
+
 REGISTER_OP("MfccDct")
     .Input("fbank: float")
-    .Input("spectrum: float")
+    .Input("framepow: float")
     .Input("sample_rate: int32")
     .Attr("coefficient_count: int = 13")
     .Attr("cepstral_lifter: float = 22")
diff --git a/core/ops/py_x_ops.py b/core/ops/py_x_ops.py
index 83941e33..681e4587 100644
--- a/core/ops/py_x_ops.py
+++ b/core/ops/py_x_ops.py
@@ -51,6 +51,7 @@
 delta_delta = gen_x_ops.delta_delta
 mfcc = gen_x_ops.mfcc_dct
 add_rir_noise_aecres = gen_x_ops.add_rir_noise_aecres
+speed = gen_x_ops.speed
 
 
 def jieba_cut(input_sentence, use_file=True, hmm=True):
diff --git a/delta/data/frontend/add_noise_end_to_end.py b/delta/data/frontend/add_noise_end_to_end.py
index 4820feda..26329fb0 100644
--- a/delta/data/frontend/add_noise_end_to_end.py
+++ b/delta/data/frontend/add_noise_end_to_end.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module adds noise/rir to signal and writes it to file."""
 
 import delta.compat as tf
 from delta.utils.hparam import HParams
@@ -23,7 +24,10 @@
 
 
 class AddNoiseEndToEnd(BaseFrontend):
-
+  """
+  Add a random signal-to-noise ratio noise or impulse response to clean speech, and
+  write it to wavfile.
+  """
   def __init__(self, config: dict):
     super().__init__(config)
     self.add_noise = Add_rir_noise_aecres(config)
diff --git a/delta/data/frontend/add_noise_end_to_end_test.py b/delta/data/frontend/add_noise_end_to_end_test.py
index 4e4dd5d6..7152a822 100644
--- a/delta/data/frontend/add_noise_end_to_end_test.py
+++ b/delta/data/frontend/add_noise_end_to_end_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module tests the AddNoiseEndToEnd OP."""
 
 import os
 from pathlib import Path
@@ -33,7 +34,9 @@ def change_file_path(scp_path, filetype, newfilePath):
 
 
 class AddNoiseEndToEndTest(tf.test.TestCase):
-
+  """
+  AddNoiseEndToEnd OP test.
+  """
   def test_add_noise_end_to_end(self):
 
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
diff --git a/delta/data/frontend/add_rir_noise_aecres.py b/delta/data/frontend/add_rir_noise_aecres.py
index b7728362..35426409 100644
--- a/delta/data/frontend/add_rir_noise_aecres.py
+++ b/delta/data/frontend/add_rir_noise_aecres.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module adds noise/rir to signal."""
 
 import delta.compat as tf
 from delta.utils.hparam import HParams
@@ -21,7 +22,9 @@
 
 
 class Add_rir_noise_aecres(BaseFrontend):
-
+  """
+  Add a random signal-to-noise ratio noise or impulse response to clean speech.
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -71,8 +74,10 @@ def params(cls, config=None):
   def call(self, audio_data, sample_rate=None):
     """
         Caculate power spectrum or log power spectrum of audio data.
-        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-        :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
+        :param audio_data: the audio signal from which to compute spectrum.
+                          Should be an (1, N) tensor.
+        :param sample_rate: [option]the samplerate of the signal we working with,
+                           default is 16kHz.
         :return: A float tensor of size N containing add-noise audio.
         """
 
diff --git a/delta/data/frontend/add_rir_noise_aecres_test.py b/delta/data/frontend/add_rir_noise_aecres_test.py
index 2b266d42..05939dbb 100644
--- a/delta/data/frontend/add_rir_noise_aecres_test.py
+++ b/delta/data/frontend/add_rir_noise_aecres_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module tests the Add_rir_noise_aecres OP."""
 
 import os
 from pathlib import Path
@@ -35,7 +36,9 @@ def change_file_path(scp_path, filetype, newfilePath):
 
 
 class AddRirNoiseAecresTest(tf.test.TestCase):
-
+  """
+  AddNoiseRIR OP test.
+  """
   def test_add_rir_noise_aecres(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/analyfiltbank.py b/delta/data/frontend/analyfiltbank.py
index 6d68be4f..7ce91df2 100644
--- a/delta/data/frontend/analyfiltbank.py
+++ b/delta/data/frontend/analyfiltbank.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module extracts power-spectrum && phase-spectrum features per frame."""
 
 import delta.compat as tf
 from core.ops import py_x_ops
@@ -21,7 +22,10 @@
 
 
 class Analyfiltbank(BaseFrontend):
-
+  """
+  Compute power-spectrum && phase-spectrum features of every frame in speech,
+  return two float tensors with size (num_frames, num_frequencies).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -29,9 +33,13 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains three optional parameters:window_length(float, default=0.030),
-          frame_length(float, default=0.010), sample_rate(int, default=16000).
-    :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
+    :param config: contains three optional parameters:
+        --sample_rate       : Waveform data sample frequency (must match the waveform
+                             file, if specified there). (float, default = 16000)
+        --window_length		 : Window length in seconds. (float, default = 0.030)
+        --frame_length		 : Hop length in seconds. (float, default = 0.010)
+    :return: An object of class HParams, which is a set of hyperparameters as
+             name-value pairs.
     """
 
     window_length = 0.030
@@ -51,13 +59,15 @@ def params(cls, config=None):
   def call(self, audio_data, sample_rate=None):
     """
     Caculate power spectrum and phase spectrum of audio data.
-    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
+    :param audio_data: the audio signal from which to compute spectrum.
+                      Should be an (1, N) tensor.
+    :param sample_rate: [option]the samplerate of the signal we working with,
+                        default is 16kHz.
     :return: Two returns:
-        power spectrum —— A float tensor of size (num_frames, num_frequencies) containing
-            power spectrum and of every frame in speech.
-        phase spectrum —— A float tensor of size (num_frames, num_frequencies) containing
-            phase spectrum and of every frame in speech.
+        power spectrum —— A float tensor of size (num_frames, num_frequencies)
+                          containing the power spectrum of every frame in speech.
+        phase spectrum —— A float tensor of size (num_frames, num_frequencies)
+                          containing the phase spectrum of every frame in speech.
     """
 
     p = self.config
diff --git a/delta/data/frontend/analyfiltbank_test.py b/delta/data/frontend/analyfiltbank_test.py
index 73ea0bf6..d8007a8b 100644
--- a/delta/data/frontend/analyfiltbank_test.py
+++ b/delta/data/frontend/analyfiltbank_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module tests the Analyfiltbank FE."""
 
 from pathlib import Path
 import numpy as np
@@ -24,7 +25,9 @@
 
 
 class Test(tf.test.TestCase):
-
+  """
+  Analyfiltbank extraction test.
+  """
   def test_analyfiltbank(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/cepstrum.py b/delta/data/frontend/cepstrum.py
index 5e98c368..1cf9e011 100644
--- a/delta/data/frontend/cepstrum.py
+++ b/delta/data/frontend/cepstrum.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module extracts Cepstrum features per frame."""
 
 import delta.compat as tf
 
@@ -22,7 +23,10 @@
 
 
 class Cepstrum(BaseFrontend):
-
+  """
+  Compute Cepstrum features of every frame in speech, return a float tensor
+  with size (num_frames, ceps_subband_num).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -30,10 +34,15 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains five optional parameters:window_length(float, default=0.025),
-          frame_length(float, default=0.010), sample_rate(int, default=16000),
-          ceps_subband_num(int, default=13), tag_ceps_mean_norm(bool, default=True).
-    :return:An object of class HParams, which is a set of hyperparameters as name-value pairs.
+    :param config: contains five optional parameters:
+        --sample_rate       : Waveform data sample frequency (must match the waveform
+                            file, if specified there). (float, default = 16000)
+        --window_length		 : Window length in seconds. (float, default = 0.025)
+        --frame_length		 : Hop length in seconds. (float, default = 0.010)
+        --ceps_subband_num : Number of Ceps_subband. (int, default=13).
+        --tag_ceps_mean_norm : Flag of tag_ceps_mean_norm. (bool, default=True).
+    :return:An object of class HParams, which is a set of hyperparameters as
+            name-value pairs.
     """
 
     window_length = 0.025
@@ -57,10 +66,13 @@ def params(cls, config=None):
   def call(self, audio_data, sample_rate=None):
     """
     Caculate cepstrum of audio data.
-    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-    :return:A float tensor of size (num_frames, ceps_subband_num) containing normalized cepstrum
-          (tag_ceps_mean_norm = True) or cepstrum (tag_ceps_mean_norm = False) of every frame in speech.
+    :param audio_data: the audio signal from which to compute spectrum.
+                        Should be an (1, N) tensor.
+    :param sample_rate: [option]the samplerate of the signal we working with,
+                        default is 16kHz.
+    :return:A float tensor of size (num_frames, ceps_subband_num) containing
+            normalized cepstrum (tag_ceps_mean_norm = True) or cepstrum
+            (tag_ceps_mean_norm = False) of every frame in speech.
     """
 
     p = self.config
diff --git a/delta/data/frontend/cepstrum_test.py b/delta/data/frontend/cepstrum_test.py
index 550842ed..fcbb4b4d 100644
--- a/delta/data/frontend/cepstrum_test.py
+++ b/delta/data/frontend/cepstrum_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module tests the Cepstrum FE."""
 
 import numpy as np
 from pathlib import Path
@@ -24,7 +25,9 @@
 
 
 class CepstrumTest(tf.test.TestCase):
-
+  """
+  Cepstrum extraction test.
+  """
   def test_cepstrum(self):
 
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
diff --git a/delta/data/frontend/cmvn.py b/delta/data/frontend/cmvn.py
index 0cdf7750..7717f9d0 100644
--- a/delta/data/frontend/cmvn.py
+++ b/delta/data/frontend/cmvn.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module computes CMVN of features."""
 
 import io
 import kaldiio
@@ -22,13 +23,27 @@
 
 
 class CMVN(BaseFrontend):
-
+  """
+  Compute and apply CMVN to features.
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
   @classmethod
   def params(cls, config=None):
-
+    """
+    Set params.
+    :param config: contains seven optional parameters:
+            --norm_means   : Flag of norm_means. (bool, default=True)
+            --norm_vars    : Flag of norm_vars. (bool, default=False)
+            --utt2spk      : Use for speaker CMVN. (string, default=None)
+            --spk2utt      : Rspecifier for speaker to utterance-list map.
+                            (string, default=None)
+            --reverse      : Flag of reverse. (bool, default=False)
+            --std_floor    : Floor to std. (float, default=1.0e-20)
+            --filetype     : Type of input file. (string, default='mat')
+    :return:
+    """
     norm_means = True
     norm_vars = False
     utt2spk = None
@@ -52,7 +67,11 @@ def params(cls, config=None):
     return hparams
 
   def call(self, stats):
-
+    """
+    Do CMVN.
+    :param stats: Statistics of features.
+    :return: Mean and std of features.
+    """
     p = self.config
 
     if isinstance(stats, dict):
diff --git a/delta/data/frontend/delta_delta.py b/delta/data/frontend/delta_delta.py
index 911a5955..430967e8 100644
--- a/delta/data/frontend/delta_delta.py
+++ b/delta/data/frontend/delta_delta.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module does delta_delta to features."""
 
 import delta.compat as tf
 
@@ -22,7 +23,9 @@
 
 
 class DeltaDelta(BaseFrontend):
-
+  """
+  Do Delta_delta to features.
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/delta_delta_test.py b/delta/data/frontend/delta_delta_test.py
index 616b6a4e..d8f0cc74 100644
--- a/delta/data/frontend/delta_delta_test.py
+++ b/delta/data/frontend/delta_delta_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module tests the Delta_delta FE."""
 
 import delta.compat as tf
 from delta.data.frontend.delta_delta import DeltaDelta
@@ -22,7 +23,9 @@
 
 
 class Delta_delta_Test(tf.test.TestCase):
-
+  """
+  Delta_delta extraction test.
+  """
   def test_delta_delta(self):
 
     self.feat_dim = 80
diff --git a/delta/data/frontend/fbank.py b/delta/data/frontend/fbank.py
index 06f8fbe4..3bd55ab7 100644
--- a/delta/data/frontend/fbank.py
+++ b/delta/data/frontend/fbank.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This module extracts Fbank features per frame."""
 
 import tensorflow as tf
 from core.ops import py_x_ops
@@ -22,7 +23,11 @@
 
 
 class Fbank(BaseFrontend):
-
+  """
+  Compute filter-bank features by applying triangular filters on a Mel scale to the
+  power spectrum to extract frequency bands. Return a float tensor with shape
+  (num_frames, num_frequencies, num_channels).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
     self.spect = Spectrum(config)
@@ -31,20 +36,34 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains thirteen optional parameters.
-        --sample_rate				  : Sample frequency of waveform data. (int, default = 16000)
-        --window_length				: Window length in seconds. (float, default = 0.025)
-        --frame_length				: Hop length in seconds. (float, default = 0.010)
-        --snip_edges				  : If True, the last frame (shorter than window_length) will be cutoff. If False, 1 // 2 frame_length data will be padded to data. (int, default = True)
-        ---raw_energy				  : If 1, compute frame energy before preemphasis and windowing. If 2,  compute frame energy after preemphasis and windowing. (int, default = 1)
-        --preeph_coeff				: Coefficient for use in frame-signal preemphasis. (float, default = 0.97)
-        --window_type				  : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). (string, default = "povey")
-        --remove_dc_offset		: Subtract mean from waveform on each frame (bool, default = true)
-        --is_fbank					  : If true, compute power spetrum without frame energy. If false, using the frame energy instead of the square of the constant component of the signal. (bool, default = true)
-        --output_type				  : If 1, return power spectrum. If 2, return log-power spectrum. (int, default = 1)
-        --upper_frequency_limit		        : High cutoff frequency for mel bins (if < 0, offset from Nyquist) (float, default = 0)
-        --lower_frequency_limit		        : Low cutoff frequency for mel bins (float, default = 20)
-        --filterbank_channel_count	      : Number of triangular mel-frequency bins (float, default = 23)
+    :param config: contains thirteen optional parameters:
+           --window_length				: Window length in seconds. (float, default = 0.025)
+           --frame_length				: Hop length in seconds. (float, default = 0.010)
+           --snip_edges				: If 1, the last frame (shorter than window_length) will be
+                                         cutoff. If 2, 1 // 2 frame_length data will be padded
+                                         to data. (int, default = 1)
+           --raw_energy				: If 1, compute frame energy before preemphasis and
+                                         windowing. If 2,  compute frame energy after
+                                         preemphasis and windowing. (int, default = 1)
+           --preeph_coeff				: Coefficient for use in frame-signal preemphasis.
+                                        (float, default = 0.97)
+           --window_type				: Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
+                                        (string, default = "povey")
+           --remove_dc_offset			: Subtract mean from waveform on each frame.
+                                         (bool, default = true)
+           --is_fbank					: If true, compute power spectrum without frame energy.
+                                         If false, using the frame energy instead of the
+                                         square of the constant component of the signal.
+                                         (bool, default = true)
+           --output_type				: If 1, return power spectrum. If 2, return log-power
+                                         spectrum. (int, default = 1)
+           --upper_frequency_limit		: High cutoff frequency for mel bins (if <= 0, offset
+                                        from Nyquist) (float, default = 0)
+           --lower_frequency_limit		: Low cutoff frequency for mel bins (float, default = 20)
+           --filterbank_channel_count	: Number of triangular mel-frequency bins.
+                                        (float, default = 23)
+           --dither			    	: Dithering constant (0.0 means no dither).
+                                        (float, default = 0.0) [adds robustness to training]
     :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
     """
 
@@ -55,12 +74,13 @@ def params(cls, config=None):
     frame_length = 0.010
     output_type = 1
     sample_rate = 16000
-    snip_edges = True
+    snip_edges = 1
     raw_energy = 1
     preeph_coeff = 0.97
     window_type = 'povey'
     remove_dc_offset = True
     is_fbank = True
+    dither = 0.0
 
     hparams = HParams(cls=cls)
     hparams.add_hparam('upper_frequency_limit', upper_frequency_limit)
@@ -76,6 +96,7 @@ def params(cls, config=None):
     hparams.add_hparam('window_type', window_type)
     hparams.add_hparam('remove_dc_offset', remove_dc_offset)
     hparams.add_hparam('is_fbank', is_fbank)
+    hparams.add_hparam('dither', dither)
 
     if config is not None:
       hparams.override_from_dict(config)
@@ -84,11 +105,13 @@ def params(cls, config=None):
 
   def call(self, audio_data, sample_rate=None):
     """
-    Caculate fbank features of audio data.
-    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-    :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
-            fbank features of every frame in speech.
+       Calculate fbank features of audio data.
+       :param audio_data: the audio signal from which to compute spectrum.
+                          Should be an (1, N) tensor.
+       :param sample_rate: [option]the samplerate of the signal we working with,
+                            default is 16kHz.
+       :return: A float tensor of size (num_frames, num_frequencies, num_channels) containing
+               fbank features of every frame in speech.
     """
     p = self.config
     with tf.name_scope('fbank'):
@@ -116,4 +139,10 @@ def call(self, audio_data, sample_rate=None):
             lower_frequency_limit=p.lower_frequency_limit,
             filterbank_channel_count=p.filterbank_channel_count)
 
+        fbank = tf.squeeze(fbank, axis=0)
+        shape = tf.shape(fbank)
+        nframe = shape[0]
+        nfbank = shape[1]
+        fbank = tf.reshape(fbank, (nframe, nfbank, 1))
+
         return fbank
diff --git a/delta/data/frontend/fbank_pitch.py b/delta/data/frontend/fbank_pitch.py
index 3cb53445..84e2e240 100644
--- a/delta/data/frontend/fbank_pitch.py
+++ b/delta/data/frontend/fbank_pitch.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This model extracts Fbank && Pitch features per frame."""
 
 import delta.compat as tf
 from delta.utils.hparam import HParams
@@ -22,7 +23,10 @@
 
 
 class FbankPitch(BaseFrontend):
-
+  """
+  Compute Fbank && Pitch features respectively, and concatenate them. Return
+  a tensor with shape (num_frames, dim_features).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
     self.fbank = Fbank(config)
@@ -32,10 +36,79 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains eight optional parameters:upper_frequency_limit(float, default=4000.0),
-    lower_frequency_limit(float, default=20.0), filterbank_channel_count(float, default=40.0),
-    window_length(float, default=0.025), frame_length(float, default=0.010),
-    thres_autoc(float, default=0.3), output_type(int, default=2), sample_rate(int, default=16000).
+    :param config: contains thirty optional parameters:
+          --sample_rate         : Samplerate of the signal we working with.
+                                  (int, default = 16000)
+          --window_length		    : Window length in seconds. (float, default = 0.025)
+          --frame_length			  : Hop length in seconds. (float, default = 0.010)
+          --snip_edges				  : If 1, the last frame (shorter than window_length) will
+                                        be cutoff. If 2, 1 // 2 frame_length data will be padded
+                                         to data. (int, default = 1)
+          ---raw_energy				  : If 1, compute frame energy before preemphasis and
+                                        windowing. If 2,  compute frame energy after preemphasis
+                                         and windowing. (int, default = 1)
+          --preEph_coeff			  : Coefficient for use in frame-signal preemphasis.
+                                        (float, default = 0.97)
+          --window_type				  : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
+                                        (string, default = "povey")
+          --remove_dc_offset	      : Subtract mean from waveform on each frame.
+                                        (bool, default = true)
+          --is_fbank				  : If true, compute power spetrum without frame
+                                        energy. If false, using the frame energy instead
+                                         of the square of the constant component of the
+                                         signal. (bool, default = true)
+          --output_type				  : If 1, return power spectrum. If 2, return
+                                        log-power spectrum. (int, default = 1)
+          --upper_frequency_limit	  : High cutoff frequency for mel bins.
+                                        (if <= 0, offset from Nyquist) (float, default = 0)
+          --lower_frequency_limit	  : Low cutoff frequency for mel bins.
+                                        (float, default = 20)
+          --filterbank_channel_count  : Number of triangular mel-frequency bins.
+                                        (float, default = 23)
+          --dither			    	  : Dithering constant (0.0 means no dither).
+                                        (float, default = 0.0)
+            [add robust to training]
+          --delta-pitch               : Smallest relative change in pitch that our
+                                        algorithm measures. (float, default = 0.005)
+          --frames-per-chunk          : Only relevant for offline pitch extraction.
+                                        (e.g. compute-kaldi-pitch-feats), you can set it to a
+                                        small nonzero value, such as 10, for better feature
+                                        compatibility with online decoding (affects energy
+                                        normalization in the algorithm) (int, default = 0)
+          --lowpass-cutoff            : cutoff frequency for LowPass filter (Hz).
+                                        (float, default = 1000)
+          --lowpass-filter-width      : Integer that determines filter width of lowpass filter,
+                                        more gives sharper filter (int, default = 1)
+          --max-f0                    : max. F0 to search for (Hz) (float, default = 400)
+          --max-frames-latency        : Maximum number of frames of latency that we allow pitch
+                                        tracking to introduce into the feature processing
+                                        (affects output only if --frames-per-chunk > 0 and
+                                        --simulate-first-pass-online=true) (int, default = 0)
+          --min-f0                    : min. F0 to search for (Hz) (float, default = 50)
+          --nccf-ballast              : Increasing this factor reduces NCCF for quiet frames.
+                                        (float, default = 7000)
+          --nccf-ballast-online       : This is useful mainly for debug; it affects how the
+                                        NCCF ballast is computed. (bool, default = false)
+          --penalty-factor            : cost factor for FO change. (float, default = 0.1)
+          --preemphasis-coefficient   : Coefficient for use in signal preemphasis (deprecated)
+                                        (float, default = 0)
+          --recompute-frame           : Only relevant for online pitch extraction, or for
+                                        compatibility with online pitch extraction.  A
+                                        non-critical parameter; the frame at which we recompute
+                                        some of the forward pointers, after revising our
+                                        estimate of the signal energy. Relevant
+                                        if --frames-per-chunk > 0. (int, default = 500)
+          --resample-frequency        : Frequency that we down-sample the signal to. Must be
+                                        more than twice lowpass-cutoff (float, default = 4000)
+          --simulate-first-pass-online : If true, compute-kaldi-pitch-feats will output features
+                                         that correspond to what an online decoder would see in
+                                         the first pass of decoding-- not the final version of
+                                         the features, which is the default.  Relevant if
+                                         --frames-per-chunk > 0 (bool, default = false)
+          --soft-min-f0               : Minimum f0, applied in soft way, must not exceed
+                                        min-f0 (float, default = 10)
+          --upsample-filter-width     : Integer that determines filter width when upsampling
+                                        NCCF (int, default = 5)
     :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
     """
     hparams = HParams(cls=cls)
@@ -51,8 +124,9 @@ def params(cls, config=None):
     remove_dc_offset = True
     is_fbank = True
     output_type = 1
+    dither = 0.0
     sample_rate = 16000
-    snip_edges = True
+    snip_edges = 1
     preemph_coeff = 0.0
     min_f0 = 50.0
     max_f0 = 400.0
@@ -73,6 +147,7 @@ def params(cls, config=None):
     hparams.add_hparam('sample_rate', sample_rate)
     hparams.add_hparam('snip_edges', snip_edges)
     hparams.add_hparam('preemph_coeff', preemph_coeff)
+    hparams.add_hparam('dither', dither)
     hparams.add_hparam('min_f0', min_f0)
     hparams.add_hparam('max_f0', max_f0)
     hparams.add_hparam('soft_min_f0', soft_min_f0)
@@ -108,9 +183,11 @@ def params(cls, config=None):
   def call(self, audio_data, sample_rate=None):
     """
     Caculate fbank && pitch(concat) features of wav.
-    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-    :return: A tensor with shape (num_frames, dim_features), containing fbank && pitch feature of every frame in speech.
+    :param audio_data: the audio signal from which to compute spectrum.
+                       Should be an (1, N) tensor.
+    :param sample_rate: the samplerate of the signal we working with.
+    :return: A tensor with shape (num_frames, dim_features), containing
+            fbank && pitch feature of every frame in speech.
     """
 
     p = self.config
diff --git a/delta/data/frontend/fbank_pitch_test.py b/delta/data/frontend/fbank_pitch_test.py
index 07c917e0..b778f7e6 100644
--- a/delta/data/frontend/fbank_pitch_test.py
+++ b/delta/data/frontend/fbank_pitch_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests Fbank&&Pitch FE."""
 
 import delta.compat as tf
 import os
@@ -23,7 +24,9 @@
 
 
 class FbankPitchTest(tf.test.TestCase):
-
+  """
+  Compare Fbank&&Pitch FE with kaldi.
+  """
   def test_FbankPitch(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/fbank_test.py b/delta/data/frontend/fbank_test.py
index 03f4cb2b..c26fe6bb 100644
--- a/delta/data/frontend/fbank_test.py
+++ b/delta/data/frontend/fbank_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests Fbank FE."""
 
 import os
 import numpy as np
@@ -25,7 +26,9 @@
 
 
 class FbankTest(tf.test.TestCase):
-
+  """
+  Test Fbank FE using 8k/16k wav files.
+  """
   def test_fbank(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
@@ -36,7 +39,7 @@ def test_fbank(self):
           'window_length': 0.025,
           'output_type': 1,
           'frame_length': 0.010,
-          'snip_edges': True
+          'snip_edges': 1
       }
       fbank = Fbank.params(config).instantiate()
       fbank_test = fbank(input_data, sample_rate)
@@ -48,7 +51,7 @@ def test_fbank(self):
            [3.803553, 5.450971, 6.547878, 5.796172, 6.397846, 7.242926]])
 
       self.assertAllClose(
-          np.squeeze(fbank_test.eval()[0, 0:2, 0:6]),
+          np.squeeze(fbank_test.eval()[0:2, 0:6, 0]),
           real_fank_feats,
           rtol=1e-05,
           atol=1e-05)
diff --git a/delta/data/frontend/framepow.py b/delta/data/frontend/framepow.py
index 443c7579..5d54ca95 100644
--- a/delta/data/frontend/framepow.py
+++ b/delta/data/frontend/framepow.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This model extracts framepow features per frame."""
 
 import delta.compat as tf
 
@@ -22,7 +23,10 @@
 
 
 class Framepow(BaseFrontend):
-
+  """
+  Compute power of every frame in speech. Return a float tensor with
+  shape (1 * num_frames).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -30,18 +34,29 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains three optional parameters:window_length(float, default=0.025),
-          frame_length(float, default=0.010), sample_rate(int, default=16000).
+    :param config: contains five optional parameters:
+        --sample_rate       : Waveform data sample frequency (must match the waveform
+                             file, if specified there). (float, default = 16000)
+        --window_length		 : Window length in seconds. (float, default = 0.025)
+        --frame_length		 : Hop length in seconds. (float, default = 0.010)
+        --snip_edges			 : If True, the last frame (shorter than window_length)
+                              will be cutoff. If False, 1 // 2 frame_length data will
+                              be padded to data. (int, default = 1)
+        --remove_dc_offset : Subtract mean from waveform on each frame (bool, default = true)
     :return:An object of class HParams, which is a set of hyperparameters as name-value pairs.
     """
 
     window_length = 0.025
     frame_length = 0.010
+    snip_edges = 1
+    remove_dc_offset = True
     sample_rate = 16000
 
     hparams = HParams(cls=cls)
     hparams.add_hparam('window_length', window_length)
     hparams.add_hparam('frame_length', frame_length)
+    hparams.add_hparam('snip_edges', snip_edges)
+    hparams.add_hparam('remove_dc_offset', remove_dc_offset)
     hparams.add_hparam('sample_rate', sample_rate)
 
     if config is not None:
@@ -51,11 +66,14 @@ def params(cls, config=None):
 
   def call(self, audio_data, sample_rate=None):
     """
-        Caculate power of every frame in speech.
-        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-        :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-        :return:A float tensor of size (1, num_frames) containing power of every frame in speech.
-        """
+    Calculate power of every frame in speech.
+    :param audio_data: the audio signal from which to compute spectrum.
+                       Should be an (1, N) tensor.
+    :param sample_rate: [option]the samplerate of the signal we working with,
+                        default is 16kHz.
+    :return:A float tensor of size (1 * num_frames) containing power of every
+            frame in speech.
+    """
 
     p = self.config
     with tf.name_scope('framepow'):
@@ -68,10 +86,11 @@ def call(self, audio_data, sample_rate=None):
       with tf.control_dependencies([assert_op]):
 
         sample_rate = tf.cast(sample_rate, dtype=float)
-        framepow = py_x_ops.frame_pow(
-            audio_data,
-            sample_rate,
-            window_length=p.window_length,
-            frame_length=p.frame_length)
-
-        return framepow
+        framepow = py_x_ops.frame_pow(audio_data,
+                                      sample_rate,
+                                      snip_edges=p.snip_edges,
+                                      remove_dc_offset=p.remove_dc_offset,
+                                      window_length=p.window_length,
+                                      frame_length=p.frame_length)
+
+        return tf.squeeze(framepow)
diff --git a/delta/data/frontend/framepow_test.py b/delta/data/frontend/framepow_test.py
index c21a568b..4a8a879f 100644
--- a/delta/data/frontend/framepow_test.py
+++ b/delta/data/frontend/framepow_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests framepow FE."""
 
 import os
 import numpy as np
@@ -25,14 +26,15 @@
 
 
 class FramepowTest(tf.test.TestCase):
-
+  """
+  Framepow extraction test.
+  """
   def test_framepow(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
     with self.cached_session(use_gpu=False, force_gpu=False):
       read_wav = ReadWav.params().instantiate()
       input_data, sample_rate = read_wav(wav_path)
-      input_data = input_data / 32768
 
       framepow = Framepow.params({
           'window_length': 0.025,
@@ -40,19 +42,11 @@ def test_framepow(self):
       }).instantiate()
       framepow_test = framepow(input_data, sample_rate)
 
-      output_true = np.array([
-          0.000018, 0.000011, 0.000010, 0.000010, 0.000010, 0.000010, 0.000008,
-          0.000009, 0.000009, 0.000009, 0.000009, 0.000011, 0.090164, 0.133028,
-          0.156547, 0.053551, 0.056670, 0.097706, 0.405659, 2.119505, 4.296845,
-          6.139090, 6.623638, 6.136467, 7.595072, 7.904415, 7.655983, 6.771016,
-          5.706427, 4.220942, 3.259599, 2.218259, 1.911394, 2.234246, 3.056905,
-          2.534153, 0.464354, 0.013493, 0.021231, 0.148362, 0.364829, 0.627266,
-          0.494912, 0.366029, 0.315408, 0.312441, 0.323796, 0.267505, 0.152856,
-          0.045305
-      ])
+      real_framepow_feats = np.array(
+        [9.819611, 9.328745, 9.247337, 9.26451, 9.266059])
 
       self.assertEqual(tf.rank(framepow_test).eval(), 1)
-      self.assertAllClose(framepow_test.eval().flatten()[:50], output_true)
+      self.assertAllClose(framepow_test.eval()[0 : 5], real_framepow_feats)
 
 
 if __name__ == '__main__':
diff --git a/delta/data/frontend/mfcc.py b/delta/data/frontend/mfcc.py
index 9a5de70e..16086b0e 100644
--- a/delta/data/frontend/mfcc.py
+++ b/delta/data/frontend/mfcc.py
@@ -13,46 +13,62 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This model extracts MFCC features per frame."""
 
 import delta.compat as tf
 from core.ops import py_x_ops
 from delta.utils.hparam import HParams
 from delta.data.frontend.base_frontend import BaseFrontend
 from delta.data.frontend.fbank import Fbank
-from delta.data.frontend.spectrum import Spectrum
+from delta.data.frontend.framepow import Framepow
 import copy
 
 
 class Mfcc(BaseFrontend):
-
+  """
+  Compute mfcc features of every frame in speech, return a float tensor
+  with size (num_channels, num_frames, num_frequencies).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
-    config1 = copy.deepcopy(config)
-    config1['is_fbank'] = False
-    config1['output_type'] = 2
-    self.spect = Spectrum(config1)
+    self.framepow = Framepow(config)
     self.fbank = Fbank(config)
 
   @classmethod
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains fifthteen optional parameters.
-        --sample_rate				  : Sample frequency of waveform data. (int, default = 16000)
+    :param config: contains fifteen optional parameters.
         --window_length				: Window length in seconds. (float, default = 0.025)
         --frame_length				: Hop length in seconds. (float, default = 0.010)
-        --snip_edges				  : If 1, the last frame (shorter than window_length) will be cutoff. If 2, 1 // 2 frame_length data will be padded to data. (int, default = 1)
-        ---raw_energy				  : If 1, compute frame energy before preemphasis and windowing. If 2,  compute frame energy after preemphasis and windowing. (int, default = 1)
-        --preeph_coeff				: Coefficient for use in frame-signal preemphasis. (float, default = 0.97)
-        --window_type				  : Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). (string, default = "povey")
-        --remove_dc_offset		: Subtract mean from waveform on each frame (bool, default = true)
-        --is_fbank					  : If true, compute power spetrum without frame energy. If false, using the frame energy instead of the square of the constant component of the signal. (bool, default = true)
-        --output_type				  : If 1, return power spectrum. If 2, return log-power spectrum. (int, default = 1)
-        --upper_frequency_limit		        : High cutoff frequency for mel bins (if < 0, offset from Nyquist) (float, default = 0)
-        --lower_frequency_limit		        : Low cutoff frequency for mel bins (float, default = 20)
-        --filterbank_channel_count	      : Number of triangular mel-frequency bins (float, default = 23)
-        --coefficient_count                 : Number of cepstra in MFCC computation.(int, default = 13)
-        --cepstral_lifter                 : Constant that controls scaling of MFCCs.(float, default = 22)
+        --snip_edges				: If 1, the last frame (shorter than window_length) will
+                                      be cutoff. If 2, 1 // 2 frame_length data will be padded
+                                      to data. (int, default = 1)
+        ---raw_energy				: If 1, compute frame energy before preemphasis and
+                                      windowing. If 2, compute frame energy after
+                                      preemphasis and windowing. (int, default = 1)
+        --preEph_coeff			    : Coefficient for use in frame-signal preemphasis.
+                                      (float, default = 0.97)
+        --window_type				: Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
+                                      (string, default = "povey")
+        --remove_dc_offset		    : Subtract mean from waveform on each frame
+                                      (bool, default = true)
+        --is_fbank					: If true, compute power spetrum without frame energy. If
+                                      false, using the frame energy instead of the square of the
+                                      constant component of the signal. (bool, default = true)
+        --output_type				: If 1, return power spectrum. If 2, return log-power
+                                      spectrum. (int, default = 1)
+        --upper_frequency_limit		: High cutoff frequency for mel bins (if < 0, offset from
+                                      Nyquist) (float, default = 0)
+        --lower_frequency_limit		: Low cutoff frequency for mel bins (float, default = 20)
+        --filterbank_channel_count	: Number of triangular mel-frequency bins.
+                                     (float, default = 23)
+        --coefficient_count         : Number of cepstra in MFCC computation.
+                                     (int, default = 13)
+        --cepstral_lifter           : Constant that controls scaling of MFCCs.
+                                     (float, default = 22)
+        --use_energy                :Use energy (not C0) in MFCC computation.
+                                     (bool, default = True)
     :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
     """
 
@@ -63,7 +79,7 @@ def params(cls, config=None):
     frame_length = 0.010
     output_type = 1
     sample_rate = 16000
-    snip_edges = True
+    snip_edges = 1
     raw_energy = 1
     preeph_coeff = 0.97
     window_type = 'povey'
@@ -72,6 +88,7 @@ def params(cls, config=None):
     cepstral_lifter = 22.0
     coefficient_count = 13
     use_energy = True
+    dither = 0.0
 
     hparams = HParams(cls=cls)
     hparams.add_hparam('upper_frequency_limit', upper_frequency_limit)
@@ -90,6 +107,7 @@ def params(cls, config=None):
     hparams.add_hparam('cepstral_lifter', cepstral_lifter)
     hparams.add_hparam('coefficient_count', coefficient_count)
     hparams.add_hparam('use_energy', use_energy)
+    hparams.add_hparam('dither', dither)
 
     if config is not None:
       hparams.override_from_dict(config)
@@ -99,10 +117,11 @@ def params(cls, config=None):
   def call(self, audio_data, sample_rate=None):
     """
     Caculate mfcc features of audio data.
-    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-    :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
-            mfcc features of every frame in speech.
+    :param audio_data: the audio signal from which to compute spectrum.
+                       Should be an (1, N) tensor.
+    :param sample_rate: the samplerate of the signal we working with.
+    :return: A float tensor of size (num_channels, num_frames, num_frequencies)
+            containing mfcc features of every frame in speech.
     """
     p = self.config
     with tf.name_scope('mfcc'):
@@ -114,14 +133,17 @@ def call(self, audio_data, sample_rate=None):
           tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
       with tf.control_dependencies([assert_op]):
 
-        spectrum_feats = self.spect(audio_data, sample_rate)
-        spectrum_feats = tf.expand_dims(spectrum_feats, 0)
         fbank_feats = self.fbank(audio_data, sample_rate)
-        mfcc = py_x_ops.mfcc(
-            fbank_feats,
-            spectrum_feats,
-            sample_rate,
-            use_energy=p.use_energy,
-            cepstral_lifter=p.cepstral_lifter,
-            coefficient_count=p.coefficient_count)
+        sample_rate = tf.cast(sample_rate, dtype=tf.int32)
+        shape = tf.shape(fbank_feats)
+        nframe = shape[0]
+        nfbank = shape[1]
+        fbank_feats = tf.reshape(fbank_feats, (1, nframe, nfbank))
+        framepow_feats = self.framepow(audio_data, sample_rate)
+        mfcc = py_x_ops.mfcc(fbank_feats,
+                             framepow_feats,
+                             sample_rate,
+                             use_energy=p.use_energy,
+                             cepstral_lifter=p.cepstral_lifter,
+                             coefficient_count=p.coefficient_count)
         return mfcc
diff --git a/delta/data/frontend/mfcc_test.py b/delta/data/frontend/mfcc_test.py
index b79a183c..b29d2033 100644
--- a/delta/data/frontend/mfcc_test.py
+++ b/delta/data/frontend/mfcc_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests MFCC FE."""
 
 import delta.compat as tf
 import os
@@ -24,7 +25,9 @@
 
 
 class MfccTest(tf.test.TestCase):
-
+  """
+  MFCC extraction test.
+  """
   def test_mfcc(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/pitch.py b/delta/data/frontend/pitch.py
index 5c747caf..44097d6b 100644
--- a/delta/data/frontend/pitch.py
+++ b/delta/data/frontend/pitch.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This model extracts pitch features per frame."""
 
 import delta.compat as tf
 from core.ops import py_x_ops
@@ -21,7 +22,10 @@
 
 
 class Pitch(BaseFrontend):
-
+  """
+  Compute pitch features of every frame in speech, return a float tensor
+  with size (num_frames, 2).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -29,27 +33,57 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains twenty optional parameters:
-      --delta-pitch               : Smallest relative change in pitch that our algorithm measures (float, default = 0.005)
-		  --frame-length              : Frame length in milliseconds (float, default = 25)
-		  --frame-shift               : Frame shift in milliseconds (float, default = 10)
-		  --frames-per-chunk          : Only relevant for offline pitch extraction (e.g. compute-kaldi-pitch-feats), you can set it to a small nonzero value, such as 10, for better feature compatibility with online decoding (affects energy normalization in the algorithm) (int, default = 0)
-		  --lowpass-cutoff            : cutoff frequency for LowPass filter (Hz)  (float, default = 1000)
-		  --lowpass-filter-width      : Integer that determines filter width of lowpass filter, more gives sharper filter (int, default = 1)
-		  --max-f0                    : max. F0 to search for (Hz) (float, default = 400)
-		  --max-frames-latency        : Maximum number of frames of latency that we allow pitch tracking to introduce into the feature processing (affects output only if --frames-per-chunk > 0 and --simulate-first-pass-online=true (int, default = 0)
-		  --min-f0                    : min. F0 to search for (Hz) (float, default = 50)
-		  --nccf-ballast              : Increasing this factor reduces NCCF for quiet frames (float, default = 7000)
-		  --nccf-ballast-online       : This is useful mainly for debug; it affects how the NCCF ballast is computed. (bool, default = false)
-		  --penalty-factor            : cost factor for FO change. (float, default = 0.1)
-		  --preemphasis-coefficient   : Coefficient for use in signal preemphasis (deprecated) (float, default = 0)
-		  --recompute-frame           : Only relevant for online pitch extraction, or for compatibility with online pitch extraction.  A non-critical parameter; the frame at which we recompute some of the forward pointers, after revising our estimate of the signal energy.  Relevant if--frames-per-chunk > 0 (int, default = 500)
-		  --resample-frequency        : Frequency that we down-sample the signal to.  Must be more than twice lowpass-cutoff (float, default = 4000)
-		  --sample-frequency          : Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
-		  --simulate-first-pass-online : If true, compute-kaldi-pitch-feats will output features that correspond to what an online decoder would see in the first pass of decoding-- not the final version of the features, which is the default.  Relevant if --frames-per-chunk > 0 (bool, default = false)
-		  --snip-edges                : If this is set to false, the incomplete frames near the ending edge won't be snipped, so that the number of frames is the file size divided by the frame-shift. This makes different types of features give the same number of frames. (bool, default = true)
-		  --soft-min-f0               : Minimum f0, applied in soft way, must not exceed min-f0 (float, default = 10)
-      --upsample-filter-width     : Integer that determines filter width when upsampling NCCF (int, default = 5)
+    :param config: contains twenty optional parameters:
+          --sample_rate               : Waveform data sample frequency (must match the waveform
+                                        file, if specified there). (float, default = 16000)
+          --delta-pitch               : Smallest relative change in pitch that our algorithm
+                                        measures (float, default = 0.005)
+          --window_length             : Frame length in seconds (float, default = 0.025)
+          --frame_length              : Frame shift in seconds (float, default = 0.010)
+          --frames-per-chunk          : Only relevant for offline pitch extraction (e.g.
+                                        compute-kaldi-pitch-feats), you can set it to a small
+                                        nonzero value, such as 10, for better feature
+                                        compatibility with online decoding (affects energy
+                                        normalization in the algorithm) (int, default = 0)
+          --lowpass-cutoff            : cutoff frequency for LowPass filter (Hz).
+                                        (float, default = 1000)
+          --lowpass-filter-width      : Integer that determines filter width of lowpass filter,
+                                        more gives sharper filter (int, default = 1)
+          --max-f0                    : max. F0 to search for (Hz) (float, default = 400)
+          --max-frames-latency        : Maximum number of frames of latency that we allow pitch
+                                        tracking to introduce into the feature processing
+                                        (affects output only if --frames-per-chunk > 0 and
+                                        --simulate-first-pass-online=true) (int, default = 0)
+          --min-f0                    : min. F0 to search for (Hz) (float, default = 50)
+          --nccf-ballast              : Increasing this factor reduces NCCF for quiet frames.
+                                        (float, default = 7000)
+          --nccf-ballast-online       : This is useful mainly for debug; it affects how the NCCF
+                                        ballast is computed. (bool, default = false)
+          --penalty-factor            : cost factor for FO change. (float, default = 0.1)
+          --preemphasis-coefficient   : Coefficient for use in signal preemphasis (deprecated).
+                                        (float, default = 0)
+          --recompute-frame           : Only relevant for online pitch extraction, or for
+                                        compatibility with online pitch extraction.  A
+                                        non-critical parameter; the frame at which we recompute
+                                        some of the forward pointers, after revising our
+                                        estimate of the signal energy.  Relevant
+                                        if --frames-per-chunk > 0. (int, default = 500)
+          --resample-frequency        : Frequency that we down-sample the signal to.  Must be
+                                        more than twice lowpass-cutoff (float, default = 4000)
+          --simulate-first-pass-online : If true, compute-kaldi-pitch-feats will output features
+                                        that correspond to what an online decoder would see in
+                                        the first pass of decoding-- not the final version of
+                                        the features, which is the default.  Relevant if
+                                        --frames-per-chunk > 0 (bool, default = false)
+          --snip-edges                : If this is set to false, the incomplete frames near the
+                                        ending edge won't be snipped, so that the number of
+                                        frames is the file size divided by the frame-shift.
+                                        This makes different types of features give the same
+                                        number of frames. (bool, default = true)
+          --soft-min-f0               : Minimum f0, applied in soft way, must not exceed min-f0.
+                                        (float, default = 10)
+          --upsample-filter-width     : Integer that determines filter width when upsampling
+                                        NCCF. (int, default = 5)
     :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
     """
 
@@ -103,12 +137,13 @@ def params(cls, config=None):
 
   def call(self, audio_data, sample_rate=None):
     """
-       Caculate picth features of audio data.
-       :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-       :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-       :return: A float tensor of size (num_frames, 2) containing
-               pitch && POV features of every frame in speech.
-       """
+    Calculate pitch features of audio data.
+    :param audio_data: the audio signal from which to compute spectrum.
+                      Should be an (1, N) tensor.
+    :param sample_rate: the sample rate of the signal we are working with.
+    :return: A float tensor of size (num_frames, 2) containing
+           pitch && POV features of every frame in speech.
+    """
     p = self.config
 
     with tf.name_scope('pitch'):
diff --git a/delta/data/frontend/pitch_test.py b/delta/data/frontend/pitch_test.py
index f9ed3c0b..7564522c 100644
--- a/delta/data/frontend/pitch_test.py
+++ b/delta/data/frontend/pitch_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests pitch FE."""
 
 import delta.compat as tf
 import os
@@ -24,7 +25,9 @@
 
 
 class SpectrumTest(tf.test.TestCase):
-
+  """
+  Pitch extraction test.
+  """
   def test_spectrum(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/plp.py b/delta/data/frontend/plp.py
index 74b3e584..e07bc376 100644
--- a/delta/data/frontend/plp.py
+++ b/delta/data/frontend/plp.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This model extracts PLP features per frame."""
 
 import delta.compat as tf
 
@@ -22,7 +23,10 @@
 
 
 class Plp(BaseFrontend):
-
+  """
+  Compute PLP features of every frame in speech, return a float tensor
+  with size (num_frames, plp_order + 1).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -30,9 +34,12 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains four optional parameters:window_length(float, default=0.025),
-          frame_length(float, default=0.010), sample_rate(float, default=16000),
-          plp_order(int, default=12).
+    :param config: contains four optional parameters:
+        --sample_rate       : Waveform data sample frequency (must match the waveform
+                             file, if specified there). (float, default = 16000)
+        --window_length		 : Window length in seconds. (float, default = 0.025)
+        --frame_length		 : Hop length in seconds. (float, default = 0.010)
+        --plp_order        : Plp order. (int, default=12).
     :return:An object of class HParams, which is a set of hyperparameters as name-value pairs.
     """
 
@@ -55,9 +62,12 @@ def params(cls, config=None):
   def call(self, audio_data, sample_rate=None):
     """
     Caculate plp features of audio data.
-    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-    :return:A float tensor of size (num_frames, (plp_order + 1)) containing plp features of every frame in speech.
+    :param audio_data: the audio signal from which to compute spectrum.
+                       Should be an (1, N) tensor.
+    :param sample_rate: [option]the samplerate of the signal we working
+                        with, default is 16kHz.
+    :return:A float tensor of size (num_frames, (plp_order + 1)) containing plp
+            features of every frame in speech.
     """
 
     p = self.config
diff --git a/delta/data/frontend/plp_test.py b/delta/data/frontend/plp_test.py
index eecc343f..6dc54269 100644
--- a/delta/data/frontend/plp_test.py
+++ b/delta/data/frontend/plp_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests PLP FE."""
 
 import delta.compat as tf
 import os
@@ -24,7 +25,9 @@
 
 
 class PlpTest(tf.test.TestCase):
-
+  """
+  Plp extraction test.
+  """
   def test_plp(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/read_wav.py b/delta/data/frontend/read_wav.py
index 38585d9d..17ff3b3f 100644
--- a/delta/data/frontend/read_wav.py
+++ b/delta/data/frontend/read_wav.py
@@ -13,32 +13,40 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model reads audio sample from wav file."""
 
 import delta.compat as tf
-
 from delta.utils.hparam import HParams
 from delta.data.frontend.base_frontend import BaseFrontend
-
+from core.ops import py_x_ops
 
 class ReadWav(BaseFrontend):
-
+  """
+      Read audio sample from wav file, return sample data and sample rate.
+      """
   def __init__(self, config: dict):
     super().__init__(config)
 
   @classmethod
   def params(cls, config=None):
     """
-      Set params.
-       :param config: contains two optional parameters: audio_channels(int, default=1),
-              sample_rate(int, default=16000).
-       :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
-       """
+    Set params.
+    :param config: contains three optional parameters:
+          --sample_rate       : Waveform data sample frequency (must match the waveform
+                                file, if specified there). (float, default = 16000)
+          --speed             : Speed of sample channels wanted. (float, default=1.0)
+          --audio_channels    :(int, default=1).
+    :return: An object of class HParams, which is a set of hyperparameters as
+            name-value pairs.
+    """
     audio_channels = 1
     sample_rate = 16000
+    speed = 1.0
 
     hparams = HParams(cls=cls)
     hparams.add_hparam('audio_channels', audio_channels)
     hparams.add_hparam('sample_rate', sample_rate)
+    hparams.add_hparam('speed', speed)
 
     if config is not None:
       hparams.override_from_dict(config)
@@ -48,8 +56,9 @@ def params(cls, config=None):
   def call(self, wavfile):
     """
     Get audio data and sample rate from a wavfile.
-    :param wavfile: filepath of wav
-    :return: 2 values. The first is a Tensor of audio data. The second return value is the sample rate of the input wav
+    :param wavfile: filepath of wav.
+    :return: 2 values. The first is a Tensor of audio data.
+        The second return value is the sample rate of the input wav
         file, which is a tensor with float dtype.
     """
     p = self.config
@@ -58,7 +67,16 @@ def call(self, wavfile):
         contents, desired_channels=p.audio_channels)
     assert_op = tf.assert_equal(
         tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
+
     with tf.control_dependencies([assert_op]):
-      return tf.squeeze(
-          audio_data * 32768, axis=-1), tf.cast(
-              sample_rate, dtype=tf.int32)
+
+      if p.speed == 1.0:
+        return tf.squeeze(audio_data * 32768, axis=-1), tf.cast(sample_rate, dtype=tf.int32)
+      else:
+        resample_rate = tf.cast(sample_rate, dtype=tf.float32) * tf.cast(
+          1.0 / p.speed, dtype=tf.float32)
+        speed_data = py_x_ops.speed(tf.squeeze(audio_data * 32768, axis=-1),
+                                    tf.cast(sample_rate, dtype=tf.int32),
+                                    tf.cast(resample_rate, dtype=tf.int32),
+                                    lowpass_filter_width=5)
+        return tf.squeeze(speed_data), tf.cast(sample_rate, dtype=tf.int32)
diff --git a/delta/data/frontend/read_wav_test.py b/delta/data/frontend/read_wav_test.py
index d1a2eeb6..2c3f099a 100644
--- a/delta/data/frontend/read_wav_test.py
+++ b/delta/data/frontend/read_wav_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests OP of read_wav """
 
 import delta.compat as tf
 from pathlib import Path
@@ -22,17 +23,20 @@
 
 
 class ReadWavTest(tf.test.TestCase):
-
+  """
+  ReadWav OP test.
+  """
   def test_read_wav(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
     with self.cached_session(use_gpu=False, force_gpu=False):
-      read_wav = ReadWav.params({'sample_rate': 16000}).instantiate()
+      config = {'speed': 1.0}
+      read_wav = ReadWav.params(config).instantiate()
       audio_data, sample_rate = read_wav(wav_path)
       audio_data_true, sample_rate_true = librosa.load(wav_path, sr=16000)
-      self.assertAllClose(audio_data.eval() / 32768, audio_data_true)
-      self.assertAllClose(sample_rate.eval(), sample_rate_true)
-
+      if (config['speed'] == 1.0):
+        self.assertAllClose(audio_data.eval() / 32768, audio_data_true)
+        self.assertAllClose(sample_rate.eval(), sample_rate_true)
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/delta/data/frontend/spectrum.py b/delta/data/frontend/spectrum.py
index 4bd82a1e..f8a350ec 100644
--- a/delta/data/frontend/spectrum.py
+++ b/delta/data/frontend/spectrum.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This model extracts spetrum features per frame."""
 
 import tensorflow as tf
 from core.ops import py_x_ops
@@ -21,7 +22,10 @@
 
 
 class Spectrum(BaseFrontend):
-
+  """
+  Compute spectrum features of every frame in speech, return a float tensor
+  with size (num_frames, num_frequencies).
+  """
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -29,17 +33,30 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config: contains ten optional parameters.
-          --sample_rate			: Sample frequency of waveform data. (int, default = 16000)
+    :param config: contains eleven optional parameters:
+          --sample_rate     : Waveform data sample frequency (must match the waveform
+                              file, if specified there). (float, default = 16000)
           --window_length		: Window length in seconds. (float, default = 0.025)
-          --frame_length			: Hop length in seconds. (float, default = 0.010)
-          --snip_edges			: If True, the last frame (shorter than window_length) will be cutoff. If False, 1 // 2 frame_length data will be padded to data. (int, default = True)
-          ---raw_energy			: If 1, compute frame energy before preemphasis and windowing. If 2,  compute frame energy after preemphasis and windowing. (int, default = 1)
-          --preeph_coeff			: Coefficient for use in frame-signal preemphasis. (float, default = 0.97)
-          --window_type			: Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria"). (string, default = "povey")
-          --remove_dc_offset		: Subtract mean from waveform on each frame (bool, default = true)
-          --is_fbank				: If true, compute power spetrum without frame energy. If false, using the frame energy instead of the square of the constant component of the signal. (bool, default = false)
-          --output_type			: If 1, return power spectrum. If 2, return log-power spectrum. (int, default = 2)
+          --frame_length		: Hop length in seconds. (float, default = 0.010)
+          --snip_edges			: If 1, the last frame (shorter than window_length)
+                                  will be cutoff. If 2, 1 // 2 frame_length data will
+                                  be padded to data. (int, default = 1)
+          ---raw_energy			: If 1, compute frame energy before preemphasis and windowing.
+                                  If 2,  compute frame energy after preemphasis and windowing.
+                                  (int, default = 1)
+          --preeph_coeff		: Coefficient for use in frame-signal preemphasis.
+                                 (float, default = 0.97)
+          --window_type			: Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").
+                                  (string, default = "povey")
+          --remove_dc_offset	: Subtract mean from waveform on each frame.
+                                 (bool, default = true)
+          --is_fbank			: If true, compute power spectrum without frame energy.
+                                  If false, using the frame energy instead of the square of the
+                                  constant component of the signal. (bool, default = false)
+          --output_type			: If 1, return power spectrum. If 2, return log-power spectrum.
+                                  (int, default = 2)
+          --dither		        : Dithering constant (0.0 means no dither).
+                                 (float, default = 0.0) [adds robustness to training]
     :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
     """
 
@@ -47,12 +64,13 @@ def params(cls, config=None):
     frame_length = 0.010
     output_type = 2
     sample_rate = 16000
-    snip_edges = True
+    snip_edges = 1
     raw_energy = 1
     preeph_coeff = 0.97
     window_type = 'povey'
     remove_dc_offset = True
     is_fbank = False
+    dither = 0.0
 
     hparams = HParams(cls=cls)
     hparams.add_hparam('window_length', window_length)
@@ -65,6 +83,7 @@ def params(cls, config=None):
     hparams.add_hparam('window_type', window_type)
     hparams.add_hparam('remove_dc_offset', remove_dc_offset)
     hparams.add_hparam('is_fbank', is_fbank)
+    hparams.add_hparam('dither', dither)
 
     if config is not None:
       hparams.override_from_dict(config)
@@ -74,10 +93,12 @@ def params(cls, config=None):
   def call(self, audio_data, sample_rate=None):
     """
     Caculate power spectrum or log power spectrum of audio data.
-    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
+    :param audio_data: the audio signal from which to compute spectrum.
+                       Should be an (1, N) tensor.
     :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-    :return: A float tensor of size (num_frames, num_frequencies) containing power spectrum (output_type=1)
-        or log power spectrum (output_type=2) of every frame in speech.
+    :return: A float tensor of size (num_frames, num_frequencies) containing power
+            spectrum (output_type=1) or log power spectrum (output_type=2)
+            of every frame in speech.
     """
 
     p = self.config
@@ -102,6 +123,7 @@ def call(self, audio_data, sample_rate=None):
             preEph_coeff=p.preeph_coeff,
             window_type=p.window_type,
             remove_dc_offset=p.remove_dc_offset,
-            is_fbank=p.is_fbank)
+            is_fbank=p.is_fbank,
+            dither=p.dither)
 
         return spectrum
diff --git a/delta/data/frontend/spectrum_test.py b/delta/data/frontend/spectrum_test.py
index 0d095c71..dd7bc11f 100644
--- a/delta/data/frontend/spectrum_test.py
+++ b/delta/data/frontend/spectrum_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests spectrum FE."""
 
 import os
 import numpy as np
@@ -24,7 +25,9 @@
 
 
 class SpectrumTest(tf.test.TestCase):
-
+  '''
+  Spectrum extraction test.
+  '''
   def test_spectrum(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
@@ -34,7 +37,8 @@ def test_spectrum(self):
 
       spectrum = Spectrum.params({
           'window_length': 0.025,
-          'snip_edges': True
+          'snip_edges': 1,
+          'dither':0.0
       }).instantiate()
       spectrum_test = spectrum(input_data, sample_rate)
 
diff --git a/delta/data/frontend/synthfiltbank.py b/delta/data/frontend/synthfiltbank.py
index 9c2d7c48..4ab34ba1 100644
--- a/delta/data/frontend/synthfiltbank.py
+++ b/delta/data/frontend/synthfiltbank.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 # ==============================================================================
 
-import delta.compat as tf
 
+import delta.compat as tf
 from core.ops import py_x_ops
 from delta.utils.hparam import HParams
 from delta.data.frontend.base_frontend import BaseFrontend
diff --git a/delta/data/frontend/synthfiltbank_test.py b/delta/data/frontend/synthfiltbank_test.py
index 2742ba0f..2208d494 100644
--- a/delta/data/frontend/synthfiltbank_test.py
+++ b/delta/data/frontend/synthfiltbank_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests Synthfiltbank FE."""
 
 import os
 from pathlib import Path
@@ -25,7 +26,9 @@
 
 
 class Test(tf.test.TestCase):
-
+  """
+  Synthfiltbank extraction test.
+  """
   def test_synthfiltbank(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/write_wav.py b/delta/data/frontend/write_wav.py
index 2b74ba4d..aec5d4ce 100644
--- a/delta/data/frontend/write_wav.py
+++ b/delta/data/frontend/write_wav.py
@@ -60,7 +60,7 @@ def call(self, filename, audio_data, sample_rate=None):
     assert_op = tf.assert_equal(
         tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
     with tf.control_dependencies([assert_op]):
-      audio_data = tf.cast(audio_data, dtype=tf.float32)
+      audio_data = tf.cast(audio_data / 32768, dtype=tf.float32)
       contents = tf.audio.encode_wav(
           tf.expand_dims(audio_data, 1), tf.cast(sample_rate, dtype=tf.int32))
       w = tf.io.write_file(filename, contents)
diff --git a/delta/data/frontend/write_wav_test.py b/delta/data/frontend/write_wav_test.py
index 39518cc7..470389f4 100644
--- a/delta/data/frontend/write_wav_test.py
+++ b/delta/data/frontend/write_wav_test.py
@@ -28,15 +28,14 @@ def test_write_wav(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
     with self.cached_session(use_gpu=False, force_gpu=False) as sess:
-      read_wav = ReadWav.params().instantiate()
+      read_wav = ReadWav.params({'speed': 1.1}).instantiate()
       input_data, sample_rate = read_wav(wav_path)
       input_data = input_data / 32768
       write_wav = WriteWav.params().instantiate()
-      new_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln_new.wav'))
+      new_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln_speed.wav'))
       writewav_op = write_wav(new_path, input_data, sample_rate)
       sess.run(writewav_op)
       test_data, test_sample_rate = read_wav(new_path)
-      test_data = test_data / 32768
       self.assertAllEqual(input_data.eval(), test_data.eval())
       self.assertAllEqual(sample_rate.eval(), test_sample_rate.eval())
 
diff --git a/delta/data/frontend/zcr.py b/delta/data/frontend/zcr.py
index bda72423..5ffe76f3 100644
--- a/delta/data/frontend/zcr.py
+++ b/delta/data/frontend/zcr.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""This model extracts zcr features per frame."""
 
 import delta.compat as tf
 
@@ -22,6 +23,10 @@
 
 
 class Zcr(BaseFrontend):
+  """
+  Compute ZCR features respectively, and concatenate them. Return
+  a tensor with shape (1, num_frames).
+  """
 
   def __init__(self, config: dict):
     super().__init__(config)
@@ -30,9 +35,13 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
     Set params.
-    :param config:contains three optional parameters: window_length(float, default=0.025s),
-        frame_length(float, default=0.010s), and sample_rate(int, default=16000).
-    :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
+    :param config:contains three optional parameters:
+        --sample_rate       : Waveform data sample frequency (must match the waveform
+                             file, if specified there). (float, default = 16000)
+        --window_length		 : Window length in seconds. (float, default = 0.025)
+        --frame_length		 : Hop length in seconds. (float, default = 0.010)
+    :return: An object of class HParams, which is a set of hyperparameters as
+            name-value pairs.
     """
 
     window_length = 0.025
@@ -52,9 +61,12 @@ def params(cls, config=None):
   def call(self, audio_data, sample_rate=None):
     """
     Calculate the zero-crossing rate of speech.
-    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
-    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
-    :return: A tensor with shape (1, num_frames), containing zero-crossing rate of every frame in speech.
+    :param audio_data: the audio signal from which to compute spectrum.
+                      Should be an (1, N) tensor.
+    :param sample_rate: [option]the samplerate of the signal we working with,
+                        default is 16kHz.
+    :return: A tensor with shape (1, num_frames), containing zero-crossing rate of
+            every frame in speech.
     """
 
     p = self.config
diff --git a/delta/data/frontend/zcr_test.py b/delta/data/frontend/zcr_test.py
index f7d9808a..f670ceee 100644
--- a/delta/data/frontend/zcr_test.py
+++ b/delta/data/frontend/zcr_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""The model tests ZCR FE."""
 
 import os
 from pathlib import Path
@@ -25,7 +26,9 @@
 
 
 class ZcrTest(tf.test.TestCase):
-
+  """
+  ZCR extraction test.
+  """
   def test_zcr(self):
 
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
diff --git a/utils/speech/compute_fbank_feats.py b/utils/speech/compute_fbank_feats.py
index cff4d5db..b1f715c1 100755
--- a/utils/speech/compute_fbank_feats.py
+++ b/utils/speech/compute_fbank_feats.py
@@ -64,9 +64,14 @@ def get_parser():
       help='Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").')
   parser.add_argument(
       '--snip_edges',
-      type=bool,
-      default=True,
+      type=int,
+      default=1,
       help='The last frame (shorter than window_length) will not be cutoff.')
+  parser.add_argument(
+    '--dither',
+    type=float,
+    default=0.0,
+    help='Dithering constant (0.0 means no dither).')
   parser.add_argument(
       '--raw_energy',
       type=int,
@@ -131,6 +136,7 @@ def compute_fbank():
   config['preeph_coeff'] = args.preeph_coeff
   config['remove_dc_offset'] = args.remove_dc_offset
   config['is_fbank'] = args.is_fbank
+  config['dither'] = args.dither
 
   fbank = Fbank.params(config).instantiate()
 
diff --git a/utils/speech/compute_fbank_pitch.py b/utils/speech/compute_fbank_pitch.py
index 43f908b3..6d73f68d 100755
--- a/utils/speech/compute_fbank_pitch.py
+++ b/utils/speech/compute_fbank_pitch.py
@@ -47,6 +47,11 @@ def get_parser():
       type=float,
       default=40,
       help='Order of fbank')
+  parser.add_argument(
+    '--dither',
+    type=float,
+    default=0.0,
+    help='Dithering constant (0.0 means no dither).')
   parser.add_argument(
       '--window_length', type=float, default=0.025, help='Length of a frame')
   parser.add_argument(
@@ -133,6 +138,7 @@ def compute_fbank_pitch():
   config['remove_dc_offset'] = args.remove_dc_offset
   config['is_fbank'] = args.is_fbank
   config['thres_autoc'] = args.thres_autoc
+  config['dither'] = args.dither
 
   fbank_pitch = FbankPitch.params(config).instantiate()
 
diff --git a/utils/speech/compute_mfcc_feats.py b/utils/speech/compute_mfcc_feats.py
index 101cf2eb..60f87de0 100755
--- a/utils/speech/compute_mfcc_feats.py
+++ b/utils/speech/compute_mfcc_feats.py
@@ -87,6 +87,11 @@ def get_parser():
       type=bool,
       default=True,
       help='Compute power spetrum without frame energy.')
+  parser.add_argument(
+    '--dither',
+    type=float,
+    default=0.0,
+    help='Dithering constant (0.0 means no dither).')
   parser.add_argument(
       '--cepstral_lifter',
       type=float,
@@ -149,6 +154,7 @@ def compute_mfcc():
   config['cepstral_lifter'] = args.cepstral_lifter
   config['coefficient_count'] = args.coefficient_count
   config['use_energy'] = args.use_energy
+  config['dither'] = args.dither
 
   mfcc = Mfcc.params(config).instantiate()
 
diff --git a/utils/speech/compute_spectrum_feats.py b/utils/speech/compute_spectrum_feats.py
index 800229e1..d873d91f 100755
--- a/utils/speech/compute_spectrum_feats.py
+++ b/utils/speech/compute_spectrum_feats.py
@@ -72,6 +72,11 @@ def get_parser():
       type=bool,
       default=False,
       help='Compute power spetrum without frame energy')
+  parser.add_argument(
+    '--dither',
+    type=float,
+    default=0.0,
+    help='Dithering constant (0.0 means no dither).')
   parser.add_argument(
       '--write_num_frames',
       type=str,
@@ -114,6 +119,7 @@ def compute_spectrum():
   config['preeph_coeff'] = args.preeph_coeff
   config['remove_dc_offset'] = args.remove_dc_offset
   config['is_fbank'] = args.is_fbank
+  config['dither'] = args.dither
 
   spectrum = Spectrum.params(config).instantiate()
 
diff --git a/utils/speech/make_fbank.sh b/utils/speech/make_fbank.sh
index 34808dee..91c43ff2 100755
--- a/utils/speech/make_fbank.sh
+++ b/utils/speech/make_fbank.sh
@@ -26,8 +26,9 @@ filterbank_channel_count=23
 window_length=0.025
 frame_length=0.010
 output_type=1
-snip_edges=true
+snip_edges=1
 raw_energy=1
+dither=0.0
 preeph_coeff=0.97
 window_type='povey'
 remove_dc_offset=true
@@ -123,6 +124,7 @@ if [ -f ${data}/segments ]; then
             --window_type ${window_type} \
             --remove_dc_offset ${remove_dc_offset} \
             --is_fbank ${is_fbank} \
+            --dither ${dither} \
             ${write_num_frames_opt} \
             --compress ${compress} \
             --compression_method ${compression_method} \
@@ -153,6 +155,7 @@ else
             --window_type ${window_type} \
             --remove_dc_offset ${remove_dc_offset} \
             --is_fbank ${is_fbank} \
+            --dither ${dither} \
             ${write_num_frames_opt} \
             --compress ${compress} \
             --compression_method ${compression_method} \
diff --git a/utils/speech/make_fbank_pitch.sh b/utils/speech/make_fbank_pitch.sh
index a3522f12..2570ae75 100755
--- a/utils/speech/make_fbank_pitch.sh
+++ b/utils/speech/make_fbank_pitch.sh
@@ -32,6 +32,7 @@ preeph_coeff=0.97
 window_type='povey'
 remove_dc_offset=true
 is_fbank=true
+dither=0.0
 thres_autoc=0.3
 write_utt2num_frames=true
 compress=false
@@ -125,6 +126,7 @@ if [ -f ${data}/segments ]; then
             --window_type ${window_type} \
             --remove_dc_offset ${remove_dc_offset} \
             --is_fbank ${is_fbank} \
+            --dither ${dither} \
             ${write_num_frames_opt} \
             --compress ${compress} \
             --compression_method ${compression_method} \
@@ -156,6 +158,7 @@ else
             --window_type ${window_type} \
             --remove_dc_offset ${remove_dc_offset} \
             --is_fbank ${is_fbank} \
+            --dither ${dither} \
             ${write_num_frames_opt} \
             --compress ${compress} \
             --compression_method ${compression_method} \
diff --git a/utils/speech/make_mfcc.sh b/utils/speech/make_mfcc.sh
index 9f0e69bf..e6813fc2 100755
--- a/utils/speech/make_mfcc.sh
+++ b/utils/speech/make_mfcc.sh
@@ -38,6 +38,7 @@ write_utt2num_frames=true
 compress=false
 compression_method=2
 use_energy=true
+dither=0.0
 
 if [ -f path.sh ]; then . ./path.sh; fi
  . parse_options.sh || exit 1;
@@ -126,6 +127,7 @@ if [ -f ${data}/segments ]; then
             --window_type ${window_type} \
             --remove_dc_offset ${remove_dc_offset} \
             --is_fbank ${is_fbank} \
+            --dither ${dither} \
             --cepstral_lifter ${cepstral_lifter} \
             --coefficient_count ${coefficient_count} \
             --use_energy ${use_energy} \
@@ -159,6 +161,7 @@ else
             --window_type ${window_type} \
             --remove_dc_offset ${remove_dc_offset} \
             --is_fbank ${is_fbank} \
+            --dither ${dither} \
             --cepstral_lifter ${cepstral_lifter} \
             --coefficient_count ${coefficient_count} \
             --use_energy ${use_energy} \
diff --git a/utils/speech/make_spectrum.sh b/utils/speech/make_spectrum.sh
index dce6fb5c..7d6de82d 100755
--- a/utils/speech/make_spectrum.sh
+++ b/utils/speech/make_spectrum.sh
@@ -29,6 +29,7 @@ preeph_coeff=0.97
 window_type='povey'
 remove_dc_offset=true
 is_fbank=false
+dither=0.0
 output_type=2
 write_utt2num_frames=true
 compress=false
@@ -119,6 +120,7 @@ if [ -f ${data}/segments ]; then
             --window_type ${window_type} \
             --remove_dc_offset ${remove_dc_offset} \
             --is_fbank ${is_fbank} \
+            --dither ${dither} \
             ${write_num_frames_opt} \
             --compress ${compress} \
             --compression_method ${compression_method} \
@@ -147,6 +149,7 @@ else
             --window_type ${window_type} \
             --remove_dc_offset ${remove_dc_offset} \
             --is_fbank ${is_fbank} \
+            --dither ${dither} \
             ${write_num_frames_opt} \
             --compress ${compress} \
             --compression_method ${compression_method} \

From 715f57591c88c58b117da95054ab2f0d6ec9b6bc Mon Sep 17 00:00:00 2001
From: dengchengyun <dengchengyun@didiglobal.com>
Date: Fri, 20 Dec 2019 10:55:51 +0800
Subject: [PATCH 2/6] fix snip_edges setting

---
 core/ops/kernels/framepow.cc            | 6 +++---
 core/ops/kernels/framepow.h             | 4 ++--
 core/ops/kernels/framepow_op.cc         | 4 ++--
 core/ops/kernels/spectrum.cc            | 6 +++---
 core/ops/kernels/spectrum.h             | 4 ++--
 core/ops/kernels/spectrum_op.cc         | 6 +++---
 core/ops/kernels/x_ops.cc               | 4 ++--
 delta/data/frontend/fbank.py            | 8 ++++----
 delta/data/frontend/fbank_pitch.py      | 8 ++++----
 delta/data/frontend/fbank_pitch_test.py | 2 +-
 delta/data/frontend/fbank_test.py       | 2 +-
 delta/data/frontend/framepow.py         | 2 +-
 delta/data/frontend/mfcc.py             | 8 ++++----
 delta/data/frontend/spectrum.py         | 8 ++++----
 delta/data/frontend/spectrum_test.py    | 2 +-
 utils/speech/compute_fbank_feats.py     | 5 +++--
 utils/speech/compute_fbank_pitch.py     | 5 +++--
 utils/speech/compute_mfcc_feats.py      | 1 +
 utils/speech/compute_pitch_feats.py     | 1 +
 utils/speech/compute_spectrum_feats.py  | 5 +++--
 utils/speech/make_fbank.sh              | 2 +-
 utils/speech/make_fbank_pitch.sh        | 2 +-
 utils/speech/make_spectrum.sh           | 2 +-
 23 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/core/ops/kernels/framepow.cc b/core/ops/kernels/framepow.cc
index 4fc13f2f..a682fdc9 100644
--- a/core/ops/kernels/framepow.cc
+++ b/core/ops/kernels/framepow.cc
@@ -27,7 +27,7 @@ const float frame_length_sec = 0.010;
 FramePow::FramePow() {
   window_length_sec_ = window_length_sec;
   frame_length_sec_ = frame_length_sec;
-  i_snip_edges = 1;
+  i_snip_edges = true;
   i_remove_dc_offset = true;
   pf_FrmEng = NULL;
 }
@@ -42,7 +42,7 @@ void FramePow::set_frame_length_sec(float frame_length_sec) {
   frame_length_sec_ = frame_length_sec;
 }
 
-void FramePow::set_snip_edges(int snip_edges) { i_snip_edges = snip_edges; }
+void FramePow::set_snip_edges(bool snip_edges) { i_snip_edges = snip_edges; }
 
 void FramePow::set_remove_dc_offset(bool remove_dc_offset) {
   i_remove_dc_offset = remove_dc_offset;
@@ -52,7 +52,7 @@ int FramePow::init_eng(int input_size, float sample_rate) {
   f_SamRat = sample_rate;
   i_WinLen = static_cast<int>(window_length_sec_ * f_SamRat);
   i_FrmLen = static_cast<int>(frame_length_sec_ * f_SamRat);
-  if (i_snip_edges == 1)
+  if (i_snip_edges == true)
     i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1;
   else
     i_NumFrm = (input_size + i_FrmLen / 2) / i_FrmLen;
diff --git a/core/ops/kernels/framepow.h b/core/ops/kernels/framepow.h
index c756da78..47f1c8ad 100644
--- a/core/ops/kernels/framepow.h
+++ b/core/ops/kernels/framepow.h
@@ -27,7 +27,7 @@ class FramePow {
  private:
   float window_length_sec_;
   float frame_length_sec_;
-  int i_snip_edges;
+  bool i_snip_edges;
   bool i_remove_dc_offset;
 
   float f_SamRat;
@@ -46,7 +46,7 @@ class FramePow {
 
   void set_frame_length_sec(float frame_length_sec);
 
-  void set_snip_edges(int snip_edges);
+  void set_snip_edges(bool snip_edges);
 
   void set_remove_dc_offset(bool remove_dc_offset);
 
diff --git a/core/ops/kernels/framepow_op.cc b/core/ops/kernels/framepow_op.cc
index 55897d5a..0d7e3dd3 100644
--- a/core/ops/kernels/framepow_op.cc
+++ b/core/ops/kernels/framepow_op.cc
@@ -63,7 +63,7 @@ class FramePowOp : public OpKernel {
     int i_WinLen = static_cast<int>(window_length_ * sample_rate);
     int i_FrmLen = static_cast<int>(frame_length_ * sample_rate);
     int i_NumFrm = (L - i_WinLen) / i_FrmLen + 1;
-    if (snip_edges_ == 2) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen;
+    if (snip_edges_ == false) i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen;
     if (i_NumFrm < 1) i_NumFrm = 1;
     OP_REQUIRES_OK(context, context->allocate_output(
                                 0, TensorShape({1, i_NumFrm}), &output_tensor));
@@ -79,7 +79,7 @@ class FramePowOp : public OpKernel {
  private:
   float window_length_;
   float frame_length_;
-  int snip_edges_;
+  bool snip_edges_;
   bool remove_dc_offset_;
 };
 
diff --git a/core/ops/kernels/spectrum.cc b/core/ops/kernels/spectrum.cc
index eec21b11..1e8e513c 100644
--- a/core/ops/kernels/spectrum.cc
+++ b/core/ops/kernels/spectrum.cc
@@ -30,7 +30,7 @@ Spectrum::Spectrum() {
   window_length_sec_ = window_length_sec;
   frame_length_sec_ = frame_length_sec;
   i_OutTyp = 1;
-  i_snip_edges = 1;
+  i_snip_edges = true;
   i_raw_energy = 1;
   f_PreEph = 0.97;
   i_is_fbank = true;
@@ -57,7 +57,7 @@ void Spectrum::set_frame_length_sec(float frame_length_sec) {
 
 void Spectrum::set_output_type(int output_type) { i_OutTyp = output_type; }
 
-void Spectrum::set_snip_edges(int snip_edges) { i_snip_edges = snip_edges; }
+void Spectrum::set_snip_edges(bool snip_edges) { i_snip_edges = snip_edges; }
 
 void Spectrum::set_raw_energy(int raw_energy) {i_raw_energy = raw_energy;}
 
@@ -77,7 +77,7 @@ int Spectrum::init_spc(int input_size, float sample_rate) {
   f_SamRat = sample_rate;
   i_WinLen = static_cast<int>(window_length_sec_ * f_SamRat);
   i_FrmLen = static_cast<int>(frame_length_sec_ * f_SamRat);
-  if (i_snip_edges == 1)
+  if (i_snip_edges == true)
     i_NumFrm = (input_size - i_WinLen) / i_FrmLen + 1;
   else
     i_NumFrm = (input_size + i_FrmLen / 2) / i_FrmLen;
diff --git a/core/ops/kernels/spectrum.h b/core/ops/kernels/spectrum.h
index 517890ce..e9b93e16 100644
--- a/core/ops/kernels/spectrum.h
+++ b/core/ops/kernels/spectrum.h
@@ -41,7 +41,7 @@ class Spectrum {
   float f_PreEph;
   char s_WinTyp[40];
   int i_OutTyp;  // 1: PSD, 2:log(PSD)
-  int i_snip_edges;
+  bool i_snip_edges;
   int i_raw_energy;
   bool i_remove_dc_offset;
   bool i_is_fbank;
@@ -66,7 +66,7 @@ class Spectrum {
 
   void set_output_type(int output_type);
 
-  void set_snip_edges(int snip_edges);
+  void set_snip_edges(bool snip_edges);
 
   void set_raw_energy(int raw_energy);
 
diff --git a/core/ops/kernels/spectrum_op.cc b/core/ops/kernels/spectrum_op.cc
index d6afecea..7e88275e 100644
--- a/core/ops/kernels/spectrum_op.cc
+++ b/core/ops/kernels/spectrum_op.cc
@@ -77,8 +77,8 @@ class SpecOp : public OpKernel {
     int i_WinLen = static_cast<int>(window_length_ * sample_rate);
     int i_FrmLen = static_cast<int>(frame_length_ * sample_rate);
     int i_NumFrm = (L - i_WinLen) / i_FrmLen + 1;
-    int i_snip_edges = snip_edges_;
-    if (i_snip_edges == 2)
+    bool i_snip_edges = snip_edges_;
+    if (i_snip_edges == false)
         i_NumFrm = (L + i_FrmLen / 2) / i_FrmLen;
     if (i_NumFrm < 1)
         i_NumFrm = 1;
@@ -99,7 +99,7 @@ class SpecOp : public OpKernel {
   float window_length_;
   float frame_length_;
   int output_type_;
-  int snip_edges_;
+  bool snip_edges_;
   int raw_energy_;
   float preEph_coeff_;
   string window_type_;
diff --git a/core/ops/kernels/x_ops.cc b/core/ops/kernels/x_ops.cc
index 601124cd..a36e5b5b 100644
--- a/core/ops/kernels/x_ops.cc
+++ b/core/ops/kernels/x_ops.cc
@@ -365,7 +365,7 @@ REGISTER_OP("Pitch")
 REGISTER_OP("FramePow")
     .Input("input_data: float")
     .Input("sample_rate: float")
-    .Attr("snip_edges: int = 1")
+    .Attr("snip_edges: bool = true")
     .Attr("remove_dc_offset: bool = true")
     .Attr("window_length: float = 0.025")
     .Attr("frame_length: float = 0.010")
@@ -423,7 +423,7 @@ REGISTER_OP("Spectrum")
     .Attr("frame_length: float = 0.010")
     .Attr("window_type: string")
     .Attr("output_type: int = 2")
-    .Attr("snip_edges: int = 1")
+    .Attr("snip_edges: bool = true")
     .Attr("raw_energy: int = 1")
     .Attr("preEph_coeff: float = 0.97")
     .Attr("remove_dc_offset: bool = true")
diff --git a/delta/data/frontend/fbank.py b/delta/data/frontend/fbank.py
index 3bd55ab7..fb288ab7 100644
--- a/delta/data/frontend/fbank.py
+++ b/delta/data/frontend/fbank.py
@@ -39,9 +39,9 @@ def params(cls, config=None):
     :param config: contains thirteen optional parameters:
            --window_length				: Window length in seconds. (float, default = 0.025)
            --frame_length				: Hop length in seconds. (float, default = 0.010)
-           --snip_edges				: If 1, the last frame (shorter than window_length) will be
-                                         cutoff. If 2, 1 // 2 frame_length data will be padded
-                                         to data. (int, default = 1)
+           --snip_edges				: If true, the last frame (shorter than window_length) will be
+                                         cutoff. If false, 1 // 2 frame_length data will be padded
+                                         to data. (bool, default = true)
            ---raw_energy				: If 1, compute frame energy before preemphasis and
                                          windowing. If 2,  compute frame energy after
                                          preemphasis and windowing. (int, default = 1)
@@ -74,7 +74,7 @@ def params(cls, config=None):
     frame_length = 0.010
     output_type = 1
     sample_rate = 16000
-    snip_edges = 1
+    snip_edges = True
     raw_energy = 1
     preeph_coeff = 0.97
     window_type = 'povey'
diff --git a/delta/data/frontend/fbank_pitch.py b/delta/data/frontend/fbank_pitch.py
index 84e2e240..1af93b84 100644
--- a/delta/data/frontend/fbank_pitch.py
+++ b/delta/data/frontend/fbank_pitch.py
@@ -41,9 +41,9 @@ def params(cls, config=None):
                                   (int, default = 16000)
           --window_length		    : Window length in seconds. (float, default = 0.025)
           --frame_length			  : Hop length in seconds. (float, default = 0.010)
-          --snip_edges				  : If 1, the last frame (shorter than window_length) will
-                                        be cutoff. If 2, 1 // 2 frame_length data will be padded
-                                         to data. (int, default = 1)
+          --snip_edges				  : If true, the last frame (shorter than window_length) will
+                                        be cutoff. If false, 1 // 2 frame_length data will be padded
+                                         to data. (bool, default = true)
           ---raw_energy				  : If 1, compute frame energy before preemphasis and
                                         windowing. If 2,  compute frame energy after preemphasis
                                          and windowing. (int, default = 1)
@@ -126,7 +126,7 @@ def params(cls, config=None):
     output_type = 1
     dither = 0.0
     sample_rate = 16000
-    snip_edges = 1
+    snip_edges = True
     preemph_coeff = 0.0
     min_f0 = 50.0
     max_f0 = 400.0
diff --git a/delta/data/frontend/fbank_pitch_test.py b/delta/data/frontend/fbank_pitch_test.py
index b778f7e6..f7829990 100644
--- a/delta/data/frontend/fbank_pitch_test.py
+++ b/delta/data/frontend/fbank_pitch_test.py
@@ -35,7 +35,7 @@ def test_FbankPitch(self):
       input_data, sample_rate = read_wav(wav_path)
       config = {'window_length': 0.025, 'output_type': 1, 'frame_length': 0.010}
       fbank_pitch = FbankPitch.params(config).instantiate()
-      fbank_pitch_test = fbank_pitch(input_data)
+      fbank_pitch_test = fbank_pitch(input_data, sample_rate)
 
       self.assertEqual(tf.rank(fbank_pitch_test).eval(), 2)
       print(fbank_pitch_test.eval()[0:2])
diff --git a/delta/data/frontend/fbank_test.py b/delta/data/frontend/fbank_test.py
index c26fe6bb..3228ad4f 100644
--- a/delta/data/frontend/fbank_test.py
+++ b/delta/data/frontend/fbank_test.py
@@ -39,7 +39,7 @@ def test_fbank(self):
           'window_length': 0.025,
           'output_type': 1,
           'frame_length': 0.010,
-          'snip_edges': 1
+          'snip_edges': True
       }
       fbank = Fbank.params(config).instantiate()
       fbank_test = fbank(input_data, sample_rate)
diff --git a/delta/data/frontend/framepow.py b/delta/data/frontend/framepow.py
index 5d54ca95..ca048179 100644
--- a/delta/data/frontend/framepow.py
+++ b/delta/data/frontend/framepow.py
@@ -48,7 +48,7 @@ def params(cls, config=None):
 
     window_length = 0.025
     frame_length = 0.010
-    snip_edges = 1
+    snip_edges = True
     remove_dc_offset = True
     sample_rate = 16000
 
diff --git a/delta/data/frontend/mfcc.py b/delta/data/frontend/mfcc.py
index 16086b0e..a43a57df 100644
--- a/delta/data/frontend/mfcc.py
+++ b/delta/data/frontend/mfcc.py
@@ -41,9 +41,9 @@ def params(cls, config=None):
     :param config: contains fourteen optional parameters.
         --window_length				: Window length in seconds. (float, default = 0.025)
         --frame_length				: Hop length in seconds. (float, default = 0.010)
-        --snip_edges				: If 1, the last frame (shorter than window_length) will
-                                      be cutoff. If 2, 1 // 2 frame_length data will be padded
-                                      to data. (int, default = 1)
+        --snip_edges				: If True, the last frame (shorter than window_length) will
+                              be cutoff. If False, 1 // 2 frame_length data will be padded
+                              to data. (bool, default = True)
         ---raw_energy				: If 1, compute frame energy before preemphasis and
                                       windowing. If 2, compute frame energy after
                                       preemphasis and windowing. (int, default = 1)
@@ -79,7 +79,7 @@ def params(cls, config=None):
     frame_length = 0.010
     output_type = 1
     sample_rate = 16000
-    snip_edges = 1
+    snip_edges = True
     raw_energy = 1
     preeph_coeff = 0.97
     window_type = 'povey'
diff --git a/delta/data/frontend/spectrum.py b/delta/data/frontend/spectrum.py
index f8a350ec..323f17b7 100644
--- a/delta/data/frontend/spectrum.py
+++ b/delta/data/frontend/spectrum.py
@@ -38,9 +38,9 @@ def params(cls, config=None):
                               file, if specified there). (float, default = 16000)
           --window_length		: Window length in seconds. (float, default = 0.025)
           --frame_length		: Hop length in seconds. (float, default = 0.010)
-          --snip_edges			: If 1, the last frame (shorter than window_length)
-                                  will be cutoff. If 2, 1 // 2 frame_length data will
-                                  be padded to data. (int, default = 1)
+          --snip_edges			: If True, the last frame (shorter than window_length)
+                                  will be cutoff. If False, 1 // 2 frame_length data will
+                                  be padded to data. (bool, default = True)
           ---raw_energy			: If 1, compute frame energy before preemphasis and windowing.
                                   If 2,  compute frame energy after preemphasis and windowing.
                                   (int, default = 1)
@@ -64,7 +64,7 @@ def params(cls, config=None):
     frame_length = 0.010
     output_type = 2
     sample_rate = 16000
-    snip_edges = 1
+    snip_edges = True
     raw_energy = 1
     preeph_coeff = 0.97
     window_type = 'povey'
diff --git a/delta/data/frontend/spectrum_test.py b/delta/data/frontend/spectrum_test.py
index dd7bc11f..4574d1a8 100644
--- a/delta/data/frontend/spectrum_test.py
+++ b/delta/data/frontend/spectrum_test.py
@@ -37,7 +37,7 @@ def test_spectrum(self):
 
       spectrum = Spectrum.params({
           'window_length': 0.025,
-          'snip_edges': 1,
+          'snip_edges': True,
           'dither':0.0
       }).instantiate()
       spectrum_test = spectrum(input_data, sample_rate)
diff --git a/utils/speech/compute_fbank_feats.py b/utils/speech/compute_fbank_feats.py
index b1f715c1..11cbdf07 100755
--- a/utils/speech/compute_fbank_feats.py
+++ b/utils/speech/compute_fbank_feats.py
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Create Fbank feature files."""
 
 import delta.compat as tf
 import argparse
@@ -64,8 +65,8 @@ def get_parser():
       help='Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").')
   parser.add_argument(
       '--snip_edges',
-      type=int,
-      default=1,
+      type=bool,
+      default=True,
       help='The last frame (shorter than window_length) will not be cutoff.')
   parser.add_argument(
     '--dither',
diff --git a/utils/speech/compute_fbank_pitch.py b/utils/speech/compute_fbank_pitch.py
index 6d73f68d..d7b0d0aa 100755
--- a/utils/speech/compute_fbank_pitch.py
+++ b/utils/speech/compute_fbank_pitch.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Create fbank_pitch feature files."""
 
 import delta.compat as tf
 import argparse
@@ -68,8 +69,8 @@ def get_parser():
       help='Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").')
   parser.add_argument(
       '--snip_edges',
-      type=int,
-      default=1,
+      type=bool,
+      default=True,
       help='The last frame (shorter than window_length) will not be cutoff.')
   parser.add_argument(
       '--raw_energy',
diff --git a/utils/speech/compute_mfcc_feats.py b/utils/speech/compute_mfcc_feats.py
index 60f87de0..7d525581 100755
--- a/utils/speech/compute_mfcc_feats.py
+++ b/utils/speech/compute_mfcc_feats.py
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Create MFCC feature files."""
 
 import delta.compat as tf
 import argparse
diff --git a/utils/speech/compute_pitch_feats.py b/utils/speech/compute_pitch_feats.py
index 13b266b4..69e535a0 100755
--- a/utils/speech/compute_pitch_feats.py
+++ b/utils/speech/compute_pitch_feats.py
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Create Pitch feature files."""
 
 import delta.compat as tf
 import argparse
diff --git a/utils/speech/compute_spectrum_feats.py b/utils/speech/compute_spectrum_feats.py
index d873d91f..e6db813b 100755
--- a/utils/speech/compute_spectrum_feats.py
+++ b/utils/speech/compute_spectrum_feats.py
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Create spectrogram feature files."""
 
 import delta.compat as tf
 import argparse
@@ -49,8 +50,8 @@ def get_parser():
       help='Type of window ("hamm"|"hann"|"povey"|"rect"|"blac"|"tria").')
   parser.add_argument(
       '--snip_edges',
-      type=int,
-      default=1,
+      type=bool,
+      default=True,
       help='The last frame (shorter than window_length) will not be cutoff.')
   parser.add_argument(
       '--raw_energy',
diff --git a/utils/speech/make_fbank.sh b/utils/speech/make_fbank.sh
index 91c43ff2..5b8bcc2e 100755
--- a/utils/speech/make_fbank.sh
+++ b/utils/speech/make_fbank.sh
@@ -26,7 +26,7 @@ filterbank_channel_count=23
 window_length=0.025
 frame_length=0.010
 output_type=1
-snip_edges=1
+snip_edges=true
 raw_energy=1
 dither=0.0
 preeph_coeff=0.97
diff --git a/utils/speech/make_fbank_pitch.sh b/utils/speech/make_fbank_pitch.sh
index 2570ae75..756a5c2a 100755
--- a/utils/speech/make_fbank_pitch.sh
+++ b/utils/speech/make_fbank_pitch.sh
@@ -26,7 +26,7 @@ filterbank_channel_count=40
 window_length=0.025
 frame_length=0.010
 output_type=1
-snip_edges=1
+snip_edges=true
 raw_energy=1
 preeph_coeff=0.97
 window_type='povey'
diff --git a/utils/speech/make_spectrum.sh b/utils/speech/make_spectrum.sh
index 7d6de82d..4150eff0 100755
--- a/utils/speech/make_spectrum.sh
+++ b/utils/speech/make_spectrum.sh
@@ -23,7 +23,7 @@ sample_rate=16000
 window_length=0.025
 frame_length=0.010
 output_type=2
-snip_edges=1
+snip_edges=true
 raw_energy=1
 preeph_coeff=0.97
 window_type='povey'

From 4726324a894e3130a19779b67a5e24841e5ddf6d Mon Sep 17 00:00:00 2001
From: dengchengyun <dengchengyun@didiglobal.com>
Date: Fri, 20 Dec 2019 12:15:03 +0800
Subject: [PATCH 3/6] fix params

---
 delta/data/frontend/add_noise_end_to_end.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/delta/data/frontend/add_noise_end_to_end.py b/delta/data/frontend/add_noise_end_to_end.py
index 26329fb0..93dcf83b 100644
--- a/delta/data/frontend/add_noise_end_to_end.py
+++ b/delta/data/frontend/add_noise_end_to_end.py
@@ -38,7 +38,7 @@ def __init__(self, config: dict):
   def params(cls, config=None):
     """
         Set params.
-        :param config: contains nine optional parameters:
+        :param config: contains ten optional parameters:
             --sample_rate				  : Sample frequency of waveform data. (int, default = 16000)
             --if_add_rir          : If true, add rir to audio data. (bool, default = False)
             --rir_filelist        : FileList path of rir.(string, default = 'rirlist.scp')
@@ -48,6 +48,7 @@ def params(cls, config=None):
             --noise_filelist      : FileList path of noise.(string, default = 'noiselist.scp')
             --if_add_aecres       : If true, add aecres to audio data. (bool, default = False)
             --aecres_filelist     : FileList path of aecres.(string, default = 'aecreslist.scp')
+            --speed               : Speed of sample channels wanted. (float, default=1.0)
         :return: An object of class HParams, which is a set of hyperparameters as name-value pairs.
         """
 
@@ -61,9 +62,11 @@ def params(cls, config=None):
     if_add_aecres = False
     aecres_filelist = 'aecreslist.scp'
     audio_channels = 1
+    speed = 1.0
 
     hparams = HParams(cls=cls)
     hparams.add_hparam('sample_rate', sample_rate)
+    hparams.add_hparam('speed', speed)
     hparams.add_hparam('if_add_rir', if_add_rir)
     hparams.add_hparam('if_add_noise', if_add_noise)
     hparams.add_hparam('rir_filelist', rir_filelist)

From 6b65c22fe84b20ce1a21d5077dc93cf8d7e3045d Mon Sep 17 00:00:00 2001
From: dengchengyun <dengchengyun@didiglobal.com>
Date: Fri, 20 Dec 2019 12:25:39 +0800
Subject: [PATCH 4/6] fix test of write_wav

---
 delta/data/frontend/write_wav_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/delta/data/frontend/write_wav_test.py b/delta/data/frontend/write_wav_test.py
index 470389f4..de92b37b 100644
--- a/delta/data/frontend/write_wav_test.py
+++ b/delta/data/frontend/write_wav_test.py
@@ -28,7 +28,7 @@ def test_write_wav(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
     with self.cached_session(use_gpu=False, force_gpu=False) as sess:
-      read_wav = ReadWav.params({'speed': 1.1}).instantiate()
+      read_wav = ReadWav.params({'speed': 1.0}).instantiate()
       input_data, sample_rate = read_wav(wav_path)
       input_data = input_data / 32768
       write_wav = WriteWav.params().instantiate()

From db961df9094237713cf2d939419228979878740a Mon Sep 17 00:00:00 2001
From: dengchengyun <dengchengyun@didiglobal.com>
Date: Fri, 20 Dec 2019 12:43:16 +0800
Subject: [PATCH 5/6] Update write_wav_test.py

---
 delta/data/frontend/write_wav_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/delta/data/frontend/write_wav_test.py b/delta/data/frontend/write_wav_test.py
index de92b37b..cf18ccba 100644
--- a/delta/data/frontend/write_wav_test.py
+++ b/delta/data/frontend/write_wav_test.py
@@ -30,7 +30,7 @@ def test_write_wav(self):
     with self.cached_session(use_gpu=False, force_gpu=False) as sess:
       read_wav = ReadWav.params({'speed': 1.0}).instantiate()
       input_data, sample_rate = read_wav(wav_path)
-      input_data = input_data / 32768
+      input_data = input_data
       write_wav = WriteWav.params().instantiate()
       new_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln_speed.wav'))
       writewav_op = write_wav(new_path, input_data, sample_rate)

From 8e4bfa52f35fe7a55857f2ab259abb0733bb15f5 Mon Sep 17 00:00:00 2001
From: dengchengyun <dengchengyun@didiglobal.com>
Date: Fri, 20 Dec 2019 13:41:40 +0800
Subject: [PATCH 6/6] do format.sh

---
 delta/data/frontend/add_noise_end_to_end.py   |  1 +
 .../frontend/add_noise_end_to_end_test.py     |  1 +
 delta/data/frontend/add_rir_noise_aecres.py   |  1 +
 .../frontend/add_rir_noise_aecres_test.py     |  1 +
 delta/data/frontend/analyfiltbank.py          |  1 +
 delta/data/frontend/analyfiltbank_test.py     |  1 +
 delta/data/frontend/cepstrum.py               |  1 +
 delta/data/frontend/cepstrum_test.py          |  1 +
 delta/data/frontend/cmvn.py                   |  1 +
 delta/data/frontend/delta_delta.py            |  1 +
 delta/data/frontend/delta_delta_test.py       |  1 +
 delta/data/frontend/fbank.py                  |  1 +
 delta/data/frontend/fbank_pitch.py            |  1 +
 delta/data/frontend/fbank_pitch_test.py       |  1 +
 delta/data/frontend/fbank_test.py             |  1 +
 delta/data/frontend/framepow.py               | 14 +++++++------
 delta/data/frontend/framepow_test.py          |  5 +++--
 delta/data/frontend/mfcc.py                   | 14 +++++++------
 delta/data/frontend/mfcc_test.py              |  1 +
 delta/data/frontend/pitch.py                  |  1 +
 delta/data/frontend/pitch_test.py             |  1 +
 delta/data/frontend/plp.py                    |  1 +
 delta/data/frontend/plp_test.py               |  1 +
 delta/data/frontend/read_wav.py               | 20 ++++++++++++-------
 delta/data/frontend/read_wav_test.py          |  2 ++
 delta/data/frontend/spectrum.py               |  1 +
 delta/data/frontend/spectrum_test.py          |  3 ++-
 delta/data/frontend/synthfiltbank.py          |  1 -
 delta/data/frontend/synthfiltbank_test.py     |  1 +
 delta/data/frontend/zcr_test.py               |  1 +
 utils/speech/compute_fbank_feats.py           |  8 ++++----
 utils/speech/compute_fbank_pitch.py           |  8 ++++----
 utils/speech/compute_mfcc_feats.py            |  8 ++++----
 utils/speech/compute_spectrum_feats.py        |  8 ++++----
 34 files changed, 75 insertions(+), 39 deletions(-)

diff --git a/delta/data/frontend/add_noise_end_to_end.py b/delta/data/frontend/add_noise_end_to_end.py
index 93dcf83b..d35a652a 100644
--- a/delta/data/frontend/add_noise_end_to_end.py
+++ b/delta/data/frontend/add_noise_end_to_end.py
@@ -28,6 +28,7 @@ class AddNoiseEndToEnd(BaseFrontend):
   Add a random signal-to-noise ratio noise or impulse response to clean speech, and
   write it to wavfile.
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
     self.add_noise = Add_rir_noise_aecres(config)
diff --git a/delta/data/frontend/add_noise_end_to_end_test.py b/delta/data/frontend/add_noise_end_to_end_test.py
index 7152a822..806fe1fe 100644
--- a/delta/data/frontend/add_noise_end_to_end_test.py
+++ b/delta/data/frontend/add_noise_end_to_end_test.py
@@ -37,6 +37,7 @@ class AddNoiseEndToEndTest(tf.test.TestCase):
   """
   AddNoiseEndToEnd OP test.
   """
+
   def test_add_noise_end_to_end(self):
 
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
diff --git a/delta/data/frontend/add_rir_noise_aecres.py b/delta/data/frontend/add_rir_noise_aecres.py
index 35426409..9516c9b1 100644
--- a/delta/data/frontend/add_rir_noise_aecres.py
+++ b/delta/data/frontend/add_rir_noise_aecres.py
@@ -25,6 +25,7 @@ class Add_rir_noise_aecres(BaseFrontend):
   """
   Add a random signal-to-noise ratio noise or impulse response to clean speech.
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/add_rir_noise_aecres_test.py b/delta/data/frontend/add_rir_noise_aecres_test.py
index 05939dbb..f17f8f62 100644
--- a/delta/data/frontend/add_rir_noise_aecres_test.py
+++ b/delta/data/frontend/add_rir_noise_aecres_test.py
@@ -39,6 +39,7 @@ class AddRirNoiseAecresTest(tf.test.TestCase):
   """
   AddNoiseRIR OP test.
   """
+
   def test_add_rir_noise_aecres(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/analyfiltbank.py b/delta/data/frontend/analyfiltbank.py
index 7ce91df2..39707695 100644
--- a/delta/data/frontend/analyfiltbank.py
+++ b/delta/data/frontend/analyfiltbank.py
@@ -26,6 +26,7 @@ class Analyfiltbank(BaseFrontend):
   Compute power-spectrum && phase-spectrum features of every frame in speech,
   return two float tensors with size (num_frames, num_frequencies).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/analyfiltbank_test.py b/delta/data/frontend/analyfiltbank_test.py
index d8007a8b..e7f3e783 100644
--- a/delta/data/frontend/analyfiltbank_test.py
+++ b/delta/data/frontend/analyfiltbank_test.py
@@ -28,6 +28,7 @@ class Test(tf.test.TestCase):
   """
   Analyfiltbank extraction test.
   """
+
   def test_analyfiltbank(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/cepstrum.py b/delta/data/frontend/cepstrum.py
index 1cf9e011..5f352a19 100644
--- a/delta/data/frontend/cepstrum.py
+++ b/delta/data/frontend/cepstrum.py
@@ -27,6 +27,7 @@ class Cepstrum(BaseFrontend):
   Compute Cepstrum features of every frame in speech, return a float tensor
   with size (num_frames, ceps_subband_num).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/cepstrum_test.py b/delta/data/frontend/cepstrum_test.py
index fcbb4b4d..401a6648 100644
--- a/delta/data/frontend/cepstrum_test.py
+++ b/delta/data/frontend/cepstrum_test.py
@@ -28,6 +28,7 @@ class CepstrumTest(tf.test.TestCase):
   """
   Cepstrum extraction test.
   """
+
   def test_cepstrum(self):
 
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
diff --git a/delta/data/frontend/cmvn.py b/delta/data/frontend/cmvn.py
index 7717f9d0..4634a837 100644
--- a/delta/data/frontend/cmvn.py
+++ b/delta/data/frontend/cmvn.py
@@ -26,6 +26,7 @@ class CMVN(BaseFrontend):
   """
   Compute and apply CMVN to features.
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/delta_delta.py b/delta/data/frontend/delta_delta.py
index 430967e8..ef3edd6e 100644
--- a/delta/data/frontend/delta_delta.py
+++ b/delta/data/frontend/delta_delta.py
@@ -26,6 +26,7 @@ class DeltaDelta(BaseFrontend):
   """
   Do Delta_delta to features.
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/delta_delta_test.py b/delta/data/frontend/delta_delta_test.py
index d8f0cc74..26e5a760 100644
--- a/delta/data/frontend/delta_delta_test.py
+++ b/delta/data/frontend/delta_delta_test.py
@@ -26,6 +26,7 @@ class Delta_delta_Test(tf.test.TestCase):
   """
   Delta_delta extraction test.
   """
+
   def test_delta_delta(self):
 
     self.feat_dim = 80
diff --git a/delta/data/frontend/fbank.py b/delta/data/frontend/fbank.py
index fb288ab7..7c1e9436 100644
--- a/delta/data/frontend/fbank.py
+++ b/delta/data/frontend/fbank.py
@@ -28,6 +28,7 @@ class Fbank(BaseFrontend):
    spectrum to extract frequency bands. Return a float tensor with shape
    (num_channels, num_frames, num_frequencies).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
     self.spect = Spectrum(config)
diff --git a/delta/data/frontend/fbank_pitch.py b/delta/data/frontend/fbank_pitch.py
index 1af93b84..f53b135f 100644
--- a/delta/data/frontend/fbank_pitch.py
+++ b/delta/data/frontend/fbank_pitch.py
@@ -27,6 +27,7 @@ class FbankPitch(BaseFrontend):
   Compute Fbank && Pitch features respectively，and concate them. Return
   a tensor with shape (num_frames, dim_features).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
     self.fbank = Fbank(config)
diff --git a/delta/data/frontend/fbank_pitch_test.py b/delta/data/frontend/fbank_pitch_test.py
index f7829990..8cfe57b8 100644
--- a/delta/data/frontend/fbank_pitch_test.py
+++ b/delta/data/frontend/fbank_pitch_test.py
@@ -27,6 +27,7 @@ class FbankPitchTest(tf.test.TestCase):
   """
   Compare Fbank&&Pitch FE with kaldi.
   """
+
   def test_FbankPitch(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/fbank_test.py b/delta/data/frontend/fbank_test.py
index 3228ad4f..b60a2dd7 100644
--- a/delta/data/frontend/fbank_test.py
+++ b/delta/data/frontend/fbank_test.py
@@ -29,6 +29,7 @@ class FbankTest(tf.test.TestCase):
   """
   Test Fbank FE using 8k/16k wav files.
   """
+
   def test_fbank(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/framepow.py b/delta/data/frontend/framepow.py
index ca048179..16480ffd 100644
--- a/delta/data/frontend/framepow.py
+++ b/delta/data/frontend/framepow.py
@@ -27,6 +27,7 @@ class Framepow(BaseFrontend):
   Compute power of every frame in speech. Return a float tensor with
   shape (1 * num_frames).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -86,11 +87,12 @@ def call(self, audio_data, sample_rate=None):
       with tf.control_dependencies([assert_op]):
 
         sample_rate = tf.cast(sample_rate, dtype=float)
-        framepow = py_x_ops.frame_pow(audio_data,
-                                      sample_rate,
-                                      snip_edges=p.snip_edges,
-                                      remove_dc_offset=p.remove_dc_offset,
-                                      window_length=p.window_length,
-                                      frame_length=p.frame_length)
+        framepow = py_x_ops.frame_pow(
+            audio_data,
+            sample_rate,
+            snip_edges=p.snip_edges,
+            remove_dc_offset=p.remove_dc_offset,
+            window_length=p.window_length,
+            frame_length=p.frame_length)
 
         return tf.squeeze(framepow)
diff --git a/delta/data/frontend/framepow_test.py b/delta/data/frontend/framepow_test.py
index 4a8a879f..db10b4a3 100644
--- a/delta/data/frontend/framepow_test.py
+++ b/delta/data/frontend/framepow_test.py
@@ -29,6 +29,7 @@ class FramepowTest(tf.test.TestCase):
   """
   Framepow extraction test.
   """
+
   def test_framepow(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
@@ -43,10 +44,10 @@ def test_framepow(self):
       framepow_test = framepow(input_data, sample_rate)
 
       real_framepow_feats = np.array(
-        [9.819611, 9.328745, 9.247337, 9.26451, 9.266059])
+          [9.819611, 9.328745, 9.247337, 9.26451, 9.266059])
 
       self.assertEqual(tf.rank(framepow_test).eval(), 1)
-      self.assertAllClose(framepow_test.eval()[0 : 5], real_framepow_feats)
+      self.assertAllClose(framepow_test.eval()[0:5], real_framepow_feats)
 
 
 if __name__ == '__main__':
diff --git a/delta/data/frontend/mfcc.py b/delta/data/frontend/mfcc.py
index a43a57df..92f04053 100644
--- a/delta/data/frontend/mfcc.py
+++ b/delta/data/frontend/mfcc.py
@@ -29,6 +29,7 @@ class Mfcc(BaseFrontend):
   Compute mfcc features of every frame in speech, return a float tensor
   with size (num_channels, num_frames, num_frequencies).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
     self.framepow = Framepow(config)
@@ -140,10 +141,11 @@ def call(self, audio_data, sample_rate=None):
         nfbank = shape[1]
         fbank_feats = tf.reshape(fbank_feats, (1, nframe, nfbank))
         framepow_feats = self.framepow(audio_data, sample_rate)
-        mfcc = py_x_ops.mfcc(fbank_feats,
-                             framepow_feats,
-                             sample_rate,
-                             use_energy=p.use_energy,
-                             cepstral_lifter=p.cepstral_lifter,
-                             coefficient_count=p.coefficient_count)
+        mfcc = py_x_ops.mfcc(
+            fbank_feats,
+            framepow_feats,
+            sample_rate,
+            use_energy=p.use_energy,
+            cepstral_lifter=p.cepstral_lifter,
+            coefficient_count=p.coefficient_count)
         return mfcc
diff --git a/delta/data/frontend/mfcc_test.py b/delta/data/frontend/mfcc_test.py
index b29d2033..5e401566 100644
--- a/delta/data/frontend/mfcc_test.py
+++ b/delta/data/frontend/mfcc_test.py
@@ -28,6 +28,7 @@ class MfccTest(tf.test.TestCase):
   """
   MFCC extraction test.
   """
+
   def test_mfcc(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/pitch.py b/delta/data/frontend/pitch.py
index 44097d6b..1a5668e7 100644
--- a/delta/data/frontend/pitch.py
+++ b/delta/data/frontend/pitch.py
@@ -26,6 +26,7 @@ class Pitch(BaseFrontend):
   Compute pitch features of every frame in speech, return a float tensor
   with size (num_frames, 2).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/pitch_test.py b/delta/data/frontend/pitch_test.py
index 7564522c..97c7b04e 100644
--- a/delta/data/frontend/pitch_test.py
+++ b/delta/data/frontend/pitch_test.py
@@ -28,6 +28,7 @@ class SpectrumTest(tf.test.TestCase):
   """
   Pitch extraction test.
   """
+
   def test_spectrum(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/plp.py b/delta/data/frontend/plp.py
index e07bc376..81c7485c 100644
--- a/delta/data/frontend/plp.py
+++ b/delta/data/frontend/plp.py
@@ -27,6 +27,7 @@ class Plp(BaseFrontend):
   Compute PLP features of every frame in speech, return a float tensor
   with size (num_frames, plp_order + 1).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/plp_test.py b/delta/data/frontend/plp_test.py
index 6dc54269..2549cf4c 100644
--- a/delta/data/frontend/plp_test.py
+++ b/delta/data/frontend/plp_test.py
@@ -28,6 +28,7 @@ class PlpTest(tf.test.TestCase):
   """
   Plp extraction test.
   """
+
   def test_plp(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/read_wav.py b/delta/data/frontend/read_wav.py
index 17ff3b3f..0809d9d7 100644
--- a/delta/data/frontend/read_wav.py
+++ b/delta/data/frontend/read_wav.py
@@ -20,10 +20,12 @@
 from delta.data.frontend.base_frontend import BaseFrontend
 from core.ops import py_x_ops
 
+
 class ReadWav(BaseFrontend):
   """
       Read audio sample from wav file, return sample data and sample rate.
       """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
@@ -71,12 +73,16 @@ def call(self, wavfile):
     with tf.control_dependencies([assert_op]):
 
       if p.speed == 1.0:
-        return tf.squeeze(audio_data * 32768, axis=-1), tf.cast(sample_rate, dtype=tf.int32)
+        return tf.squeeze(
+            audio_data * 32768, axis=-1), tf.cast(
+                sample_rate, dtype=tf.int32)
       else:
-        resample_rate = tf.cast(sample_rate, dtype=tf.float32) * tf.cast(
-          1.0 / p.speed, dtype=tf.float32)
-        speed_data = py_x_ops.speed(tf.squeeze(audio_data * 32768, axis=-1),
-                                    tf.cast(sample_rate, dtype=tf.int32),
-                                    tf.cast(resample_rate, dtype=tf.int32),
-                                    lowpass_filter_width=5)
+        resample_rate = tf.cast(
+            sample_rate, dtype=tf.float32) * tf.cast(
+                1.0 / p.speed, dtype=tf.float32)
+        speed_data = py_x_ops.speed(
+            tf.squeeze(audio_data * 32768, axis=-1),
+            tf.cast(sample_rate, dtype=tf.int32),
+            tf.cast(resample_rate, dtype=tf.int32),
+            lowpass_filter_width=5)
         return tf.squeeze(speed_data), tf.cast(sample_rate, dtype=tf.int32)
diff --git a/delta/data/frontend/read_wav_test.py b/delta/data/frontend/read_wav_test.py
index 2c3f099a..5f2a9d20 100644
--- a/delta/data/frontend/read_wav_test.py
+++ b/delta/data/frontend/read_wav_test.py
@@ -26,6 +26,7 @@ class ReadWavTest(tf.test.TestCase):
   """
   ReadWav OP test.
   """
+
   def test_read_wav(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
@@ -38,5 +39,6 @@ def test_read_wav(self):
         self.assertAllClose(audio_data.eval() / 32768, audio_data_true)
         self.assertAllClose(sample_rate.eval(), sample_rate_true)
 
+
 if __name__ == '__main__':
   tf.test.main()
diff --git a/delta/data/frontend/spectrum.py b/delta/data/frontend/spectrum.py
index 323f17b7..e006d233 100644
--- a/delta/data/frontend/spectrum.py
+++ b/delta/data/frontend/spectrum.py
@@ -26,6 +26,7 @@ class Spectrum(BaseFrontend):
   Compute spectrum features of every frame in speech, return a float tensor
   with size (num_frames, num_frequencies).
   """
+
   def __init__(self, config: dict):
     super().__init__(config)
 
diff --git a/delta/data/frontend/spectrum_test.py b/delta/data/frontend/spectrum_test.py
index 4574d1a8..8488487e 100644
--- a/delta/data/frontend/spectrum_test.py
+++ b/delta/data/frontend/spectrum_test.py
@@ -28,6 +28,7 @@ class SpectrumTest(tf.test.TestCase):
   '''
   Spectum extraction test.
   '''
+
   def test_spectrum(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
@@ -38,7 +39,7 @@ def test_spectrum(self):
       spectrum = Spectrum.params({
           'window_length': 0.025,
           'snip_edges': True,
-          'dither':0.0
+          'dither': 0.0
       }).instantiate()
       spectrum_test = spectrum(input_data, sample_rate)
 
diff --git a/delta/data/frontend/synthfiltbank.py b/delta/data/frontend/synthfiltbank.py
index 4ab34ba1..9ca370c3 100644
--- a/delta/data/frontend/synthfiltbank.py
+++ b/delta/data/frontend/synthfiltbank.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 # ==============================================================================
 
-
 import delta.compat as tf
 from core.ops import py_x_ops
 from delta.utils.hparam import HParams
diff --git a/delta/data/frontend/synthfiltbank_test.py b/delta/data/frontend/synthfiltbank_test.py
index 2208d494..35f857ce 100644
--- a/delta/data/frontend/synthfiltbank_test.py
+++ b/delta/data/frontend/synthfiltbank_test.py
@@ -29,6 +29,7 @@ class Test(tf.test.TestCase):
   """
   Synthfiltbank extraction test.
   """
+
   def test_synthfiltbank(self):
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
 
diff --git a/delta/data/frontend/zcr_test.py b/delta/data/frontend/zcr_test.py
index f670ceee..c66a44a1 100644
--- a/delta/data/frontend/zcr_test.py
+++ b/delta/data/frontend/zcr_test.py
@@ -29,6 +29,7 @@ class ZcrTest(tf.test.TestCase):
   """
   Test Fbank FE using 8k/16k wav files.
   """
+
   def test_zcr(self):
 
     wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))
diff --git a/utils/speech/compute_fbank_feats.py b/utils/speech/compute_fbank_feats.py
index 11cbdf07..9d05c076 100755
--- a/utils/speech/compute_fbank_feats.py
+++ b/utils/speech/compute_fbank_feats.py
@@ -69,10 +69,10 @@ def get_parser():
       default=True,
       help='The last frame (shorter than window_length) will not be cutoff.')
   parser.add_argument(
-    '--dither',
-    type=float,
-    default=0.0,
-    help='Dithering constant (0.0 means no dither).')
+      '--dither',
+      type=float,
+      default=0.0,
+      help='Dithering constant (0.0 means no dither).')
   parser.add_argument(
       '--raw_energy',
       type=int,
diff --git a/utils/speech/compute_fbank_pitch.py b/utils/speech/compute_fbank_pitch.py
index d7b0d0aa..95684557 100755
--- a/utils/speech/compute_fbank_pitch.py
+++ b/utils/speech/compute_fbank_pitch.py
@@ -49,10 +49,10 @@ def get_parser():
       default=40,
       help='Order of fbank')
   parser.add_argument(
-    '--dither',
-    type=float,
-    default=0.0,
-    help='Dithering constant (0.0 means no dither).')
+      '--dither',
+      type=float,
+      default=0.0,
+      help='Dithering constant (0.0 means no dither).')
   parser.add_argument(
       '--window_length', type=float, default=0.025, help='Length of a frame')
   parser.add_argument(
diff --git a/utils/speech/compute_mfcc_feats.py b/utils/speech/compute_mfcc_feats.py
index 7d525581..1e68ec10 100755
--- a/utils/speech/compute_mfcc_feats.py
+++ b/utils/speech/compute_mfcc_feats.py
@@ -89,10 +89,10 @@ def get_parser():
       default=True,
       help='Compute power spetrum without frame energy.')
   parser.add_argument(
-    '--dither',
-    type=float,
-    default=0.0,
-    help='Dithering constant (0.0 means no dither).')
+      '--dither',
+      type=float,
+      default=0.0,
+      help='Dithering constant (0.0 means no dither).')
   parser.add_argument(
       '--cepstral_lifter',
       type=float,
diff --git a/utils/speech/compute_spectrum_feats.py b/utils/speech/compute_spectrum_feats.py
index e6db813b..d5bd2740 100755
--- a/utils/speech/compute_spectrum_feats.py
+++ b/utils/speech/compute_spectrum_feats.py
@@ -74,10 +74,10 @@ def get_parser():
       default=False,
       help='Compute power spetrum without frame energy')
   parser.add_argument(
-    '--dither',
-    type=float,
-    default=0.0,
-    help='Dithering constant (0.0 means no dither).')
+      '--dither',
+      type=float,
+      default=0.0,
+      help='Dithering constant (0.0 means no dither).')
   parser.add_argument(
       '--write_num_frames',
       type=str,