Skip to content

Commit

Permalink
Merge pull request #1773 from mozilla/mfcc-striding
Browse files Browse the repository at this point in the history
Use longer MFCC step instead of throwing away features (Fixes #1744)
  • Loading branch information
reuben authored Dec 10, 2018
2 parents 20400e1 + 1df9602 commit 2a8128b
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 25 deletions.
47 changes: 26 additions & 21 deletions native_client/deepspeech.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <algorithm>
#include <cmath>
#include <iostream>
#include <memory>
#include <string>
Expand All @@ -20,23 +21,35 @@
#include "ctcdecode/ctc_beam_search_decoder.h"

//TODO: infer batch size from model/use dynamic batch size
const unsigned int BATCH_SIZE = 1;
constexpr unsigned int BATCH_SIZE = 1;

//TODO: use dynamic sample rate
// Multiplied with the window length/step below to obtain sample counts
// (presumably samples per second -- confirm).
const unsigned int SAMPLE_RATE = 16000;
constexpr unsigned int SAMPLE_RATE = 16000;

// Analysis window length and step. Both are scaled by SAMPLE_RATE to get the
// *_SAMPLES counts, and are passed unscaled to csf_mfcc.
// NOTE(review): presumably expressed in seconds -- confirm against csf_mfcc.
const float AUDIO_WIN_LEN = 0.025f;
const float AUDIO_WIN_STEP = 0.01f;
const unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
const unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);
constexpr float AUDIO_WIN_LEN = 0.032f;
constexpr float AUDIO_WIN_STEP = 0.02f;
constexpr unsigned int AUDIO_WIN_LEN_SAMPLES = (unsigned int)(AUDIO_WIN_LEN * SAMPLE_RATE);
constexpr unsigned int AUDIO_WIN_STEP_SAMPLES = (unsigned int)(AUDIO_WIN_STEP * SAMPLE_RATE);

// Feature count handed to csf_mfcc; also used to size the MFCC buffer.
const unsigned int MFCC_FEATURES = 26;
constexpr unsigned int MFCC_FEATURES = 26;

// Remaining feature-extraction parameters. N_FILTERS, N_FFT, LOWFREQ and
// CEP_LIFTER are forwarded to csf_mfcc.
// NOTE(review): PREEMPHASIS_COEFF is not used in the visible csf_mfcc call;
// presumably applied where the audio is buffered -- confirm.
const float PREEMPHASIS_COEFF = 0.97f;
const unsigned int N_FFT = 512;
const unsigned int N_FILTERS = 26;
const unsigned int LOWFREQ = 0;
const unsigned int CEP_LIFTER = 22;
constexpr float PREEMPHASIS_COEFF = 0.97f;
constexpr unsigned int N_FFT = 512;
constexpr unsigned int N_FILTERS = 26;
constexpr unsigned int LOWFREQ = 0;
constexpr unsigned int CEP_LIFTER = 22;

constexpr size_t WINDOW_SIZE = AUDIO_WIN_LEN * SAMPLE_RATE;

std::array<float, WINDOW_SIZE> calc_hamming_window() {
std::array<float, WINDOW_SIZE> a{0};
for (int i = 0; i < WINDOW_SIZE; ++i) {
a[i] = 0.54 - 0.46 * std::cos(2*M_PI*i/(WINDOW_SIZE-1));
}
return a;
}

std::array<float, WINDOW_SIZE> hamming_window = calc_hamming_window();

using namespace tensorflow;

Expand Down Expand Up @@ -77,7 +90,6 @@ struct StreamingState {
float last_sample; // used for preemphasis
vector<float> mfcc_buffer;
vector<float> batch_buffer;
bool skip_next_mfcc;
ModelState* model;

void feedAudioContent(const short* buffer, unsigned int buffer_size);
Expand Down Expand Up @@ -214,16 +226,11 @@ StreamingState::finishStream()
void
StreamingState::processAudioWindow(const vector<float>& buf)
{
skip_next_mfcc = !skip_next_mfcc;
if (!skip_next_mfcc) { // Was true
return;
}

// Compute MFCC features
float* mfcc;
int n_frames = csf_mfcc(buf.data(), buf.size(), SAMPLE_RATE,
AUDIO_WIN_LEN, AUDIO_WIN_STEP, MFCC_FEATURES, N_FILTERS, N_FFT,
LOWFREQ, SAMPLE_RATE/2, 0.f, CEP_LIFTER, 1, nullptr,
LOWFREQ, SAMPLE_RATE/2, 0.f, CEP_LIFTER, 1, hamming_window.data(),
&mfcc);
assert(n_frames == 1);

Expand Down Expand Up @@ -518,8 +525,6 @@ DS_SetupStream(ModelState* aCtx,
ctx->mfcc_buffer.resize(MFCC_FEATURES*aCtx->n_context, 0.f);
ctx->batch_buffer.reserve(aCtx->n_steps * aCtx->mfcc_feats_per_timestep);

ctx->skip_next_mfcc = false;

ctx->model = aCtx;

*retval = ctx.release();
Expand Down
5 changes: 1 addition & 4 deletions util/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,7 @@ def audiofile_to_input_vector(audio_filename, numcep, numcontext):
fs, audio = wav.read(audio_filename)

# Get mfcc coefficients
features = mfcc(audio, samplerate=fs, numcep=numcep)

# We only keep every second feature (BiRNN stride = 2)
features = features[::2]
features = mfcc(audio, samplerate=fs, numcep=numcep, winlen=0.032, winstep=0.02, winfunc=np.hamming)

# Add empty initial and final contexts
empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
Expand Down

0 comments on commit 2a8128b

Please sign in to comment.