Skip to content

Commit

Permalink
Use ctcdecode in native client
Browse files Browse the repository at this point in the history
  • Loading branch information
reuben committed Oct 25, 2018
1 parent 770d742 commit 3cc9b37
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 112 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.binary filter=lfs diff=lfs merge=lfs -crlf
data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
data/lm/trie.ctcdecode filter=lfs diff=lfs merge=lfs -text
3 changes: 3 additions & 0 deletions DeepSpeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -1770,6 +1770,9 @@ def create_inference_graph(batch_size=1, n_steps=16, use_new_decoder=False):
n_steps=n_steps,
previous_state=previous_state)

# Apply softmax for CTC decoder
logits = tf.nn.softmax(logits)

new_state_c, new_state_h = layers['rnn_output_state']

# Initial zero state
Expand Down
5 changes: 2 additions & 3 deletions data/lm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ binary_path = '/tmp/lm.binary'
os.remove(lm_path)
```

The trie was then generated from the list of unique words in the corpus (data/lm/vocab.txt):
The trie was then generated from the vocabulary of the language model:

```bash
tr -s '[[:space:]]' '\n' < /tmp/lower.txt | sort -u > /tmp/vocab.txt
./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/vocab.txt /tmp/trie
./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/trie
```
3 changes: 3 additions & 0 deletions data/lm/trie.ctcdecode
Git LFS file not shown
3 changes: 1 addition & 2 deletions native_client/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ tf_cc_shared_object(
srcs = ["deepspeech.cc",
"deepspeech.h",
"alphabet.h",
"beam_search.h",
"trie_node.h",
"c_speech_features/c_speech_features.cpp",
"kiss_fft130/kiss_fft.c",
"kiss_fft130/tools/kiss_fftr.c",
Expand Down Expand Up @@ -72,6 +70,7 @@ tf_cc_shared_object(
"//tensorflow/core/kernels:constant_op", # Const
"//tensorflow/core/kernels:immutable_constant_op", # ImmutableConst
"//tensorflow/core/kernels:identity_op", # Identity
"//tensorflow/core/kernels:softmax_op", # Softmax
"//tensorflow/core/kernels:transpose_op", # Transpose
"//tensorflow/core/kernels:reshape_op", # Reshape
"//tensorflow/core/kernels:shape_ops", # Shape
Expand Down
69 changes: 20 additions & 49 deletions native_client/deepspeech.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "deepspeech.h"
#include "alphabet.h"
#include "beam_search.h"

#include "tensorflow/core/public/version.h"
#include "native_client/ds_version.h"
Expand All @@ -17,6 +17,8 @@

#include "c_speech_features.h"

#include "ctcdecode/ctc_beam_search_decoder.h"

//TODO: infer batch size from model/use dynamic batch size
const unsigned int BATCH_SIZE = 1;

Expand All @@ -37,8 +39,6 @@ const unsigned int LOWFREQ = 0;
const unsigned int CEP_LIFTER = 22;

using namespace tensorflow;
using tensorflow::ctc::CTCBeamSearchDecoder;
using tensorflow::ctc::CTCDecoder;

using std::vector;

Expand Down Expand Up @@ -98,7 +98,7 @@ struct ModelState {
unsigned int ncep;
unsigned int ncontext;
Alphabet* alphabet;
KenLMBeamScorer* scorer;
Scorer* scorer;
unsigned int beam_width;
unsigned int n_steps;
unsigned int mfcc_feats_per_timestep;
Expand Down Expand Up @@ -177,7 +177,7 @@ StreamingState::feedAudioContent(const short* buffer,
// If the buffer is full, process and shift it
if (audio_buffer.size() == AUDIO_WIN_LEN_SAMPLES) {
processAudioWindow(audio_buffer);
// Shift data by one step of 10ms
// Shift data by one step
std::rotate(audio_buffer.begin(), audio_buffer.begin() + AUDIO_WIN_STEP_SAMPLES, audio_buffer.end());
audio_buffer.resize(audio_buffer.size() - AUDIO_WIN_STEP_SAMPLES);
}
Expand Down Expand Up @@ -320,55 +320,24 @@ ModelState::infer(const float* aMfcc, unsigned int n_frames, vector<float>& logi
// Decodes a flat vector of per-frame class values into text and returns it as
// a C string produced by strdup() (so presumably the caller must free() it).
//
// NOTE(review): this listing is a rendered diff with the +/- markers stripped,
// so two implementations are interleaved below. Presumably the lines that use
// Eigen::Map / CTCBeamSearchDecoder / decoder_outputs are the removed version
// and the lines that use vector<vector<double>> / ctc_beam_search_decoder are
// the added version -- confirm against the commit before relying on either.
char*
ModelState::decode(vector<float>& logits)
{
const int top_paths = 1;
const int cutoff_top_n = 40;
const double cutoff_prob = 1.0;
const size_t num_classes = alphabet->GetSize() + 1; // +1 for blank
// Number of time steps implied by the flat buffer length.
const int n_frames = logits.size() / (BATCH_SIZE * num_classes);

// Raw data containers (arrays of floats, ints, etc.).
int sequence_lengths[BATCH_SIZE] = {n_frames};

// Convert data containers to the format accepted by the decoder, simply
// mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
// using Eigen::Map.
Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], BATCH_SIZE);
vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
inputs.reserve(n_frames);
// Second declaration of `inputs` (other side of the diff): one row per time
// step, each row filled below with num_classes values widened to double.
vector<vector<double>> inputs;
inputs.resize(n_frames);
for (int t = 0; t < n_frames; ++t) {
inputs.emplace_back(&logits[t * BATCH_SIZE * num_classes], BATCH_SIZE, num_classes);
}

// Prepare containers for output and scores.
// CTCDecoder::Output is vector<vector<int>>
vector<CTCDecoder::Output> decoder_outputs(top_paths);
for (CTCDecoder::Output& output : decoder_outputs) {
output.resize(BATCH_SIZE);
}
float score[BATCH_SIZE][top_paths] = {{0.0}};
Eigen::Map<Eigen::MatrixXf> scores(&score[0][0], BATCH_SIZE, top_paths);

// Without a scorer the default beam scorer is used; otherwise the configured
// scorer is passed in. In both branches the status returned by Decode() is
// converted with .ok() and then discarded.
if (scorer == nullptr) {
CTCBeamSearchDecoder<>::DefaultBeamScorer default_scorer;
CTCBeamSearchDecoder<> decoder(num_classes,
beam_width,
&default_scorer,
BATCH_SIZE);
decoder.Decode(seq_len, inputs, &decoder_outputs, &scores).ok();
} else {
CTCBeamSearchDecoder<KenLMBeamState> decoder(num_classes,
beam_width,
scorer,
BATCH_SIZE);
decoder.Decode(seq_len, inputs, &decoder_outputs, &scores).ok();
// Copies frame t's values into inputs[t]. The index omits BATCH_SIZE, unlike
// the emplace_back line above.
// NOTE(review): `i` is int while num_classes is size_t, so the comparison
// mixes signed and unsigned operands.
for (int i = 0; i < num_classes; ++i) {
inputs[t].push_back(logits[t * num_classes + i]);
}
}

// Output is an array of shape (batch_size, top_paths, result_length).

// Concatenates the string form of every label in the first decoded path.
std::stringstream output;
for (int64 character : decoder_outputs[0][0]) {
output << alphabet->StringFromLabel(character);
}
// Vector of <probability, Output(tokens, timings)> pairs
vector<std::pair<double, Output>> out = ctc_beam_search_decoder(
inputs, *alphabet, beam_width, cutoff_prob, cutoff_top_n, scorer);

return strdup(output.str().c_str());
// NOTE(review): out[0] is read without checking that `out` is non-empty --
// confirm ctc_beam_search_decoder always returns at least one candidate.
return strdup(alphabet->LabelsToString(out[0].second.tokens).c_str());
}

int
Expand Down Expand Up @@ -493,8 +462,10 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
float aValidWordCountWeight)
{
try {
aCtx->scorer = new KenLMBeamScorer(aLMPath, aTriePath, aAlphabetConfigPath,
aLMWeight, aValidWordCountWeight);
aCtx->scorer = new Scorer(aLMWeight, aValidWordCountWeight,
aLMPath ? aLMPath : "",
aTriePath ? aTriePath : "",
*aCtx->alphabet);
return 0;
} catch (...) {
return 1;
Expand Down
59 changes: 10 additions & 49 deletions native_client/generate_trie.cpp
Original file line number Diff line number Diff line change
@@ -1,64 +1,25 @@
#include <algorithm>
#include <iostream>
#include <string>
using namespace std;

#include "lm/model.hh"
#include "trie_node.h"
#include "ctcdecode/scorer.h"
#include "fst/fstlib.h"
#include "alphabet.h"

typedef lm::ngram::QuantArrayTrieModel Model;

// Returns the index that the model's vocabulary reports for `word`.
// NOTE(review): the result for a word missing from the vocabulary is not
// visible here -- check the KenLM vocabulary documentation.
lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
return model.GetVocabulary().Index(word);
}

// Returns the `prob` field of the model's FullScore() result for `word_index`,
// evaluated from the model's null-context state.
float ScoreWord(const Model& model, lm::WordIndex word_index) {
// We don't need to keep state here as we're scoring the words individually.
// `out` receives the resulting state and is discarded.
Model::State out;
return model.FullScore(model.NullContextState(), word_index, out).prob;
}

// Builds a trie file at `trie_path` from an alphabet file and a KenLM model.
// Returns 0 on the success path; the four-argument version returns -1 when the
// vocabulary file or the output file cannot be opened.
//
// NOTE(review): this listing is a rendered diff with the +/- markers stripped,
// so two versions are interleaved. Presumably the four-argument signature and
// the TrieNode/ifstream body are the removed version, and the three-argument
// signature with the Scorer body is the added one -- confirm against the
// commit. Both versions share the final `return 0; }`.
int generate_trie(const char* alphabet_path, const char* kenlm_path, const char* vocab_path, const char* trie_path) {
Alphabet a(alphabet_path);

lm::ngram::Config config;
config.load_method = util::POPULATE_OR_READ;
Model model(kenlm_path, config);
// Trie root is constructed with the alphabet size.
TrieNode root(a.GetSize());

std::ifstream ifs(vocab_path, std::ifstream::in | std::ios::binary);
if (!ifs) {
std::cerr << "unable to open vocabulary file " << vocab_path << std::endl;
return -1;
}

std::ofstream ofs(trie_path);
if (!ofs) {
std::cerr << "unable to open output file " << trie_path << std::endl;
return -1;
}

// Reads whitespace-separated words; each is inserted with its vocabulary
// index and its score from ScoreWord(). The lambda maps a character string to
// its alphabet label.
std::string word;
while (ifs >> word) {
lm::WordIndex word_index = GetWordIndex(model, word);
float unigram_score = ScoreWord(model, word_index);
root.Insert(word,
[&a](const std::string& c) {
return a.LabelFromString(c);
},
word_index, unigram_score);
}
using namespace std;

root.WriteToStream(ofs);
// Three-argument version: no vocabulary file is taken. The Scorer is built
// with 0.0 for both numeric arguments and an empty string where the other call
// sites in this commit pass a trie path, then asked to save its dictionary.
int generate_trie(const char* alphabet_path, const char* kenlm_path, const char* trie_path) {
Alphabet alphabet(alphabet_path);
Scorer scorer(0.0, 0.0, kenlm_path, "", alphabet);
scorer.save_dictionary(trie_path);
return 0;
}

// Command-line entry point: checks the argument count, prints a usage line to
// stderr and returns -1 on mismatch, otherwise forwards the arguments to
// generate_trie() and returns its result.
//
// NOTE(review): rendered diff with the +/- markers stripped. Presumably the
// `argc != 5` check, the four-placeholder usage text and the four-argument
// call are the removed version, and the `argc != 4` / three-argument lines are
// the added one -- confirm against the commit.
int main(int argc, char** argv) {
if (argc != 5) {
std::cerr << "Usage: " << argv[0] << " <alphabet> <lm_model> <vocabulary> <trie_path>" << std::endl;
if (argc != 4) {
std::cerr << "Usage: " << argv[0] << " <alphabet> <lm_model> <trie_path>" << std::endl;
return -1;
}

return generate_trie(argv[1], argv[2], argv[3], argv[4]);
return generate_trie(argv[1], argv[2], argv[3]);
}
20 changes: 11 additions & 9 deletions native_client/trie_load.cc
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
#include <algorithm>
#include <iostream>
#include <memory>
#include <string>

#include "ctcdecode/scorer.h"
#include "fst/fstlib.h"
#include "alphabet.h"
#include "trie_node.h"

using namespace std;


// Load test: builds an Alphabet and then a trie-backed object from paths given
// on the command line, prints the trie and alphabet paths, and returns 0.
//
// NOTE(review): rendered diff with the +/- markers stripped. Presumably the
// two-argument binding (trie, alphabet) with TrieNode::ReadFromStream is the
// removed version, and the three-argument binding (kenlm, trie, alphabet) with
// Scorer is the added one -- confirm against the commit.
int main(int argc, char** argv)
{
// NOTE(review): argv[1..3] are read without any check of argc, so running
// with too few arguments passes null/invalid pointers onward.
const char* trie_path = argv[1];
const char* alphabet_path = argv[2];
const char* kenlm_path = argv[1];
const char* trie_path = argv[2];
const char* alphabet_path = argv[3];

// Only the trie and alphabet paths are printed; kenlm_path is not.
printf("Loading trie(%s) and alphabet(%s)\n", trie_path, alphabet_path);

Alphabet alphabet_ = Alphabet(alphabet_path);
// Declared without an initializer, then passed to ReadFromStream below.
TrieNode *trieRoot_;

std::ifstream in(trie_path, std::ios::in | std::ios::binary);
TrieNode::ReadFromStream(in, trieRoot_, alphabet_.GetSize());
// Constructing the Scorer is the whole test: the object is not used further.
Alphabet alphabet(alphabet_path);
Scorer scorer(0.0, 0.0, kenlm_path, trie_path, alphabet);

return 0;
}

0 comments on commit 3cc9b37

Please sign in to comment.