Use ctcdecode in native client

mozilla · Oct 25, 2018 · 3cc9b37
1 parent 770d742
commit 3cc9b37
Show file tree

Hide file tree

Showing 8 changed files with 51 additions and 112 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -1,3 +1,4 @@
 *.binary filter=lfs diff=lfs merge=lfs -crlf
 data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
 data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
+data/lm/trie.ctcdecode filter=lfs diff=lfs merge=lfs -text
diff --git a/DeepSpeech.py b/DeepSpeech.py
@@ -1770,6 +1770,9 @@ def create_inference_graph(batch_size=1, n_steps=16, use_new_decoder=False):
                            n_steps=n_steps,
                            previous_state=previous_state)
 
+    # Apply softmax for CTC decoder
+    logits = tf.nn.softmax(logits)
+
     new_state_c, new_state_h = layers['rnn_output_state']
 
     # Initial zero state

diff --git a/data/lm/README.md b/data/lm/README.md
@@ -38,9 +38,8 @@ binary_path = '/tmp/lm.binary'
 os.remove(lm_path)
 ```
 
-The trie was then generated from the list of unique words in the corpus (data/lm/vocab.txt):
+The trie was then generated from the vocabulary of the language model:
 
 ```bash
-tr -s '[[:space:]]' '\n' < /tmp/lower.txt | sort -u > /tmp/vocab.txt
-./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/vocab.txt /tmp/trie
+./generate_trie ../data/alphabet.txt /tmp/lm.binary /tmp/trie
 ```
diff --git a/data/lm/trie.ctcdecode b/data/lm/trie.ctcdecode
diff --git a/native_client/BUILD b/native_client/BUILD
@@ -36,8 +36,6 @@ tf_cc_shared_object(
     srcs = ["deepspeech.cc",
             "deepspeech.h",
             "alphabet.h",
-            "beam_search.h",
-            "trie_node.h",
             "c_speech_features/c_speech_features.cpp",
             "kiss_fft130/kiss_fft.c",
             "kiss_fft130/tools/kiss_fftr.c",
@@ -72,6 +70,7 @@ tf_cc_shared_object(
         "//tensorflow/core/kernels:constant_op",        # Const
         "//tensorflow/core/kernels:immutable_constant_op", # ImmutableConst
         "//tensorflow/core/kernels:identity_op",        # Identity
+        "//tensorflow/core/kernels:softmax_op",         # Softmax
         "//tensorflow/core/kernels:transpose_op",       # Transpose
         "//tensorflow/core/kernels:reshape_op",         # Reshape
         "//tensorflow/core/kernels:shape_ops",          # Shape

diff --git a/native_client/deepspeech.cc b/native_client/deepspeech.cc
@@ -2,11 +2,11 @@
 #include <iostream>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "deepspeech.h"
 #include "alphabet.h"
-#include "beam_search.h"
 
 #include "tensorflow/core/public/version.h"
 #include "native_client/ds_version.h"
@@ -17,6 +17,8 @@
 
 #include "c_speech_features.h"
 
+#include "ctcdecode/ctc_beam_search_decoder.h"
+
 //TODO: infer batch size from model/use dynamic batch size
 const unsigned int BATCH_SIZE = 1;
 
@@ -37,8 +39,6 @@ const unsigned int LOWFREQ = 0;
 const unsigned int CEP_LIFTER = 22;
 
 using namespace tensorflow;
-using tensorflow::ctc::CTCBeamSearchDecoder;
-using tensorflow::ctc::CTCDecoder;
 
 using std::vector;
 
@@ -98,7 +98,7 @@ struct ModelState {
   unsigned int ncep;
   unsigned int ncontext;
   Alphabet* alphabet;
-  KenLMBeamScorer* scorer;
+  Scorer* scorer;
   unsigned int beam_width;
   unsigned int n_steps;
   unsigned int mfcc_feats_per_timestep;
@@ -177,7 +177,7 @@ StreamingState::feedAudioContent(const short* buffer,
     // If the buffer is full, process and shift it
     if (audio_buffer.size() == AUDIO_WIN_LEN_SAMPLES) {
       processAudioWindow(audio_buffer);
-      // Shift data by one step of 10ms
+      // Shift data by one step
       std::rotate(audio_buffer.begin(), audio_buffer.begin() + AUDIO_WIN_STEP_SAMPLES, audio_buffer.end());
       audio_buffer.resize(audio_buffer.size() - AUDIO_WIN_STEP_SAMPLES);
     }
@@ -320,55 +320,24 @@ ModelState::infer(const float* aMfcc, unsigned int n_frames, vector<float>& logi
 char*
 ModelState::decode(vector<float>& logits)
 {
-  const int top_paths = 1;
+  const int cutoff_top_n = 40;
+  const double cutoff_prob = 1.0;
   const size_t num_classes = alphabet->GetSize() + 1; // +1 for blank
   const int n_frames = logits.size() / (BATCH_SIZE * num_classes);
 
-  // Raw data containers (arrays of floats, ints, etc.).
-  int sequence_lengths[BATCH_SIZE] = {n_frames};
-
-  // Convert data containers to the format accepted by the decoder, simply
-  // mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
-  // using Eigen::Map.
-  Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], BATCH_SIZE);
-  vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
-  inputs.reserve(n_frames);
+  vector<vector<double>> inputs;
+  inputs.resize(n_frames);
   for (int t = 0; t < n_frames; ++t) {
-    inputs.emplace_back(&logits[t * BATCH_SIZE * num_classes], BATCH_SIZE, num_classes);
-  }
-
-  // Prepare containers for output and scores.
-  // CTCDecoder::Output is vector<vector<int>>
-  vector<CTCDecoder::Output> decoder_outputs(top_paths);
-  for (CTCDecoder::Output& output : decoder_outputs) {
-    output.resize(BATCH_SIZE);
-  }
-  float score[BATCH_SIZE][top_paths] = {{0.0}};
-  Eigen::Map<Eigen::MatrixXf> scores(&score[0][0], BATCH_SIZE, top_paths);
-
-  if (scorer == nullptr) {
-    CTCBeamSearchDecoder<>::DefaultBeamScorer default_scorer;
-    CTCBeamSearchDecoder<> decoder(num_classes,
-                                   beam_width,
-                                   &default_scorer,
-                                   BATCH_SIZE);
-    decoder.Decode(seq_len, inputs, &decoder_outputs, &scores).ok();
-  } else {
-    CTCBeamSearchDecoder<KenLMBeamState> decoder(num_classes,
-                                                 beam_width,
-                                                 scorer,
-                                                 BATCH_SIZE);
-    decoder.Decode(seq_len, inputs, &decoder_outputs, &scores).ok();
+    for (int i = 0; i < num_classes; ++i) {
+      inputs[t].push_back(logits[t * num_classes + i]);
+    }
   }
 
-  // Output is an array of shape (batch_size, top_paths, result_length).
-
-  std::stringstream output;
-  for (int64 character : decoder_outputs[0][0]) {
-    output << alphabet->StringFromLabel(character);
-  }
+  // Vector of <probability, Output(tokens, timings)> pairs
+  vector<std::pair<double, Output>> out = ctc_beam_search_decoder(
+    inputs, *alphabet, beam_width, cutoff_prob, cutoff_top_n, scorer);
 
-  return strdup(output.str().c_str());
+  return strdup(alphabet->LabelsToString(out[0].second.tokens).c_str());
 }
 
 int
@@ -493,8 +462,10 @@ DS_EnableDecoderWithLM(ModelState* aCtx,
                        float aValidWordCountWeight)
 {
   try {
-    aCtx->scorer = new KenLMBeamScorer(aLMPath, aTriePath, aAlphabetConfigPath,
-                                       aLMWeight, aValidWordCountWeight);
+    aCtx->scorer = new Scorer(aLMWeight, aValidWordCountWeight,
+                              aLMPath ? aLMPath : "",
+                              aTriePath ? aTriePath : "",
+                              *aCtx->alphabet);
     return 0;
   } catch (...) {
     return 1;

diff --git a/native_client/generate_trie.cpp b/native_client/generate_trie.cpp
@@ -1,64 +1,25 @@
 #include <algorithm>
 #include <iostream>
 #include <string>
-using namespace std;
 
-#include "lm/model.hh"
-#include "trie_node.h"
+#include "ctcdecode/scorer.h"
+#include "fst/fstlib.h"
 #include "alphabet.h"
 
-typedef lm::ngram::QuantArrayTrieModel Model;
-
-lm::WordIndex GetWordIndex(const Model& model, const std::string& word) {
-  return model.GetVocabulary().Index(word);
-}
-
-float ScoreWord(const Model& model, lm::WordIndex word_index) {
-  // We don't need to keep state here as we're scoring the words individually.
-  Model::State out;
-  return model.FullScore(model.NullContextState(), word_index, out).prob;
-}
-
-int generate_trie(const char* alphabet_path, const char* kenlm_path, const char* vocab_path, const char* trie_path) {
-  Alphabet a(alphabet_path);
-
-  lm::ngram::Config config;
-  config.load_method = util::POPULATE_OR_READ;
-  Model model(kenlm_path, config);
-  TrieNode root(a.GetSize());
-
-  std::ifstream ifs(vocab_path, std::ifstream::in | std::ios::binary);
-  if (!ifs) {
-    std::cerr << "unable to open vocabulary file " << vocab_path << std::endl;
-    return -1;
-  }
-
-  std::ofstream ofs(trie_path);
-  if (!ofs) {
-    std::cerr << "unable to open output file " << trie_path << std::endl;
-    return -1;
-  }
-
-  std::string word;
-  while (ifs >> word) {
-    lm::WordIndex word_index = GetWordIndex(model, word);
-    float unigram_score = ScoreWord(model, word_index);
-    root.Insert(word,
-                [&a](const std::string& c) {
-                  return a.LabelFromString(c);
-                },
-                word_index, unigram_score);
-  }
+using namespace std;
 
-  root.WriteToStream(ofs);
+int generate_trie(const char* alphabet_path, const char* kenlm_path, const char* trie_path) {
+  Alphabet alphabet(alphabet_path);
+  Scorer scorer(0.0, 0.0, kenlm_path, "", alphabet);
+  scorer.save_dictionary(trie_path);
   return 0;
 }
 
 int main(int argc, char** argv) {
-  if (argc != 5) {
-    std::cerr << "Usage: " << argv[0] << " <alphabet> <lm_model> <vocabulary> <trie_path>" << std::endl;
+  if (argc != 4) {
+    std::cerr << "Usage: " << argv[0] << " <alphabet> <lm_model> <trie_path>" << std::endl;
     return -1;
   }
 
-  return generate_trie(argv[1], argv[2], argv[3], argv[4]);
+  return generate_trie(argv[1], argv[2], argv[3]);
 }
diff --git a/native_client/trie_load.cc b/native_client/trie_load.cc
@@ -1,22 +1,24 @@
+#include <algorithm>
 #include <iostream>
-#include <memory>
 #include <string>
 
+#include "ctcdecode/scorer.h"
+#include "fst/fstlib.h"
 #include "alphabet.h"
-#include "trie_node.h"
+
+using namespace std;
+
 
 int main(int argc, char** argv)
 {
-  const char* trie_path     = argv[1];
-  const char* alphabet_path = argv[2];
+  const char* kenlm_path    = argv[1];
+  const char* trie_path     = argv[2];
+  const char* alphabet_path = argv[3];
 
   printf("Loading trie(%s) and alphabet(%s)\n", trie_path, alphabet_path);
 
-  Alphabet alphabet_ = Alphabet(alphabet_path);
-  TrieNode *trieRoot_;
-
-  std::ifstream in(trie_path, std::ios::in | std::ios::binary);
-  TrieNode::ReadFromStream(in, trieRoot_, alphabet_.GetSize());
+  Alphabet alphabet(alphabet_path);
+  Scorer scorer(0.0, 0.0, kenlm_path, trie_path, alphabet);
 
   return 0;
 }