Remove old CTC decoder (Fixes #1675) #1696

Merged · 13 commits · Nov 12, 2018
3 changes: 1 addition & 2 deletions .compute
@@ -21,5 +21,4 @@ python3 -u DeepSpeech.py \
--display_step 0 \
--validation_step 1 \
--checkpoint_dir "../keep" \
--summary_dir "../keep/summaries" \
--decoder_library_path "../tmp/native_client/libctc_decoder_with_kenlm.so"
--summary_dir "../keep/summaries"
1 change: 0 additions & 1 deletion .gitattributes
@@ -1,4 +1,3 @@
*.binary filter=lfs diff=lfs merge=lfs -crlf
data/lm/trie filter=lfs diff=lfs merge=lfs -crlf
data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text
data/lm/trie.ctcdecode filter=lfs diff=lfs merge=lfs -text
6 changes: 6 additions & 0 deletions .install
@@ -7,4 +7,10 @@ pip install tensorflow-gpu==1.12.0rc2

python3 util/taskcluster.py --arch gpu --target ../tmp/native_client

# Install ds_ctcdecoder package from TaskCluster
VERSION=$(python -c 'import pkg_resources; print(pkg_resources.safe_version(open("VERSION").read()))')
PYVER=$(python -c 'import sys; print("cp{0}{1}-cp{0}{1}m".format(sys.version_info.major, sys.version_info.minor))')
python3 util/taskcluster.py --arch cpu --target ../tmp --artifact "ds_ctcdecoder-${VERSION}-${PYVER}-manylinux1_x86_64.whl"
pip install ../tmp/ds_ctcdecoder-*.whl

mkdir -p ../keep/summaries
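
For illustration, here is what the two substitutions above resolve to on a hypothetical setup (a VERSION file containing `0.4.0a0` and a CPython 3.6 interpreter — both assumptions, not values from this PR):

```bash
# Hypothetical example values, not taken from this PR:
VERSION=0.4.0a0    # pkg_resources.safe_version() normalizes the VERSION file contents
PYVER=cp36-cp36m   # "cp{major}{minor}-cp{major}{minor}m" for CPython 3.6

# The artifact fetched from TaskCluster would then be:
#   ds_ctcdecoder-0.4.0a0-cp36-cp36m-manylinux1_x86_64.whl
```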
1,407 changes: 142 additions & 1,265 deletions DeepSpeech.py

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions Dockerfile
@@ -165,9 +165,6 @@ RUN ./configure
# passing LD_LIBRARY_PATH is required because Bazel doesn't pick it up from the environment


# Build LM Prefix Decoder, CPU only - no need for CUDA flag
RUN bazel build -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx //native_client:libctc_decoder_with_kenlm.so --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}

# Build DeepSpeech
RUN bazel build --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-mtune=generic --copt=-march=x86-64 --copt=-msse --copt=-msse2 --copt=-msse3 --copt=-msse4.1 --copt=-msse4.2 --copt=-mavx --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie --verbose_failures --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}

@@ -184,8 +181,7 @@ RUN bazel build --config=monolithic --config=cuda -c opt --copt=-O3 --copt="-D_G
# RUN pip install /tmp/tensorflow_pkg/*.whl

# Copy built libs to /DeepSpeech/native_client
RUN cp /tensorflow/bazel-bin/native_client/libctc_decoder_with_kenlm.so /DeepSpeech/native_client/ \
&& cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_client/ \
RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_client/ \
&& cp /tensorflow/bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/

# Install TensorFlow
@@ -200,6 +196,9 @@ RUN make deepspeech
WORKDIR /DeepSpeech/native_client/python
RUN make bindings
RUN pip install dist/deepspeech*
WORKDIR /DeepSpeech/native_client/ctcdecode
RUN make
RUN pip install dist/*.whl


# << END Build and bind
6 changes: 3 additions & 3 deletions README.md
@@ -213,13 +213,13 @@ cd DeepSpeech
pip3 install -r requirements.txt
```

You'll also need to download `native_client.tar.xz` or build the native client files yourself to get the custom TensorFlow OP needed for decoding the outputs of the neural network. You can use `util/taskcluster.py` to download the files for your architecture:
You'll also need to install the `ds_ctcdecoder` Python package, which is required for decoding the outputs of the acoustic model into text. We have binaries available in our CI infrastructure; you can use `util/taskcluster.py` to get a URL to the decoder package. When you pass the `--decoder` option, the script will print the URL of the appropriate decoder package for your platform and Python version:

```bash
python3 util/taskcluster.py --target .
pip3 install $(python3 util/taskcluster.py --decoder)
```

This will download the native client files for the x86_64 architecture without CUDA support, and extract them into the current folder. If you prefer building the binaries from source, see the [native_client README file](native_client/README.md). We also have binaries with CUDA enabled ("--arch gpu") and for ARM7 ("--arch arm").
This command will download and install the `ds_ctcdecoder` package. If you prefer building the binaries from source, see the [native_client README file](native_client/README.md). You can override the platform with `--arch` if you want the package for ARM7 (`--arch arm`) or ARM64 (`--arch arm64`).
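
As a sketch of that override (assuming the `--decoder` and `--arch` flags combine as described above):

```bash
# Hypothetical invocations — adjust to your platform:
pip3 install $(python3 util/taskcluster.py --decoder --arch arm)    # ARM7
pip3 install $(python3 util/taskcluster.py --decoder --arch arm64)  # ARM64
```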

### Recommendations

1 change: 0 additions & 1 deletion bin/run-tc-ldc93s1_checkpoint.sh
@@ -19,7 +19,6 @@ python -u DeepSpeech.py --noshow_progressbar \
--n_hidden 494 --epoch -1 --random_seed 4567 --default_stddev 0.046875 \
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
--learning_rate 0.001 --dropout_rate 0.05 \
--decoder_library_path '/tmp/ds/libctc_decoder_with_kenlm.so' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' | tee /tmp/resume.log

1 change: 0 additions & 1 deletion bin/run-tc-ldc93s1_new.sh
@@ -20,6 +20,5 @@ python -u DeepSpeech.py \
--default_stddev 0.046875 --max_to_keep 1 \
--checkpoint_dir '/tmp/ckpt' \
--learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \
--decoder_library_path '/tmp/ds/libctc_decoder_with_kenlm.so' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' \
2 changes: 0 additions & 2 deletions bin/run-tc-ldc93s1_singleshotinference.sh
@@ -17,7 +17,6 @@ python -u DeepSpeech.py \
--n_hidden 494 --epoch 1 --random_seed 4567 --default_stddev 0.046875 \
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
--learning_rate 0.001 --dropout_rate 0.05 \
--decoder_library_path '/tmp/ds/libctc_decoder_with_kenlm.so' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie'

@@ -28,7 +27,6 @@ python -u DeepSpeech.py \
--n_hidden 494 --epoch 1 --random_seed 4567 --default_stddev 0.046875 \
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' --checkpoint_secs 0 \
--learning_rate 0.001 --dropout_rate 0.05 \
--decoder_library_path '/tmp/ds/libctc_decoder_with_kenlm.so' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' \
--one_shot_infer 'data/smoke_test/LDC93S1.wav'
1 change: 0 additions & 1 deletion bin/run-tc-ldc93s1_tflite.sh
@@ -14,7 +14,6 @@ python -u DeepSpeech.py \
--n_hidden 494 \
--checkpoint_dir '/tmp/ckpt' \
--export_dir '/tmp/train' \
--decoder_library_path '/tmp/ds/libctc_decoder_with_kenlm.so' \
--lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
--lm_trie_path 'data/smoke_test/vocab.trie' \
--notrain --notest \
4 changes: 2 additions & 2 deletions data/lm/trie
Git LFS file not shown
3 changes: 0 additions & 3 deletions data/lm/trie.ctcdecode

This file was deleted.

3 changes: 0 additions & 3 deletions data/lm/vocab.txt

This file was deleted.

Binary file modified data/smoke_test/vocab.trie
Binary file not shown.
Binary file removed data/smoke_test/vocab.trie.ctcdecode
Binary file not shown.
124 changes: 67 additions & 57 deletions evaluate.py
@@ -15,18 +15,14 @@
from attrdict import AttrDict
from collections import namedtuple
from ds_ctcdecoder import ctc_beam_search_decoder_batch, Scorer
from DeepSpeech import initialize_globals, create_flags, log_debug, log_info, log_warn, log_error, create_inference_graph
from multiprocessing import Pool
from multiprocessing import Pool, cpu_count
from six.moves import zip, range
from util.audio import audiofile_to_input_vector
from util.text import Alphabet, ctc_label_dense_to_sparse, wer, levenshtein
from util.config import Config, initialize_globals
from util.flags import create_flags, FLAGS
from util.logging import log_error
from util.preprocess import pmap, preprocess


FLAGS = tf.app.flags.FLAGS

N_FEATURES = 26
N_CONTEXT = 9
from util.text import Alphabet, ctc_label_dense_to_sparse, wer, levenshtein


def split_data(dataset, batch_size):
@@ -86,41 +82,21 @@ def calculate_report(labels, decodings, distances, losses):
return samples_wer, samples


def main(_):
initialize_globals()

if not FLAGS.test_files:
log_error('You need to specify what files to use for evaluation via '
'the --test_files flag.')
exit(1)

global alphabet
alphabet = Alphabet(FLAGS.alphabet_config_path)

scorer = Scorer(FLAGS.lm_weight, FLAGS.valid_word_count_weight,
def evaluate(test_data, inference_graph, alphabet):
scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
FLAGS.lm_binary_path, FLAGS.lm_trie_path,
alphabet)
Config.alphabet)

# sort examples by length, improves packing of batches and timesteps
test_data = preprocess(
FLAGS.test_files.split(','),
FLAGS.test_batch_size,
alphabet=alphabet,
numcep=N_FEATURES,
numcontext=N_CONTEXT,
hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
by="features_len",
ascending=False)

def create_windows(features):
num_strides = len(features) - (N_CONTEXT * 2)
num_strides = len(features) - (Config.n_context * 2)

# Create a view into the array with overlapping strides of size
# numcontext (past) + 1 (present) + numcontext (future)
window_size = 2*N_CONTEXT+1
window_size = 2*Config.n_context+1
features = np.lib.stride_tricks.as_strided(
features,
(num_strides, window_size, N_FEATURES),
(num_strides, window_size, Config.n_input),
(features.strides[0], features.strides[0], features.strides[1]),
writeable=False)

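As an aside for readers of `create_windows`: the `as_strided` call builds overlapping context windows without copying the feature matrix. A minimal sketch with toy sizes (the real values are `n_context=9` and `n_input=26`, per the `N_CONTEXT`/`N_FEATURES` constants removed above):

```python
import numpy as np

n_context, n_input = 2, 3  # toy sizes; DeepSpeech uses 9 and 26
features = np.arange(10 * n_input, dtype=np.float32).reshape(10, n_input)

num_strides = len(features) - 2 * n_context
window_size = 2 * n_context + 1  # past + present + future frames
windows = np.lib.stride_tricks.as_strided(
    features,
    (num_strides, window_size, n_input),
    (features.strides[0], features.strides[0], features.strides[1]),
    writeable=False)

print(windows.shape)  # (6, 5, 3): row i is a zero-copy view of features[i:i+5]
```
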
@@ -129,8 +105,8 @@ def create_windows(features):
# Create overlapping windows over the features
test_data['features'] = test_data['features'].apply(create_windows)

with tf.Session() as session:
inputs, outputs, layers = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)
with tf.Session(config=Config.session_config) as session:
inputs, outputs, layers = inference_graph

# Transpose to batch major for decoder
transposed = tf.transpose(outputs['outputs'], [1, 0, 2])
@@ -183,26 +159,29 @@ def create_windows(features):
logitses.append(logits)
losses.extend(loss)

ground_truths = []
predictions = []
distances = []
ground_truths = []
predictions = []

print('Decoding predictions...')
bar = progressbar.ProgressBar(max_value=batch_count,
widget=progressbar.AdaptiveETA)
print('Decoding predictions...')
bar = progressbar.ProgressBar(max_value=batch_count,
widget=progressbar.AdaptiveETA)

# Get number of accessible CPU cores for this process
num_processes = len(os.sched_getaffinity(0))
# Get number of accessible CPU cores for this process
try:
num_processes = cpu_count()
except:
num_processes = 1

# Second pass, decode logits and compute WER and edit distance metrics
for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
seq_lengths = batch['features_len'].values.astype(np.int32)
decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
num_processes=num_processes, scorer=scorer)
# Second pass, decode logits and compute WER and edit distance metrics
for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
seq_lengths = batch['features_len'].values.astype(np.int32)
decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
num_processes=num_processes, scorer=scorer)

ground_truths.extend(alphabet.decode(l) for l in batch['transcript'])
predictions.extend(d[0][1] for d in decoded)
distances.extend(levenshtein(a, b) for a, b in zip(labels, predictions))
ground_truths.extend(alphabet.decode(l) for l in batch['transcript'])
predictions.extend(d[0][1] for d in decoded)

distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

wer, samples = calculate_report(ground_truths, predictions, distances, losses)
mean_edit_distance = np.mean(distances)
@@ -211,17 +190,48 @@ def create_windows(features):
# Take only the first report_count items
report_samples = itertools.islice(samples, FLAGS.report_count)

print('Test - WER: %f, loss: %f, mean edit distance: %f' %
(wer, mean_loss, mean_edit_distance))
print('Test - WER: %f, CER: %f, loss: %f' %
(wer, mean_edit_distance, mean_loss))
print('-' * 80)
for sample in report_samples:
print('WER: %f, loss: %f, edit distance: %f' %
(sample.wer, sample.loss, sample.distance))
print('WER: %f, CER: %f, loss: %f' %
(sample.wer, sample.distance, sample.loss))
print(' - src: "%s"' % sample.src)
print(' - res: "%s"' % sample.res)
print('-' * 80)

return samples


def main(_):
initialize_globals()

if not FLAGS.test_files:
log_error('You need to specify what files to use for evaluation via '
'the --test_files flag.')
exit(1)

global alphabet
alphabet = Alphabet(FLAGS.alphabet_config_path)

# sort examples by length, improves packing of batches and timesteps
test_data = preprocess(
FLAGS.test_files.split(','),
FLAGS.test_batch_size,
alphabet=alphabet,
numcep=Config.n_input,
numcontext=Config.n_context,
hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
by="features_len",
ascending=False)

from DeepSpeech import create_inference_graph
graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

samples = evaluate(test_data, graph, alphabet)

if FLAGS.test_output_file:
# Save decoded tuples as JSON, converting NumPy floats to Python floats
json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))


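For readers following the `evaluate.py` changes, here is a condensed sketch of the new decoding path. The `Scorer` and `ctc_beam_search_decoder_batch` call shapes are copied from the diff above; the paths, hyperparameters, and dummy probabilities are placeholder assumptions:

```python
import numpy as np
from ds_ctcdecoder import ctc_beam_search_decoder_batch, Scorer
from util.text import Alphabet

# Placeholder paths and LM weights — adjust to your setup.
alphabet = Alphabet('data/alphabet.txt')
scorer = Scorer(0.75, 1.85,  # lm_alpha, lm_beta
                'data/lm/lm.binary', 'data/lm/trie',
                alphabet)

# Batch-major acoustic-model outputs: (batch, time, len(alphabet) + 1 for blank).
probs = np.random.rand(2, 100, 29).astype(np.float32)
probs /= probs.sum(axis=2, keepdims=True)  # rows sum to 1, like softmax output
seq_lengths = np.array([100, 80], dtype=np.int32)

decoded = ctc_beam_search_decoder_batch(probs, seq_lengths, alphabet, 500,
                                        num_processes=2, scorer=scorer)

# Each batch element yields (score, transcript) candidates, best first;
# evaluate.py takes d[0][1] for the top transcript.
for candidates in decoded:
    print(candidates[0][1])
```
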
21 changes: 1 addition & 20 deletions native_client/BUILD
@@ -12,11 +12,10 @@

KENLM_SOURCES = glob(["kenlm/lm/*.cc", "kenlm/util/*.cc", "kenlm/util/double-conversion/*.cc",
"kenlm/lm/*.hh", "kenlm/util/*.hh", "kenlm/util/double-conversion/*.h"],
exclude = ["kenlm/*/*test.cc", "kenlm/*/*main.cc"]) + glob(["boost_locale/**/*.hpp"])
exclude = ["kenlm/*/*test.cc", "kenlm/*/*main.cc"])

KENLM_INCLUDES = [
"kenlm",
"boost_locale"
]

DECODER_SOURCES = glob([
@@ -102,24 +101,6 @@ tf_cc_shared_object(
defines = ["KENLM_MAX_ORDER=6"],
)

tf_cc_shared_object(
name = "libctc_decoder_with_kenlm.so",
srcs = [
"beam_search.cc",
"beam_search.h",
"alphabet.h",
"trie_node.h"
] +
KENLM_SOURCES,
includes = KENLM_INCLUDES,
copts = ["-std=c++11"],
defines = ["KENLM_MAX_ORDER=6"],
deps = ["//tensorflow/core:framework_headers_lib",
"//tensorflow/core/util/ctc",
"//third_party/eigen3",
],
)

cc_binary(
name = "generate_trie",
srcs = [
19 changes: 13 additions & 6 deletions native_client/README.md
@@ -19,8 +19,6 @@ This will download and extract `native_client.tar.xz` which includes the deepspe

If you want the CUDA-capable version of the binaries, use `--arch gpu`. Note that for now we don't publish CUDA-capable macOS binaries.

If you're looking to train a model, you now have a `libctc_decoder_with_kenlm.so` file that you can pass to the `--decoder_library_path` parameter of `DeepSpeech.py`.

## Required Dependencies

Running inference might require some runtime dependencies to be already installed on your system. Those are the same regardless of which bindings you use:
@@ -57,7 +55,7 @@ If you'd like to build the binaries yourself, you'll need the following pre-requi

It is required to use our fork of TensorFlow since it includes fixes for common problems encountered when building the native client files.

If you'd like to build the language bindings, you'll also need:
If you'd like to build the language bindings or the decoder package, you'll also need:

* [SWIG](http://www.swig.org/)
* [node-pre-gyp](https://github.com/mapbox/node-pre-gyp) (for Node.JS bindings only)
@@ -77,10 +75,9 @@ Before building the DeepSpeech client libraries, you will need to prepare your e
Preferably, check out the version of TensorFlow currently supported by DeepSpeech (see requirements.txt), and use the Bazel version recommended by TensorFlow for that version.
Then, follow the [instructions](https://www.tensorflow.org/install/install_sources) on the TensorFlow site for your platform, up to the end of 'Configure the installation'.

After that, you can build the Tensorflow and DeepSpeech libraries using the following commands. Please note that the flags for `libctc_decoder_with_kenlm.so` differs a little bit.
After that, you can build the Tensorflow and DeepSpeech libraries using the following command.

```
bazel build -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" //native_client:libctc_decoder_with_kenlm.so
bazel build --config=monolithic -c opt --copt=-O3 --copt="-D_GLIBCXX_USE_CXX11_ABI=0" --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie
```

@@ -118,7 +115,7 @@ Included are a set of generated Python bindings. After following the above build
```
cd native_client/python
make bindings
sudo pip install dist/deepspeech*
pip install dist/deepspeech*
```

The API mirrors the C++ API and is demonstrated in [client.py](python/client.py). Refer to [deepspeech.h](deepspeech.h) for documentation.
@@ -134,3 +131,13 @@ make npm-pack
```

This will create the package `deepspeech-0.3.0.tgz` in `native_client/javascript`.

## Building the CTC decoder package

To build the `ds_ctcdecoder` package, you'll need the general requirements listed above (in particular SWIG). The command below builds the bindings using 8 processes for compilation; adjust the `NUM_PROCESSES` parameter for more or less parallelism.

```
cd native_client/ctcdecode
make bindings NUM_PROCESSES=8
pip install dist/*.whl
```
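
After installing the wheel, a quick sanity check (using the import names from `evaluate.py` above):

```bash
python3 -c "from ds_ctcdecoder import ctc_beam_search_decoder_batch, Scorer; print('ds_ctcdecoder OK')"
```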