From c428acf478bc2e0ea18da69e02554e095f8654b5 Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Tue, 31 Mar 2020 17:55:48 +0200
Subject: [PATCH] Automatically install ds_ctcdecoder in setup.py

---
 doc/TRAINING.rst                              |  11 +-
 setup.py                                      | 101 ++++++++++++++----
 taskcluster/tc-all-utils.sh                   |  22 ----
 taskcluster/tc-single-shot-inference.sh       |   3 -
 taskcluster/tc-train-tests.sh                 |   7 --
 taskcluster/tc-transfer-tests.sh              |   9 +-
 .../deepspeech_training/util/taskcluster.py   |  28 -----
 7 files changed, 86 insertions(+), 95 deletions(-)

diff --git a/doc/TRAINING.rst b/doc/TRAINING.rst
index 99444df94d..2ef1512ee1 100644
--- a/doc/TRAINING.rst
+++ b/doc/TRAINING.rst
@@ -46,7 +46,8 @@ Install the required dependencies using ``pip3``\ :
 .. code-block:: bash
 
    cd DeepSpeech
-   pip3 install -e .
+   pip3 install --upgrade pip wheel setuptools
+   pip3 install --upgrade --force-reinstall -e .
 
 The ``webrtcvad`` Python package might require you to ensure you have proper tooling to build Python modules:
 
@@ -54,14 +55,6 @@ The ``webrtcvad`` Python package might require you to ensure you have proper too
 
    sudo apt-get install python3-dev
 
-You'll also need to install the ``ds_ctcdecoder`` Python package. ``ds_ctcdecoder`` is required for decoding the outputs of the ``deepspeech`` acoustic model into text. You can use ``util/taskcluster.py`` with the ``--decoder`` flag to get a URL to a binary of the decoder package appropriate for your platform and Python version:
-
-.. code-block:: bash
-
-   pip3 install $(python3 util/taskcluster.py --decoder)
-
-This command will download and install the ``ds_ctcdecoder`` package. You can override the platform with ``--arch`` if you want the package for ARM7 (\ ``--arch arm``\ ) or ARM64 (\ ``--arch arm64``\ ). If you prefer building the ``ds_ctcdecoder`` package from source, see the :github:`native_client README file <native_client/README.rst>`.
-
 Recommendations
 ^^^^^^^^^^^^^^^
 
diff --git a/setup.py b/setup.py
index d50dc60181..bdd5682d11 100644
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,95 @@
+import os
+import platform
+import sys
 from pathlib import Path
 
+from pkg_resources import parse_version
 from setuptools import find_packages, setup
 
 
+def get_decoder_pkg_url(version, artifacts_root=None):
+    """Return the PEP 508 ``ds_ctcdecoder @ <url>`` requirement for this host.
+
+    ``version`` is the VERSION file content: it names the TaskCluster branch
+    (``v<version>``) and, normalized by ``parse_version``, the wheel version.
+    A truthy ``artifacts_root`` replaces the default TaskCluster index URL.
+    """
+    machine = platform.machine()
+    # 64-bit ARM Linux reports 'aarch64', which does not contain 'arm'.
+    is_arm = 'arm' in machine or 'aarch64' in machine
+    is_64bit = sys.maxsize > (2**31 - 1)
+
+    if is_arm:
+        tc_arch = 'arm64-ctc' if is_64bit else 'arm-ctc'
+    elif 'darwin' in sys.platform:
+        tc_arch = 'osx-ctc'
+    else:
+        tc_arch = 'cpu-ctc'
+
+    plat = platform.system().lower()
+    if plat == 'linux' and machine == 'x86_64':
+        plat = 'manylinux1'
+    elif plat == 'darwin':
+        plat = 'macosx_10_10'
+
+    # CPython 3.8 dropped the 'm' ABI flag: wheels are tagged cp38-cp38, not
+    # cp38-cp38m. Older interpreters keep the original flag selection.
+    if sys.version_info >= (3, 8):
+        abi_flags = ''
+    else:
+        abi_flags = 'mu' if sys.maxunicode < 0x10ffff else 'm'
+    pyver = ''.join(str(i) for i in sys.version_info[0:2])
+
+    if not artifacts_root:
+        artifacts_root = 'https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.deepspeech.native_client.{branch_name}.{tc_arch_string}/artifacts/public'.format(
+            branch_name='v{}'.format(version), tc_arch_string=tc_arch)
+
+    return 'ds_ctcdecoder @ {artifacts_root}/ds_ctcdecoder-{ds_version}-cp{pyver}-cp{pyver}{m_or_mu}-{platform}_{arch}.whl'.format(
+        artifacts_root=artifacts_root,
+        ds_version=parse_version(version),
+        pyver=pyver, m_or_mu=abi_flags, platform=plat, arch=machine,
+    )
+
+
 def main():
     version_file = Path(__file__).parent / 'VERSION'
     with open(str(version_file)) as fin:
         version = fin.read().strip()
 
+    decoder_pkg_url = get_decoder_pkg_url(version)  # default: TaskCluster index URL for this version
+
+    install_requires_base = [
+        'tensorflow == 1.15.2',
+        'numpy == 1.18.1',
+        'progressbar2',
+        'six',
+        'pyxdg',
+        'attrdict',
+        'absl-py',
+        'semver',
+        'opuslib == 2.0.0',
+        'optuna',
+        'sox',
+        'bs4',
+        'pandas',
+        'requests',
+        'librosa',
+        'soundfile',
+    ]
+
+    # pip offers no consistent way to pass options to this script, so environment
+    # variables select how (and whether) ds_ctcdecoder is added as a dependency.
+    tc_decoder_artifacts_root = os.environ.get('DECODER_ARTIFACTS_ROOT', '')
+    if tc_decoder_artifacts_root:
+        # We're running inside the TaskCluster environment: point the requirement
+        # at the decoder package built there. This branch wins over DS_NODECODER.
+        decoder_pkg_url = get_decoder_pkg_url(version, tc_decoder_artifacts_root)
+        install_requires = install_requires_base + [decoder_pkg_url]
+    elif os.environ.get('DS_NODECODER', ''):  # any non-empty value skips the decoder
+        install_requires = install_requires_base
+    else:
+        install_requires = install_requires_base + [decoder_pkg_url]
+
     setup(
         name='deepspeech_training',
         version=version,
@@ -28,24 +110,7 @@ def main():
         package_dir={'': 'training'},
         packages=find_packages(where='training'),
         python_requires='>=3.5, <4',
-        install_requires=[
-            'tensorflow == 1.15.2',
-            'numpy == 1.18.1',
-            'progressbar2',
-            'six',
-            'pyxdg',
-            'attrdict',
-            'absl-py',
-            'semver',
-            'opuslib == 2.0.0',
-            'optuna',
-            'sox',
-            'bs4',
-            'pandas',
-            'requests',
-            'librosa',
-            'soundfile',
-        ],
+        install_requires=install_requires,
         # If there are data files included in your packages that need to be
         # installed, specify them here.
         package_data={
diff --git a/taskcluster/tc-all-utils.sh b/taskcluster/tc-all-utils.sh
index 6863c45194..1ef5b32599 100755
--- a/taskcluster/tc-all-utils.sh
+++ b/taskcluster/tc-all-utils.sh
@@ -122,25 +122,3 @@ verify_bazel_rebuild()
     exit 1
   fi;
 }
-
-# Should be called from context where Python virtualenv is set
-verify_ctcdecoder_url()
-{
-  default_url=$(python util/taskcluster.py --decoder)
-  echo "${default_url}" | grep -F "deepspeech.native_client.v${DS_VERSION}"
-  rc_default_url=$?
-
-  tag_url=$(python util/taskcluster.py --decoder --branch 'v1.2.3')
-  echo "${tag_url}" | grep -F "deepspeech.native_client.v1.2.3"
-  rc_tag_url=$?
-
-  master_url=$(python util/taskcluster.py --decoder --branch 'master')
-  echo "${master_url}" | grep -F "deepspeech.native_client.master"
-  rc_master_url=$?
-
-  if [ ${rc_default_url} -eq 0 -a ${rc_tag_url} -eq 0 -a ${rc_master_url} -eq 0 ]; then
-    return 0
-  else
-    return 1
-  fi;
-}
diff --git a/taskcluster/tc-single-shot-inference.sh b/taskcluster/tc-single-shot-inference.sh
index e058cecc39..13a8605b78 100755
--- a/taskcluster/tc-single-shot-inference.sh
+++ b/taskcluster/tc-single-shot-inference.sh
@@ -22,9 +22,6 @@ pushd ${HOME}/DeepSpeech/ds
 popd
 set +o pipefail
 
-decoder_pkg_url=$(get_python_pkg_url ${pyver_pkg} ${py_unicode_type} "ds_ctcdecoder" "${DECODER_ARTIFACTS_ROOT}")
-LD_LIBRARY_PATH=${PY37_LDPATH}:$LD_LIBRARY_PATH pip install --verbose --only-binary :all: --upgrade ${decoder_pkg_url} | cat
-
 pushd ${HOME}/DeepSpeech/ds/
     time ./bin/run-tc-ldc93s1_singleshotinference.sh
 popd
diff --git a/taskcluster/tc-train-tests.sh b/taskcluster/tc-train-tests.sh
index e5220df51a..c8708f50e8 100644
--- a/taskcluster/tc-train-tests.sh
+++ b/taskcluster/tc-train-tests.sh
@@ -21,13 +21,6 @@ pushd ${HOME}/DeepSpeech/ds
 popd
 set +o pipefail
 
-pushd ${HOME}/DeepSpeech/ds/
-    verify_ctcdecoder_url
-popd
-
-decoder_pkg_url=$(get_python_pkg_url ${pyver_pkg} ${py_unicode_type} "ds_ctcdecoder" "${DECODER_ARTIFACTS_ROOT}")
-LD_LIBRARY_PATH=${PY37_LDPATH}:$LD_LIBRARY_PATH pip install --verbose --only-binary :all: ${PY37_SOURCE_PACKAGE} ${decoder_pkg_url} | cat
-
 # Prepare correct arguments for training
 case "${bitrate}" in
     8k)
diff --git a/taskcluster/tc-transfer-tests.sh b/taskcluster/tc-transfer-tests.sh
index 382d4c91b9..3f9894cef4 100644
--- a/taskcluster/tc-transfer-tests.sh
+++ b/taskcluster/tc-transfer-tests.sh
@@ -15,17 +15,10 @@ virtualenv_activate "${pyalias}" "deepspeech"
 set -o pipefail
 pip install --upgrade pip==19.3.1 setuptools==45.0.0 wheel==0.33.6 | cat
 pushd ${HOME}/DeepSpeech/ds
-    pip install --upgrade . | cat
+    DS_NODECODER=1 pip install --upgrade . | cat
 popd
 set +o pipefail
 
-pushd ${HOME}/DeepSpeech/ds/
-    verify_ctcdecoder_url
-popd
-
-decoder_pkg_url=$(get_python_pkg_url ${pyver_pkg} ${py_unicode_type} "ds_ctcdecoder" "${DECODER_ARTIFACTS_ROOT}")
-LD_LIBRARY_PATH=${PY37_LDPATH}:$LD_LIBRARY_PATH pip install --verbose --only-binary :all: ${PY37_SOURCE_PACKAGE} ${decoder_pkg_url} | cat
-
 pushd ${HOME}/DeepSpeech/ds/
     time ./bin/run-tc-transfer.sh
 popd
diff --git a/training/deepspeech_training/util/taskcluster.py b/training/deepspeech_training/util/taskcluster.py
index 0fbf6fae28..d0053c7dce 100644
--- a/training/deepspeech_training/util/taskcluster.py
+++ b/training/deepspeech_training/util/taskcluster.py
@@ -90,8 +90,6 @@ def main():
                         help='Name of the TaskCluster scheme to use.')
     parser.add_argument('--branch', required=False,
                         help='Branch name to use. Defaulting to current content of VERSION file.')
-    parser.add_argument('--decoder', action='store_true',
-                        help='Get URL to ds_ctcdecoder Python package.')
 
     args = parser.parse_args()
 
@@ -119,32 +117,6 @@ def main():
     else:
         ds_version = parse_version(args.branch)
 
-    if args.decoder:
-        plat = platform.system().lower()
-        arch = platform.machine()
-
-        if plat == 'linux' and arch == 'x86_64':
-            plat = 'manylinux1'
-
-        if plat == 'darwin':
-            plat = 'macosx_10_10'
-
-        m_or_mu = 'mu' if is_ucs2 else 'm'
-        pyver = ''.join(map(str, sys.version_info[0:2]))
-
-        artifact = "ds_ctcdecoder-{ds_version}-cp{pyver}-cp{pyver}{m_or_mu}-{platform}_{arch}.whl".format(
-            ds_version=ds_version,
-            pyver=pyver,
-            m_or_mu=m_or_mu,
-            platform=plat,
-            arch=arch
-        )
-
-        ctc_arch = args.arch + '-ctc'
-
-        print(get_tc_url(ctc_arch, artifact, args.branch))
-        sys.exit(0)
-
     if args.source is not None:
         if args.source in DEFAULT_SCHEMES:
             global TASKCLUSTER_SCHEME