From de602159d1bb86106ffd2a911ad2ddca7c3d6d60 Mon Sep 17 00:00:00 2001
From: Caroline Chen
Date: Thu, 24 Mar 2022 06:50:35 -0700
Subject: [PATCH] Update CTC decoder docs and add citation (#2278)

Summary:
rendered:
- [tutorial](https://output.circle-artifacts.com/output/job/e7fb5a23-87cf-4dd5-b4a8-8b4f91e20eb4/artifacts/0/docs/tutorials/asr_inference_with_ctc_decoder_tutorial.html)
- [docs](https://output.circle-artifacts.com/output/job/e7fb5a23-87cf-4dd5-b4a8-8b4f91e20eb4/artifacts/0/docs/prototype.ctc_decoder.html)

Pull Request resolved: https://github.com/pytorch/audio/pull/2278

Reviewed By: mthrok

Differential Revision: D35097734

Pulled By: carolineechen

fbshipit-source-id: 1e5d5fff0b7740757cca358cf3ea44c6488fcd5c
---
 docs/source/prototype.ctc_decoder.rst          |  5 +++
 docs/source/refs.bib                           |  6 ++++
 ...asr_inference_with_ctc_decoder_tutorial.py  |  5 ++-
 .../prototype/ctc_decoder/ctc_decoder.py       | 36 ++++++++-----------
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/docs/source/prototype.ctc_decoder.rst b/docs/source/prototype.ctc_decoder.rst
index c82224ec1d..24f185fa40 100644
--- a/docs/source/prototype.ctc_decoder.rst
+++ b/docs/source/prototype.ctc_decoder.rst
@@ -28,3 +28,8 @@ lexicon_decoder
 ~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: lexicon_decoder
+
+References
+----------
+
+.. footbibliography::
diff --git a/docs/source/refs.bib b/docs/source/refs.bib
index 39fe61f08a..3f66a34ff6 100644
--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
@@ -261,3 +261,9 @@ @article{capon1969high
   year={1969},
   publisher={IEEE}
 }
+@article{kahn2022flashlight,
+  title={Flashlight: Enabling Innovation in Tools for Machine Learning},
+  author={Kahn, Jacob and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and others},
+  journal={arXiv preprint arXiv:2201.12465},
+  year={2022}
+}
diff --git a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
index 442d1ea7f5..6816225317 100644
--- a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
+++ b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
@@ -36,7 +36,10 @@
 # highest scores at each time step. A language model can be incorporated into
 # the scoring computation, and adding a lexicon constraint restricts the
 # next possible tokens for the hypotheses so that only words from the lexicon
-# can be generated. A mathematical formula for the decoder optimization can be
+# can be generated.
+#
+# The underlying implementation is ported from `Flashlight `__'s
+# beam search decoder. A mathematical formula for the decoder optimization can be
 # found in the `Wav2Letter paper `__, and
 # a more detailed algorithm can be found in this `blog
 # `__.
diff --git a/torchaudio/prototype/ctc_decoder/ctc_decoder.py b/torchaudio/prototype/ctc_decoder/ctc_decoder.py
index 567dcbe659..8cbc776370 100644
--- a/torchaudio/prototype/ctc_decoder/ctc_decoder.py
+++ b/torchaudio/prototype/ctc_decoder/ctc_decoder.py
@@ -39,10 +39,21 @@ class Hypothesis(NamedTuple):
 
 class LexiconDecoder:
     """torchaudio.prototype.ctc_decoder.LexiconDecoder()
 
+    Lexically constrained CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
+
     Note:
-        To build the decoder, please use factory function
-        :py:func:`lexicon_decoder`.
+        To build the decoder, please use the factory function :py:func:`lexicon_decoder`.
+    Args:
+        nbest (int): number of best decodings to return
+        lexicon (Dict): lexicon mapping of words to spellings
+        word_dict (_Dictionary): dictionary of words
+        tokens_dict (_Dictionary): dictionary of tokens
+        lm (_LM): language model
+        decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding
+        blank_token (str): token corresponding to blank
+        sil_token (str): token corresponding to silence
+        unk_word (str): word corresponding to unknown
     """
 
     def __init__(
@@ -57,24 +68,6 @@ def __init__(
         sil_token: str,
         unk_word: str,
     ) -> None:
-        """
-        CTC Decoder with Lexicon constraint.
-
-        Note:
-            To build the decoder, please use the factory function lexicon_decoder.
-
-        Args:
-            nbest (int): number of best decodings to return
-            lexicon (Dict): lexicon mapping of words to spellings
-            word_dict (_Dictionary): dictionary of words
-            tokens_dict (_Dictionary): dictionary of tokens
-            lm (_LM): language model
-            decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding
-            blank_token (str): token corresopnding to blank
-            sil_token (str): token corresponding to silence
-            unk_word (str): word corresponding to unknown
-        """
-
         self.nbest = nbest
         self.word_dict = word_dict
         self.tokens_dict = tokens_dict
@@ -196,7 +189,8 @@ def lexicon_decoder(
     unk_word: str = "",
 ) -> LexiconDecoder:
     """
-    Builds Ken LM CTC Lexicon Decoder with given parameters
+    Builds lexically constrained CTC beam search decoder from
+    *Flashlight* [:footcite:`kahn2022flashlight`].
 
     Args:
         lexicon (str): lexicon file containing the possible words and corresponding spellings.
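
For context, the factory function documented above can be exercised roughly as follows. This is a minimal sketch against the prototype API, not part of the diff: the file paths are placeholders, and the `tokens`/`lm` keyword arguments and the search parameters (`beam_size`, `lm_weight`, `word_score`) follow the linked tutorial rather than this patch, so treat them as assumptions.

```python
import torch
from torchaudio.prototype.ctc_decoder import lexicon_decoder

# Placeholder input files (assumptions, not part of this patch):
# a lexicon mapping words to token spellings, the acoustic model's
# token set, and a KenLM language model.
LEXICON = "lexicon.txt"
TOKENS = "tokens.txt"
KENLM = "lm.bin"

# Build the lexically constrained beam search decoder.
decoder = lexicon_decoder(
    lexicon=LEXICON,
    tokens=TOKENS,
    lm=KENLM,
    nbest=3,           # number of best hypotheses to return
    beam_size=1500,    # search width; value borrowed from the tutorial
    lm_weight=3.23,    # language model weight (illustrative tuning value)
    word_score=-0.26,  # word insertion score (illustrative tuning value)
)

# `emission` stands in for CTC acoustic model output of shape
# (batch, frames, num_tokens), given as token log-probabilities.
emission = torch.randn(1, 100, 32).log_softmax(dim=-1)

# Decoding returns, per utterance, a list of `nbest` hypotheses.
hypotheses = decoder(emission)
best = hypotheses[0][0]
print(" ".join(best.words).strip(), best.score)
```

The lexicon file is what restricts the search to in-vocabulary words, which is what distinguishes this decoder from a plain greedy (argmax) CTC decoder.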