Update CTC decoder docs and add citation (pytorch#2278)
Summary:
rendered:
- [tutorial](https://output.circle-artifacts.com/output/job/e7fb5a23-87cf-4dd5-b4a8-8b4f91e20eb4/artifacts/0/docs/tutorials/asr_inference_with_ctc_decoder_tutorial.html)
- [docs](https://output.circle-artifacts.com/output/job/e7fb5a23-87cf-4dd5-b4a8-8b4f91e20eb4/artifacts/0/docs/prototype.ctc_decoder.html)

Pull Request resolved: pytorch#2278

Reviewed By: mthrok

Differential Revision: D35097734

Pulled By: carolineechen

fbshipit-source-id: 1e5d5fff0b7740757cca358cf3ea44c6488fcd5c
Caroline Chen authored and xiaohui-zhang committed May 4, 2022
1 parent ef52592 commit de60215
Showing 4 changed files with 30 additions and 22 deletions.
5 changes: 5 additions & 0 deletions docs/source/prototype.ctc_decoder.rst
@@ -28,3 +28,8 @@ lexicon_decoder
~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: lexicon_decoder

References
----------

.. footbibliography::
6 changes: 6 additions & 0 deletions docs/source/refs.bib
@@ -261,3 +261,9 @@ @article{capon1969high
year={1969},
publisher={IEEE}
}
@article{kahn2022flashlight,
title={Flashlight: Enabling Innovation in Tools for Machine Learning},
author={Kahn, Jacob and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and others},
journal={arXiv preprint arXiv:2201.12465},
year={2022}
}
@@ -36,7 +36,10 @@
# highest scores at each time step. A language model can be incorporated into
# the scoring computation, and adding a lexicon constraint restricts the
# next possible tokens for the hypotheses so that only words from the lexicon
# can be generated. A mathematical formula for the decoder optimization can be
# can be generated.
#
# The underlying implementation is ported from `Flashlight <https://arxiv.org/pdf/2201.12465.pdf>`__'s
# beam search decoder. A mathematical formula for the decoder optimization can be
# found in the `Wav2Letter paper <https://arxiv.org/pdf/1609.03193.pdf>`__, and
# a more detailed algorithm can be found in this `blog
# <https://towardsdatascience.com/boosting-your-sequence-generation-performance-with-beam-search-language-model-decoding-74ee64de435a>`__.
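The collapse rule that this beam search generalizes can be illustrated with a toy greedy decoder (effectively beam size 1, with no language model or lexicon). This is a minimal sketch, not torchaudio's or Flashlight's implementation; it assumes integer token ids with blank id 0:

```python
def ctc_greedy_decode(emissions, blank=0):
    """Pick the highest-scoring token at each time step, then apply the
    CTC collapse rule: merge consecutive repeats and drop blank tokens."""
    best_path = [max(range(len(frame)), key=frame.__getitem__) for frame in emissions]
    decoded = []
    prev = None
    for tok in best_path:
        if tok != prev and tok != blank:
            decoded.append(tok)
        prev = tok
    return decoded
```

A beam search decoder instead keeps the ``nbest`` highest-scoring partial hypotheses at every step, folding in language-model and lexicon scores before pruning.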
36 changes: 15 additions & 21 deletions torchaudio/prototype/ctc_decoder/ctc_decoder.py
@@ -39,10 +39,21 @@ class Hypothesis(NamedTuple):
class LexiconDecoder:
"""torchaudio.prototype.ctc_decoder.LexiconDecoder()
Lexically constrained CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
Note:
To build the decoder, please use factory function
:py:func:`lexicon_decoder`.
To build the decoder, please use the factory function :py:func:`lexicon_decoder`.
Args:
nbest (int): number of best decodings to return
lexicon (Dict): lexicon mapping of words to spellings
word_dict (_Dictionary): dictionary of words
tokens_dict (_Dictionary): dictionary of tokens
lm (_LM): language model
decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding
blank_token (str): token corresponding to blank
sil_token (str): token corresponding to silence
unk_word (str): word corresponding to unknown
"""

def __init__(
@@ -57,24 +68,6 @@ def __init__(
sil_token: str,
unk_word: str,
) -> None:
"""
CTC Decoder with Lexicon constraint.
Note:
To build the decoder, please use the factory function lexicon_decoder.
Args:
nbest (int): number of best decodings to return
lexicon (Dict): lexicon mapping of words to spellings
word_dict (_Dictionary): dictionary of words
tokens_dict (_Dictionary): dictionary of tokens
lm (_LM): language model
decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding
blank_token (str): token corresponding to blank
sil_token (str): token corresponding to silence
unk_word (str): word corresponding to unknown
"""

self.nbest = nbest
self.word_dict = word_dict
self.tokens_dict = tokens_dict
@@ -196,7 +189,8 @@ def lexicon_decoder(
unk_word: str = "<unk>",
) -> LexiconDecoder:
"""
Builds Ken LM CTC Lexicon Decoder with given parameters
Builds lexically constrained CTC beam search decoder from
*Flashlight* [:footcite:`kahn2022flashlight`].
Args:
lexicon (str): lexicon file containing the possible words and corresponding spellings.
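For illustration, a Flashlight-style lexicon file maps each word to one or more token spellings, one entry per line. Below is a minimal sketch of parsing such lines into the word-to-spellings mapping described in the Args above; the sample words and the trailing ``|`` word-boundary token are illustrative assumptions, not values from this commit:

```python
def parse_lexicon(lines):
    """Parse Flashlight-style lexicon lines of the form
    'word token token ...'; a word may list multiple spellings."""
    lexicon = {}
    for line in lines:
        parts = line.split()
        if not parts:
            continue  # skip blank lines
        word, spelling = parts[0], parts[1:]
        lexicon.setdefault(word, []).append(spelling)
    return lexicon
```

The ``lexicon_decoder`` factory itself takes the path to such a file; restricting hypotheses to these spellings is what makes the decoder lexically constrained.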
