From de602159d1bb86106ffd2a911ad2ddca7c3d6d60 Mon Sep 17 00:00:00 2001
From: Caroline Chen
Date: Thu, 24 Mar 2022 06:50:35 -0700
Subject: [PATCH] Update CTC decoder docs and add citation (#2278)

Summary:
rendered:
- [tutorial](https://output.circle-artifacts.com/output/job/e7fb5a23-87cf-4dd5-b4a8-8b4f91e20eb4/artifacts/0/docs/tutorials/asr_inference_with_ctc_decoder_tutorial.html)
- [docs](https://output.circle-artifacts.com/output/job/e7fb5a23-87cf-4dd5-b4a8-8b4f91e20eb4/artifacts/0/docs/prototype.ctc_decoder.html)

Pull Request resolved: https://github.com/pytorch/audio/pull/2278

Reviewed By: mthrok

Differential Revision: D35097734

Pulled By: carolineechen

fbshipit-source-id: 1e5d5fff0b7740757cca358cf3ea44c6488fcd5c
---
 docs/source/prototype.ctc_decoder.rst          |  5 +++
 docs/source/refs.bib                           |  6 ++++
 ...asr_inference_with_ctc_decoder_tutorial.py  |  5 ++-
 .../prototype/ctc_decoder/ctc_decoder.py       | 36 ++++++++-----------
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/docs/source/prototype.ctc_decoder.rst b/docs/source/prototype.ctc_decoder.rst
index c82224ec1d..24f185fa40 100644
--- a/docs/source/prototype.ctc_decoder.rst
+++ b/docs/source/prototype.ctc_decoder.rst
@@ -28,3 +28,8 @@ lexicon_decoder
 ~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: lexicon_decoder
+
+References
+----------
+
+.. footbibliography::
diff --git a/docs/source/refs.bib b/docs/source/refs.bib
index 39fe61f08a..3f66a34ff6 100644
--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
@@ -261,3 +261,9 @@ @article{capon1969high
   year={1969},
   publisher={IEEE}
 }
+@article{kahn2022flashlight,
+  title={Flashlight: Enabling Innovation in Tools for Machine Learning},
+  author={Kahn, Jacob and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and others},
+  journal={arXiv preprint arXiv:2201.12465},
+  year={2022}
+}
diff --git a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
index 442d1ea7f5..6816225317 100644
--- a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
+++ b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py
@@ -36,7 +36,10 @@
 # highest scores at each time step. A language model can be incorporated into
 # the scoring computation, and adding a lexicon constraint restricts the
 # next possible tokens for the hypotheses so that only words from the lexicon
-# can be generated. A mathematical formula for the decoder optimization can be
+# can be generated.
+#
+# The underlying implementation is ported from `Flashlight `__'s
+# beam search decoder. A mathematical formula for the decoder optimization can be
 # found in the `Wav2Letter paper `__, and
 # a more detailed algorithm can be found in this `blog
 # `__.
diff --git a/torchaudio/prototype/ctc_decoder/ctc_decoder.py b/torchaudio/prototype/ctc_decoder/ctc_decoder.py
index 567dcbe659..8cbc776370 100644
--- a/torchaudio/prototype/ctc_decoder/ctc_decoder.py
+++ b/torchaudio/prototype/ctc_decoder/ctc_decoder.py
@@ -39,10 +39,21 @@ class Hypothesis(NamedTuple):
 
 class LexiconDecoder:
     """torchaudio.prototype.ctc_decoder.LexiconDecoder()
 
+    Lexically constrained CTC beam search decoder from *Flashlight* [:footcite:`kahn2022flashlight`].
+
     Note:
-        To build the decoder, please use factory function
-        :py:func:`lexicon_decoder`.
+        To build the decoder, please use the factory function :py:func:`lexicon_decoder`.
+    Args:
+        nbest (int): number of best decodings to return
+        lexicon (Dict): lexicon mapping of words to spellings
+        word_dict (_Dictionary): dictionary of words
+        tokens_dict (_Dictionary): dictionary of tokens
+        lm (_LM): language model
+        decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding
+        blank_token (str): token corresponding to blank
+        sil_token (str): token corresponding to silence
+        unk_word (str): word corresponding to unknown
     """
 
     def __init__(
@@ -57,24 +68,6 @@ def __init__(
         sil_token: str,
         unk_word: str,
     ) -> None:
-        """
-        CTC Decoder with Lexicon constraint.
-
-        Note:
-            To build the decoder, please use the factory function lexicon_decoder.
-
-        Args:
-            nbest (int): number of best decodings to return
-            lexicon (Dict): lexicon mapping of words to spellings
-            word_dict (_Dictionary): dictionary of words
-            tokens_dict (_Dictionary): dictionary of tokens
-            lm (_LM): language model
-            decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding
-            blank_token (str): token corresopnding to blank
-            sil_token (str): token corresponding to silence
-            unk_word (str): word corresponding to unknown
-        """
-
         self.nbest = nbest
         self.word_dict = word_dict
         self.tokens_dict = tokens_dict
@@ -196,7 +189,8 @@ def lexicon_decoder(
     unk_word: str = "",
 ) -> LexiconDecoder:
     """
-    Builds Ken LM CTC Lexicon Decoder with given parameters
+    Builds lexically constrained CTC beam search decoder from
+    *Flashlight* [:footcite:`kahn2022flashlight`].
 
     Args:
         lexicon (str): lexicon file containing the possible words and corresponding spellings.
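
For context, the factory function documented above can be exercised roughly as follows. This is a minimal sketch against the prototype API, not part of the diff: the file paths are placeholders, and the `tokens`/`lm` keyword arguments and the search parameters (`beam_size`, `lm_weight`, `word_score`) follow the linked tutorial rather than this patch, so treat them as assumptions.

```python
import torch
from torchaudio.prototype.ctc_decoder import lexicon_decoder

# Placeholder input files (assumptions, not part of this patch):
# a lexicon mapping words to token spellings, the acoustic model's
# token set, and a KenLM language model.
LEXICON = "lexicon.txt"
TOKENS = "tokens.txt"
KENLM = "lm.bin"

# Build the lexically constrained beam search decoder.
decoder = lexicon_decoder(
    lexicon=LEXICON,
    tokens=TOKENS,
    lm=KENLM,
    nbest=3,           # number of best hypotheses to return
    beam_size=1500,    # search width; value borrowed from the tutorial
    lm_weight=3.23,    # language model weight (illustrative tuning value)
    word_score=-0.26,  # word insertion score (illustrative tuning value)
)

# `emission` stands in for CTC acoustic model output of shape
# (batch, frames, num_tokens), given as token log-probabilities.
emission = torch.randn(1, 100, 32).log_softmax(dim=-1)

# Decoding returns, per utterance, a list of `nbest` hypotheses.
hypotheses = decoder(emission)
best = hypotheses[0][0]
print(" ".join(best.words).strip(), best.score)
```

The lexicon file is what restricts the search to in-vocabulary words, which is what distinguishes this decoder from a plain greedy (argmax) CTC decoder.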