diff --git a/docs/source/refs.bib b/docs/source/refs.bib index 39fe61f08a9..3f66a34ff6f 100644 --- a/docs/source/refs.bib +++ b/docs/source/refs.bib @@ -261,3 +261,9 @@ @article{capon1969high year={1969}, publisher={IEEE} } +@article{kahn2022flashlight, + title={Flashlight: Enabling Innovation in Tools for Machine Learning}, + author={Kahn, Jacob and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and others}, + journal={arXiv preprint arXiv:2201.12465}, + year={2022} +} diff --git a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py index 4b73c1bd228..af92fc21376 100644 --- a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py +++ b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py @@ -21,7 +21,10 @@ # highest scores at each time step. A language model can be incorporated into # the scoring computation, and adding a lexicon constraint restricts the # next possible tokens for the hypotheses so that only words from the lexicon -# can be generated. A mathematical formula for the decoder optimization can be +# can be generated. +# +# The underlying implementation is ported from `Flashlight <https://arxiv.org/pdf/2201.12465.pdf>`__'s +# beam search decoder. A mathematical formula for the decoder optimization can be # found in the `Wav2Letter paper `__, and # a more detailed algorithm can be found in this `blog # `__. 
diff --git a/torchaudio/prototype/ctc_decoder/ctc_decoder.py b/torchaudio/prototype/ctc_decoder/ctc_decoder.py index 567dcbe659d..a5f723ae61b 100644 --- a/torchaudio/prototype/ctc_decoder/ctc_decoder.py +++ b/torchaudio/prototype/ctc_decoder/ctc_decoder.py @@ -39,10 +39,21 @@ class Hypothesis(NamedTuple): class LexiconDecoder: """torchaudio.prototype.ctc_decoder.LexiconDecoder() + Lexically constrained CTC Beam Search Decoder from *Flashlight* [:footcite:`kahn2022flashlight`]. + Note: - To build the decoder, please use factory function - :py:func:`lexicon_decoder`. + To build the decoder, please use the factory function :py:func:`lexicon_decoder`. + Args: + nbest (int): number of best decodings to return + lexicon (Dict): lexicon mapping of words to spellings + word_dict (_Dictionary): dictionary of words + tokens_dict (_Dictionary): dictionary of tokens + lm (_LM): language model + decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding + blank_token (str): token corresponding to blank + sil_token (str): token corresponding to silence + unk_word (str): word corresponding to unknown """ def __init__( @@ -57,24 +68,6 @@ def __init__( sil_token: str, unk_word: str, ) -> None: - """ - CTC Decoder with Lexicon constraint. - - Note: - To build the decoder, please use the factory function lexicon_decoder. 
- - Args: - nbest (int): number of best decodings to return - lexicon (Dict): lexicon mapping of words to spellings - word_dict (_Dictionary): dictionary of words - tokens_dict (_Dictionary): dictionary of tokens - lm (_LM): language model - decoder_options (_LexiconDecoderOptions): parameters used for beam search decoding - blank_token (str): token corresopnding to blank - sil_token (str): token corresponding to silence - unk_word (str): word corresponding to unknown - """ - self.nbest = nbest self.word_dict = word_dict self.tokens_dict = tokens_dict @@ -196,7 +189,8 @@ def lexicon_decoder( unk_word: str = "", ) -> LexiconDecoder: """ - Builds Ken LM CTC Lexicon Decoder with given parameters + Builds a lexically constrained CTC beam search decoder from + *Flashlight* [:footcite:`kahn2022flashlight`]. Args: lexicon (str): lexicon file containing the possible words and corresponding spellings.