diff --git a/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py b/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py index 3a0a9ba5ce65..8188bcced14d 100644 --- a/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py +++ b/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py @@ -69,8 +69,9 @@ @dataclass class TranscriptionConfig: """ - Transcription config + Transcription config """ + # Required configs model_path: Optional[str] = None # Path to a .nemo file pretrained_name: Optional[str] = None # Name of a pretrained model @@ -121,7 +122,7 @@ class TranscriptionConfig: @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) def main(cfg: TranscriptionConfig) -> TranscriptionConfig: """ - Transcribes the input audio and can be used to infer long audio files by chunking + Transcribes the input audio and can be used to infer long audio files by chunking them into smaller segments. """ logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') diff --git a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py index fd5429ba6165..87370d278f98 100644 --- a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py +++ b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py @@ -67,6 +67,7 @@ class TranscriptionConfig: """ Transcription Configuration for buffered inference. 
""" + # Required configs model_path: Optional[str] = None # Path to a .nemo file pretrained_name: Optional[str] = None # Name of a pretrained model @@ -117,7 +118,7 @@ class TranscriptionConfig: @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) def main(cfg: TranscriptionConfig) -> TranscriptionConfig: """ - Transcribes the input audio and can be used to infer long audio files by chunking + Transcribes the input audio and can be used to infer long audio files by chunking them into smaller segments. """ logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') diff --git a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py index bdd5d2155c5e..e6e84cdfa6c4 100644 --- a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py +++ b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py @@ -90,6 +90,7 @@ class TranscriptionConfig: """ Transcription Configuration for buffered inference. """ + # Required configs model_path: Optional[str] = None # Path to a .nemo file pretrained_name: Optional[str] = None # Name of a pretrained model @@ -147,7 +148,7 @@ class TranscriptionConfig: @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) def main(cfg: TranscriptionConfig) -> TranscriptionConfig: """ - Transcribes the input audio and can be used to infer long audio files by chunking + Transcribes the input audio and can be used to infer long audio files by chunking them into smaller segments. 
""" logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py index 66fb321718ee..53599e1b3511 100644 --- a/examples/asr/speech_translation/translate_speech.py +++ b/examples/asr/speech_translation/translate_speech.py @@ -67,6 +67,7 @@ class ModelChangeConfig: """ Sub-config for changes specific to the Conformer Encoder """ + conformer: ConformerChangeConfig = ConformerChangeConfig() @@ -75,6 +76,7 @@ class TranslationConfig: """ Translation Configuration for audio to text translation. """ + # Required configs model_path: Optional[str] = None # Path to a .nemo file pretrained_name: Optional[str] = None # Name of a pretrained model diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index d0e4804daeb6..a543fcf5e252 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -106,6 +106,7 @@ class ModelChangeConfig: """ Sub-config for changes specific to the Conformer Encoder """ + conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig) @@ -114,6 +115,7 @@ class TranscriptionConfig: """ Transcription Configuration for audio to text transcription. 
""" + # Required configs model_path: Optional[str] = None # Path to a .nemo file pretrained_name: Optional[str] = None # Name of a pretrained model @@ -170,7 +172,7 @@ class TranscriptionConfig: # Implicit single-turn assuming default role='user' (works with Canary-1B) # +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes # Explicit single-turn prompt: - # +prompt.role=user +prompt.slots.source_lang=en +prompt.slots.target_lang=es + # +prompt.role=user +prompt.slots.source_lang=en +prompt.slots.target_lang=es # +prompt.slots.task=s2t_translation +prompt.slots.pnc=yes # Explicit multi-turn prompt: # +prompt.turns='[{role:user,slots:{source_lang:en,target_lang:es,task:asr,pnc:yes}}]' diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 4eb75143e45b..f18fe02d2ed8 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -224,7 +224,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True) - # TODO: PytorchMetrics lets you join two metrics together to save compute. + # TODO: PytorchMetrics lets you join two metrics together to save compute. # But need to make wer and bleu have same outputs first self.wer = WER(self.decoding, log_prediction=self.cfg.get("log_prediction")) self.bleu = BLEU( @@ -273,14 +273,14 @@ def change_vocabulary( prompt_format: Optional[str] = None, ): """ - Changes vocabulary used during AED decoding process. Use this method when fine-tuning on + Changes vocabulary used during AED decoding process. Use this method when fine-tuning on from pre-trained model. This method changes only decoder and leaves encoder and pre-processing - modules unchanged. 
For example, you would use it if you want to use pretrained encoder when - fine-tuning on data in another language, or when you'd need model to learn capitalization, + modules unchanged. For example, you would use it if you want to use pretrained encoder when + fine-tuning on data in another language, or when you'd need model to learn capitalization, punctuation and/or special characters. Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer + new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) new_tokenizer_type: Type of tokenizer. Can be either `agg`, `bpe` or `wpe`. decoding_cfg: A config for the decoding, which is optional. If the decoding type diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 4bd829dc4cb1..79c22794de01 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -209,13 +209,13 @@ def change_vocabulary( """ Changes vocabulary of the tokenizer used during CTC decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. - For example, you would use it if you want to use pretrained encoder when fine-tuning on a - data in another language, or when you'd need model to learn capitalization, punctuation + This method changes only decoder and leaves encoder and pre-processing modules unchanged. + For example, you would use it if you want to use pretrained encoder when fine-tuning on a + data in another language, or when you'd need model to learn capitalization, punctuation and/or special characters. 
Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer + new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) new_tokenizer_type: Either `agg`, `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers, whereas `wpe` is used for `BertTokenizer`. diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 4e36c3995c26..1d437a19a86b 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -253,10 +253,10 @@ def change_vocabulary( ctc_decoding_cfg: Optional[DictConfig] = None, ): """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning on - from pre-trained model. This method changes only decoder and leaves encoder and pre-processing - modules unchanged. For example, you would use it if you want to use pretrained encoder when - fine-tuning on data in another language, or when you'd need model to learn capitalization, + Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning on + from pre-trained model. This method changes only decoder and leaves encoder and pre-processing + modules unchanged. For example, you would use it if you want to use pretrained encoder when + fine-tuning on data in another language, or when you'd need model to learn capitalization, punctuation and/or special characters. Args: diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index f3b738291d69..25890ec716c8 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -344,14 +344,14 @@ def change_vocabulary( decoding_cfg: Optional[DictConfig] = None, ): """ - Changes vocabulary used during RNNT decoding process. 
Use this method when fine-tuning - on from pre-trained model. This method changes only decoder and leaves encoder and pre-processing - modules unchanged. For example, you would use it if you want to use pretrained encoder when fine-tuning - on data in another language, or when you'd need model to learn capitalization, punctuation + Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning + on from pre-trained model. This method changes only decoder and leaves encoder and pre-processing + modules unchanged. For example, you would use it if you want to use pretrained encoder when fine-tuning + on data in another language, or when you'd need model to learn capitalization, punctuation and/or special characters. Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer + new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) new_tokenizer_type: Type of tokenizer. Can be either `agg`, `bpe` or `wpe`. decoding_cfg: A config for the decoder, which is optional. If the decoding type diff --git a/nemo/collections/asr/modules/conv_asr.py b/nemo/collections/asr/modules/conv_asr.py index dd29f0b56436..e48d76a9b7a3 100644 --- a/nemo/collections/asr/modules/conv_asr.py +++ b/nemo/collections/asr/modules/conv_asr.py @@ -776,7 +776,7 @@ class SpeakerDecoder(NeuralModule, Exportable): Args: feat_in (int): Number of channels being input to this module num_classes (int): Number of unique speakers in dataset - emb_sizes (list) : shapes of intermediate embedding layers (we consider speaker embbeddings + emb_sizes (list) : shapes of intermediate embedding layers (we consider speaker embeddings from 1st of this layers). Defaults to [1024,1024] pool_mode (str) : Pooling strategy type. 
options are 'xvector','tap', 'attention' Defaults to 'xvector (mean and variance)' diff --git a/nemo/collections/asr/parts/mixins/transcription.py b/nemo/collections/asr/parts/mixins/transcription.py index cbb3296de7f9..ac928fe99272 100644 --- a/nemo/collections/asr/parts/mixins/transcription.py +++ b/nemo/collections/asr/parts/mixins/transcription.py @@ -202,8 +202,8 @@ def transcribe( to `None`. Defaults to `None`. Uses zero-based indexing. augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. verbose: (bool) whether to display tqdm progress bar - timestamps: Optional(Bool): timestamps will be returned if set to True as part of hypothesis object - (output.timestep['segment']/output.timestep['word']). Refer to `Hypothesis` class for more details. + timestamps: Optional(Bool): timestamps will be returned if set to True as part of hypothesis object + (output.timestep['segment']/output.timestep['word']). Refer to `Hypothesis` class for more details. Default is None and would retain the previous state set by using self.change_decoding_strategy(). override_config: (Optional[TranscribeConfig]) override transcription config pre-defined by the user. **Note**: All other arguments in the function will be ignored if override_config is passed. diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index dc5d645f1488..da280a0c6b3c 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -67,14 +67,14 @@ class AbstractRNNTDecoding(ConfidenceMixin): rnnt_timestamp_type: A str value, which represents the types of timestamps that should be calculated. 
Can take the following values - "char" for character/subword time stamps, "word" for word level - time stamps, "segment" for segment level time stamps and "all" (default), for character, + time stamps, "segment" for segment level time stamps and "all" (default), for character, word and segment level time stamps. word_seperator: Str token representing the seperator between words. segment_seperators: List containing tokens representing the seperator(s) between segments. - segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary + segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary for forming the segments. preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores @@ -104,9 +104,9 @@ class AbstractRNNTDecoding(ConfidenceMixin): The length of the list corresponds to the number of recognized words. exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word + aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated + tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and attached to the regular frame confidence, making TDT frame confidence element a pair: (`prediction_confidence`, `duration_confidence`). method_cfg: A dict-like object which contains the method name and settings to compute per-frame @@ -179,22 +179,22 @@ class AbstractRNNTDecoding(ConfidenceMixin): maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient, and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0. 
- maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to + maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep this as 1 in order to reduce expensive beam search cost later. int >= 0. maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the + and affects the speed of inference since large values will perform large beam search in the next step. - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the - expansions. The default (2.3) is selected from the paper. It performs a comparison - (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set + maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the + expansions. The default (2.3) is selected from the paper. It performs a comparison + (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin - of additional tokens which can be potential candidates for expansion apart from the "most likely" - candidate. Lower values will reduce the number of expansions (by increasing pruning-by-value, - thereby improving speed but hurting accuracy). Higher values will increase the number of expansions - (by reducing pruning-by-value, thereby reducing speed but potentially improving accuracy). + of additional tokens which can be potential candidates for expansion apart from the "most likely" + candidate. Lower values will reduce the number of expansions (by increasing pruning-by-value, + thereby improving speed but hurting accuracy). 
Higher values will increase the number of expansions + (by reducing pruning-by-value, thereby reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally tuned on a validation set. softmax_temperature: Scales the logits of the joint prior to computing log_softmax. @@ -953,7 +953,7 @@ def _refine_timestamps_tdt( # Check if token is a punctuation mark # If so, set its start and end offset as start and end of the previous token - # This is done because there was observed a behaviour, when punctuation marks are predicted long + # This is done because there was observed a behaviour, when punctuation marks are predicted long # after preceding token (i.e. after silence) if offset['char'][0] in supported_punctuation and i > 0: encoded_char_offsets[i]['start_offset'] = offset['start_offset'] = char_offsets[i - 1]['end_offset'] @@ -1241,7 +1241,7 @@ class RNNTDecoding(AbstractRNNTDecoding): The length of the list corresponds to the number of recognized words. exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word + aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and attached to the regular frame confidence, @@ -1317,7 +1317,7 @@ class RNNTDecoding(AbstractRNNTDecoding): per timestep of the acoustic model. Larger values will allow longer sentences to be decoded, at increased cost to execution time. - alsd_max_target_len: optional int or float, determines the potential maximum target sequence + alsd_max_target_len: optional int or float, determines the potential maximum target sequence length. If an integer is provided, it can decode sequences of that particular maximum length. 
If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len), where seq_len is the length of the acoustic model output (T). @@ -1330,23 +1330,23 @@ class RNNTDecoding(AbstractRNNTDecoding): maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient, and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0. - maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to + maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep this as 1 in order to reduce expensive beam search cost later. int >= 0. maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the + and affects the speed of inference since large values will perform large beam search in the next step. - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the + maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the expansions. - The default (2.3) is selected from the paper. It performs a comparison - (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and - max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin of + The default (2.3) is selected from the paper. It performs a comparison + (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and + max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin of additional tokens which can be potential candidates for expansion apart from the "most likely" candidate. 
Lower values will reduce the number of expansions (by increasing pruning-by-value, - thereby improving speed but hurting accuracy). Higher values will increase the number of - expansions (by reducing pruning-by-value, thereby reducing speed but potentially improving + thereby improving speed but hurting accuracy). Higher values will increase the number of + expansions (by reducing pruning-by-value, thereby reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally tuned on a validation set. softmax_temperature: Scales the logits of the joint prior to computing log_softmax. @@ -1498,7 +1498,7 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): segment_seperators: List containing tokens representing the seperator(s) between segments. - segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary for + segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary for forming the segments. preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores @@ -1528,9 +1528,9 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): The length of the list corresponds to the number of recognized words. exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word + aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. Valid options are `mean`, `min`, `max`, `prod`. - tdt_include_duration: Bool flag indicating that the duration confidence scores are to be + tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and attached to the regular frame confidence, making TDT frame confidence element a pair: (`prediction_confidence`, `duration_confidence`). 
method_cfg: A dict-like object which contains the method name and settings to compute per-frame @@ -1601,7 +1601,7 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): per timestep of the acoustic model. Larger values will allow longer sentences to be decoded, at increased cost to execution time. - alsd_max_target_len: optional int or float, determines the potential maximum target sequence + alsd_max_target_len: optional int or float, determines the potential maximum target sequence length. If an integer is provided, it can decode sequences of that particular maximum length. If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len), where seq_len is the length of the acoustic model output (T). @@ -1614,23 +1614,23 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient, and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0. - maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to + maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep this as 1 in order to reduce expensive beam search cost later. int >= 0. maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the + and affects the speed of inference since large values will perform large beam search in the next step. - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when - computing the expansions. The default (2.3) is selected from the paper. It performs a - comparison (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the - Vocab set and max_log_prob is the "most" likely token to be predicted. 
Gamma therefore - provides a margin of additional tokens which can be potential candidates for expansion + maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when + computing the expansions. The default (2.3) is selected from the paper. It performs a + comparison (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the + Vocab set and max_log_prob is the "most" likely token to be predicted. Gamma therefore + provides a margin of additional tokens which can be potential candidates for expansion apart from the "most likely" candidate. Lower values will reduce the number of expansions - (by increasing pruning-by-value, thereby improving speed but hurting accuracy). Higher - values will increase the number of expansions (by reducing pruning-by-value, thereby - reducing speed but potentially improving accuracy). This is a hyper parameter to be + (by increasing pruning-by-value, thereby improving speed but hurting accuracy). Higher + values will increase the number of expansions (by reducing pruning-by-value, thereby + reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally tuned on a validation set. softmax_temperature: Scales the logits of the joint prior to computing log_softmax. diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index 3d15eb0dcffc..cb272e3d0462 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -79,7 +79,7 @@ def longest_common_subsequence_merge(X, Y, filepath=None): Assumption is that the two chunks are consecutive chunks, and there exists at least small overlap acoustically. - It is a sub-word token merge algorithm, operating on the abstract notion of integer ids representing + It is a sub-word token merge algorithm, operating on the abstract notion of integer ids representing the subword ids. 
It is independent of text or character encoding. Since the algorithm is merge based, and depends on consecutive buffers, the very first buffer is processes using @@ -327,7 +327,7 @@ def inplace_buffer_merge(buffer, data, timesteps, model): Merges the new text from the current frame with the previous text contained in the buffer. The alignment is based on a Longest Common Subsequence algorithm, with some additional heuristics leveraging - the notion that the chunk size is >= the context window. In case this assumptio is violated, the results of + the notion that the chunk size is >= the context window. In case this assumption is violated, the results of the merge will be incorrect (or at least obtain worse WER overall). """ # If delay timesteps is 0, that means no future context was used. Simply concatenate the buffer with new data. @@ -1091,14 +1091,14 @@ def _get_batch_preds(self): - For all samples, determine if signal has finished. - If so, skip calculation of mel-specs. - If not, compute mel spec and length - - Perform Encoder forward over this sub-batch of samples. Maintain the indices of samples that + - Perform Encoder forward over this sub-batch of samples. Maintain the indices of samples that were processed. - - If performing stateful decoding, prior to decoder forward, remove the states of samples that + - If performing stateful decoding, prior to decoder forward, remove the states of samples that were not processed. - Perform Decoder + Joint forward for samples that were processed. - For all output RNNT alignment matrix of the joint do: - If signal has ended previously (this was last buffer of padding), skip alignment - - Otherwise, recalculate global index of this sample from the sub-batch index, and preserve + - Otherwise, recalculate global index of this sample from the sub-batch index, and preserve alignment. - Same for preds - Update indices of sub-batch with global index map. 
@@ -1365,8 +1365,8 @@ def transcribe( class CacheAwareStreamingAudioBuffer: """ - A buffer to be used for cache-aware streaming. It can load a single or multiple audio - files/processed signals, split them in chunks and return one on one. It can be used to + A buffer to be used for cache-aware streaming. It can load a single or multiple audio + files/processed signals, split them in chunks and return them one by one. It can be used to simulate streaming audio or audios. """ @@ -1374,7 +1374,7 @@ def __init__(self, model, online_normalization=None, pad_and_drop_preencoded=Fal ''' Args: model: An ASR model. - online_normalization (bool): whether to perform online normalization per chunk or + online_normalization (bool): whether to perform online normalization per chunk or normalize the whole audio before chunking pad_and_drop_preencoded (bool): if true pad first audio chunk and always drop preencoded ''' @@ -1435,7 +1435,7 @@ def __iter__(self): audio_chunk = self.buffer[:, :, self.buffer_idx : self.buffer_idx + chunk_size] if self.sampling_frames is not None: - # checking to make sure the audio chunk has enough frames to produce at least one output after + # checking to make sure the audio chunk has enough frames to produce at least one output after # downsampling if self.buffer_idx == 0 and isinstance(self.sampling_frames, list): cur_sampling_frames = self.sampling_frames[0] diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 1520cc93a579..189d98537d3f 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -282,15 +282,15 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: - append_pred (bool): Flag indicating whether to append predictions to an existing dataset. - audio_type (str): Type of audio files to consider. - dataset_manifest (str): Path to the dataset manifest file. 
- - audio_key (str, optional): Key in the manifest file specifying the audio file path. + - audio_key (str, optional): Key in the manifest file specifying the audio file path. Defaults to 'audio_filepath'. - - presort_manifest (bool, optional): Flag indicating whether to presort the manifest file. + - presort_manifest (bool, optional): Flag indicating whether to presort the manifest file. Defaults to True. Returns: Tuple[List[str], bool]: A tuple containing the following: - - filepaths (List[str]): List of filepaths to the audio files if path to the directory + - filepaths (List[str]): List of filepaths to the audio files if path to the directory containing audio files is provided. - - sorted_manifest_path (bool): Path to the sorted manifest file if path to the dataset + - sorted_manifest_path (bool): Path to the sorted manifest file if path to the dataset manifest file is provided. """ @@ -498,13 +498,13 @@ def compute_metrics_per_sample( Args: manifest_path: str, Required - path to dataset JSON manifest file (in NeMo format) - reference_field: str, Optional - name of field in .json manifest with the reference text + reference_field: str, Optional - name of field in .json manifest with the reference text ("text" by default). - hypothesis_field: str, Optional - name of field in .json manifest with the hypothesis text + hypothesis_field: str, Optional - name of field in .json manifest with the hypothesis text ("pred_text" by default). - metrics: list[str], Optional - list of metrics to be computed + metrics: list[str], Optional - list of metrics to be computed (currently supported "wer", "cer", "punct_er") - punctuation_marks: list[str], Optional - list of punctuation marks for computing + punctuation_marks: list[str], Optional - list of punctuation marks for computing punctuation error rate ([".", ",", "?"] by default). output_manifest_path: str, Optional - path where .json manifest with calculated metrics will be saved.