
Update Frame-VAD doc and fix onnx export #7076

Merged
13 commits merged on Jul 19, 2023
1 change: 1 addition & 0 deletions examples/asr/conf/vad/frame_vad_infer_postprocess.yaml
@@ -1,6 +1,7 @@
name: &name "vad_inference_postprocessing"

dataset: null # Path to the JSON manifest file of evaluation data. Audio files should have unique names
output: null # Path to output directory where results will be stored
Collaborator:
how about making the name clearer? such as output_folder

Collaborator Author:
updated

num_workers: 12
sample_rate: 16000
evaluate: False # whether to compute AUROC and DERs; the manifest must contain ground truth if enabled
16 changes: 14 additions & 2 deletions examples/asr/speech_classification/frame_vad_infer.py
@@ -21,7 +21,8 @@
## Usage:
python frame_vad_infer.py \
--config-path="../conf/vad" --config-name="frame_vad_infer_postprocess" \
dataset=<Path of manifest file containing evaluation data. Audio files should have unique names>
dataset=<Path of manifest file containing evaluation data. Audio files should have unique names> \
output=<Path of output directory>
Collaborator:

same as above

Collaborator Author:
updated


The manifest JSON file should have the following format (one JSON object per line):
{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000}
@@ -60,6 +61,16 @@
def main(cfg):
if not cfg.dataset:
raise ValueError("You must provide the path to the JSON manifest of evaluation data")
output_dir = cfg.output if cfg.output else "frame_vad_outputs"
if os.path.exists(output_dir):
logging.warning(
f"Output directory {output_dir} already exists, use this only if you're tuning post-processing params."
)
Path(output_dir).mkdir(parents=True, exist_ok=True)

cfg.frame_out_dir = os.path.join(output_dir, "frame_preds")
cfg.smoothing_out_dir = os.path.join(output_dir, "smoothing_preds")
cfg.rttm_out_dir = os.path.join(output_dir, "rttm_preds")

# each line of the dataset should have a different audio_filepath and a unique name to simplify edge cases and conditions
logging.info(f"Loading manifest file {cfg.dataset}")
@@ -76,6 +87,7 @@ def main(cfg):
'split_duration': cfg.prepare_manifest.split_duration,
'num_workers': cfg.num_workers,
'prepared_manifest_vad_input': cfg.prepared_manifest_vad_input,
'out_dir': output_dir,
}
manifest_vad_input = prepare_manifest(config)
else:
@@ -171,7 +183,7 @@ def main(cfg):
key_pred_rttm_map[key] = entry['rttm_filepath']

if not cfg.out_manifest_filepath:
out_manifest_filepath = "manifest_vad_output.json"
out_manifest_filepath = os.path.join(output_dir, "manifest_vad_output.json")
else:
out_manifest_filepath = cfg.out_manifest_filepath
write_manifest(out_manifest_filepath, manifest_new)
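
For reference, a minimal sketch (outside this diff) of consuming the output manifest written above, assuming the default output directory name:

import json

# Each output line pairs an input audio file with its predicted RTTM file.
with open("frame_vad_outputs/manifest_vad_output.json") as fin:
    for line in fin:
        entry = json.loads(line)
        print(entry["audio_filepath"], entry["rttm_filepath"])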
43 changes: 43 additions & 0 deletions nemo/collections/asr/models/classification_models.py
@@ -35,6 +35,7 @@
from nemo.core.classes.common import PretrainedModelInfo, typecheck
from nemo.core.neural_types import *
from nemo.utils import logging, model_utils
from nemo.utils.cast_utils import cast_all

__all__ = ['EncDecClassificationModel', 'EncDecRegressionModel']

@@ -851,6 +852,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
self.eval_loop_cnt = 0
self.ratio_threshold = cfg.get('ratio_threshold', 0.2)
super().__init__(cfg=cfg, trainer=trainer)
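# Mirror the model-level output types onto the decoder so that ONNX export,
# which uses the decoder as the output module, reports the correct output types.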
self.decoder.output_types = self.output_types
self.decoder.output_types_for_export = self.output_types

@classmethod
def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]:
@@ -1148,3 +1151,43 @@ def get_metric_logits_labels(self, logits, labels, masks):
labels = labels.gather(dim=0, index=idx.view(-1))

return logits, labels

def forward_for_export(
self, input, length=None, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None
):
"""
This forward is used when we need to export the model to ONNX format.
Inputs cache_last_channel and cache_last_time need to be passed when exporting streaming models.
Args:
input: Tensor that represents a batch of raw audio signals,
of shape [B, T]. T here represents timesteps.
length: Vector of length B that contains the individual lengths of the audio sequences.
cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers
cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers
N is the number of such layers which need caching, B is batch size, H is the hidden size of activations,
and T is the length of the cache

Returns:
the output of the model
"""
enc_fun = getattr(self.input_module, 'forward_for_export', self.input_module.forward)
if cache_last_channel is None:
encoder_output = enc_fun(audio_signal=input, length=length)
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
else:
encoder_output, length, cache_last_channel, cache_last_time, cache_last_channel_len = enc_fun(
audio_signal=input,
length=length,
cache_last_channel=cache_last_channel,
cache_last_time=cache_last_time,
cache_last_channel_len=cache_last_channel_len,
)

dec_fun = getattr(self.output_module, 'forward_for_export', self.output_module.forward)
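# Encoder output is [B, D, T]; transpose to [B, T, D] for the MLP decoder head.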
ret = dec_fun(hidden_states=encoder_output.transpose(1, 2))
if isinstance(ret, tuple):
ret = ret[0]
if cache_last_channel is not None:
ret = (ret, length, cache_last_channel, cache_last_time, cache_last_channel_len)
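# Cast any float16 tensors back to float32 so the exported model returns a consistent dtype.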
return cast_all(ret, from_dtype=torch.float16, to_dtype=torch.float32)
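
For context, a minimal sketch (not part of this diff) of exercising the fixed ONNX export path; the pretrained model name is illustrative, and export() is NeMo's Exportable API:

from nemo.collections.asr.models import EncDecFrameClassificationModel

# Load a frame-VAD checkpoint (name assumed for illustration) and export it;
# tracing goes through forward_for_export defined above.
model = EncDecFrameClassificationModel.from_pretrained("vad_multilingual_frame_marblenet")
model.eval()
model.export("frame_vad.onnx")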
4 changes: 2 additions & 2 deletions tests/collections/asr/test_asr_classification_model.py
@@ -94,8 +94,8 @@ def frame_classification_model():
}

decoder = {
'cls': 'nemo.collections.asr.modules.ConvASRDecoderClassification',
'params': {'feat_in': 32, 'num_classes': 5,},
'cls': 'nemo.collections.common.parts.MultiLayerPerceptron',
'params': {'hidden_size': 32, 'num_classes': 5,},
}

modelConfig = DictConfig(