
Update Frame-VAD doc and fix onnx export #7076

Merged: 13 commits, Jul 19, 2023
3 changes: 2 additions & 1 deletion examples/asr/conf/vad/frame_vad_infer_postprocess.yaml
@@ -1,6 +1,7 @@
name: &name "vad_inference_postprocessing"

-dataset: null # Path of json file of evaluation data. Audio files should have unique names
+input_manifest: null # Path of json file of evaluation data. Audio files should have unique names
+output_dir: null # Path to output directory where results will be stored
num_workers: 12
sample_rate: 16000
evaluate: False # whether to get AUROC and DERs, the manifest must contain groundtruth if enabled
24 changes: 18 additions & 6 deletions examples/asr/speech_classification/frame_vad_infer.py
@@ -21,7 +21,8 @@
## Usage:
python frame_vad_infer.py \
--config-path="../conf/vad" --config-name="frame_vad_infer_postprocess" \
-dataset=<Path of manifest file containing evaluation data. Audio files should have unique names>
+input_manifest=<Path of manifest file containing evaluation data. Audio files should have unique names> \
+output_dir=<Path of output directory>

The manifest json file should have the following format (each line is a Python dictionary):
{"audio_filepath": "/path/to/audio_file1", "offset": 0, "duration": 10000}
@@ -58,15 +59,25 @@

@hydra_runner(config_path="../conf/vad", config_name="frame_vad_infer_postprocess")
def main(cfg):
-    if not cfg.dataset:
+    if not cfg.input_manifest:
        raise ValueError("You must input the path of json file of evaluation data")
+    output_dir = cfg.output_dir if cfg.output_dir else "frame_vad_outputs"
+    if os.path.exists(output_dir):
+        logging.warning(
+            f"Output directory {output_dir} already exists, use this only if you're tuning post-processing params."
+        )
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    cfg.frame_out_dir = os.path.join(output_dir, "frame_preds")
+    cfg.smoothing_out_dir = os.path.join(output_dir, "smoothing_preds")
+    cfg.rttm_out_dir = os.path.join(output_dir, "rttm_preds")

-    # each line of dataset should be have different audio_filepath and unique name to simplify edge cases or conditions
-    logging.info(f"Loading manifest file {cfg.dataset}")
+    # each line of input_manifest should have a different audio_filepath and a unique name to simplify edge cases or conditions
+    logging.info(f"Loading manifest file {cfg.input_manifest}")
    manifest_orig, key_labels_map, key_rttm_map = frame_vad_infer_load_manifest(cfg)

    # Prepare manifest for streaming VAD
-    manifest_vad_input = cfg.dataset
+    manifest_vad_input = cfg.input_manifest
    if cfg.prepare_manifest.auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug("Try smaller split_duration if you still have CUDA memory issue")
@@ -76,6 +87,7 @@ def main(cfg):
            'split_duration': cfg.prepare_manifest.split_duration,
            'num_workers': cfg.num_workers,
            'prepared_manifest_vad_input': cfg.prepared_manifest_vad_input,
+            'out_dir': output_dir,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
@@ -171,7 +183,7 @@ def main(cfg):
        key_pred_rttm_map[key] = entry['rttm_filepath']

    if not cfg.out_manifest_filepath:
-        out_manifest_filepath = "manifest_vad_output.json"
+        out_manifest_filepath = os.path.join(output_dir, "manifest_vad_output.json")
    else:
        out_manifest_filepath = cfg.out_manifest_filepath
    write_manifest(out_manifest_filepath, manifest_new)
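With these changes, all artifacts land under a single root. Assuming the default output_dir of "frame_vad_outputs", the layout implied by the cfg assignments above would be roughly:

frame_vad_outputs/
    frame_preds/              # per-frame speech predictions
    smoothing_preds/          # smoothed predictions, if smoothing is enabled
    rttm_preds/               # RTTM segment files
    manifest_vad_output.json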
43 changes: 43 additions & 0 deletions nemo/collections/asr/models/classification_models.py
@@ -35,6 +35,7 @@
from nemo.core.classes.common import PretrainedModelInfo, typecheck
from nemo.core.neural_types import *
from nemo.utils import logging, model_utils
+from nemo.utils.cast_utils import cast_all

__all__ = ['EncDecClassificationModel', 'EncDecRegressionModel']
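The newly imported cast_all is used at the end of forward_for_export below to cast float16 tensors back to float32. A rough sketch of the idea (not the actual NeMo implementation) for readers unfamiliar with the utility:

import torch

def cast_all_sketch(x, from_dtype, to_dtype):
    # Recursively cast matching tensors inside tensors, tuples, and dicts.
    if isinstance(x, torch.Tensor):
        return x.to(to_dtype) if x.dtype == from_dtype else x
    if isinstance(x, tuple):
        return tuple(cast_all_sketch(v, from_dtype, to_dtype) for v in x)
    if isinstance(x, dict):
        return {k: cast_all_sketch(v, from_dtype, to_dtype) for k, v in x.items()}
    return x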

@@ -851,6 +852,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        self.eval_loop_cnt = 0
        self.ratio_threshold = cfg.get('ratio_threshold', 0.2)
        super().__init__(cfg=cfg, trainer=trainer)
+        self.decoder.output_types = self.output_types
+        self.decoder.output_types_for_export = self.output_types

    @classmethod
    def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]:
@@ -1148,3 +1151,43 @@ def get_metric_logits_labels(self, logits, labels, masks):
        labels = labels.gather(dim=0, index=idx.view(-1))

        return logits, labels

+    def forward_for_export(
+        self, input, length=None, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None
+    ):
+        """
+        This forward is used when we need to export the model to ONNX format.
+        Inputs cache_last_channel and cache_last_time need to be passed for exporting streaming models.
+
+        Args:
+            input: Tensor that represents a batch of raw audio signals,
+                of shape [B, T]. T here represents timesteps.
+            length: Vector of length B, that contains the individual lengths of the audio sequences.
+            cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers
+            cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers
+                N is the number of such layers which need caching, B is batch size, H is the hidden size of activations,
+                and T is the length of the cache
+
+        Returns:
+            the output of the model
+        """
+        enc_fun = getattr(self.input_module, 'forward_for_export', self.input_module.forward)
+        if cache_last_channel is None:
+            encoder_output = enc_fun(audio_signal=input, length=length)
+            if isinstance(encoder_output, tuple):
+                encoder_output = encoder_output[0]
+        else:
+            encoder_output, length, cache_last_channel, cache_last_time, cache_last_channel_len = enc_fun(
+                audio_signal=input,
+                length=length,
+                cache_last_channel=cache_last_channel,
+                cache_last_time=cache_last_time,
+                cache_last_channel_len=cache_last_channel_len,
+            )
+
+        dec_fun = getattr(self.output_module, 'forward_for_export', self.output_module.forward)
+        ret = dec_fun(hidden_states=encoder_output.transpose(1, 2))
+        if isinstance(ret, tuple):
+            ret = ret[0]
+        if cache_last_channel is not None:
+            ret = (ret, length, cache_last_channel, cache_last_time, cache_last_channel_len)
+        return cast_all(ret, from_dtype=torch.float16, to_dtype=torch.float32)
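With forward_for_export in place, export goes through NeMo's standard Exportable path. A minimal sketch, assuming the pretrained name "vad_multilingual_frame_marblenet" is available (check list_available_models() for what is actually published):

import nemo.collections.asr as nemo_asr

model = nemo_asr.models.EncDecFrameClassificationModel.from_pretrained(
    model_name="vad_multilingual_frame_marblenet"
)
model.eval()
# export() routes through forward_for_export(); the final cast_all() ensures
# float32 outputs even if parts of the graph ran in float16.
model.export("frame_vad.onnx")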
6 changes: 4 additions & 2 deletions nemo/collections/asr/parts/utils/vad_utils.py
@@ -275,7 +275,9 @@ def generate_overlap_vad_seq(
    if out_dir:
        overlap_out_dir = out_dir
    else:
-        overlap_out_dir = frame_pred_dir + "/overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap)
+        overlap_out_dir = os.path.join(
+            frame_pred_dir, "overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap)
+        )

    if not os.path.exists(overlap_out_dir):
        os.mkdir(overlap_out_dir)
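Note that os.path.join drops all earlier components when a later one is absolute, which is why the joined component above must not start with a slash. A quick illustration (plain Python behavior, values are placeholders):

import os

# Relative second component: joined as expected.
os.path.join("frame_preds", "overlap_smoothing_output_median_0.875")
# -> 'frame_preds/overlap_smoothing_output_median_0.875'

# A leading slash makes the second component absolute and discards the first.
os.path.join("frame_preds", "/overlap_smoothing_output_median_0.875")
# -> '/overlap_smoothing_output_median_0.875'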
@@ -732,7 +734,7 @@ def generate_vad_segment_table(
    if not out_dir:
        out_dir_name = "seg_output_"
        for key in postprocessing_params:
-            out_dir_name = out_dir_name + str(key) + str(postprocessing_params[key]) + "-"
+            out_dir_name = out_dir_name + "-" + str(key) + str(postprocessing_params[key])

        out_dir = os.path.join(vad_pred_dir, out_dir_name)
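The reordering moves the separator in front of each key, so the name no longer ends with a dangling "-". For illustration, with hypothetical post-processing params:

postprocessing_params = {"onset": 0.5, "offset": 0.3, "pad_onset": 0.2}
out_dir_name = "seg_output_"
for key in postprocessing_params:
    out_dir_name = out_dir_name + "-" + str(key) + str(postprocessing_params[key])
# out_dir_name == 'seg_output_-onset0.5-offset0.3-pad_onset0.2'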

4 changes: 2 additions & 2 deletions tests/collections/asr/test_asr_classification_model.py
@@ -94,8 +94,8 @@ def frame_classification_model():
    }

    decoder = {
-        'cls': 'nemo.collections.asr.modules.ConvASRDecoderClassification',
-        'params': {'feat_in': 32, 'num_classes': 5,},
+        'cls': 'nemo.collections.common.parts.MultiLayerPerceptron',
+        'params': {'hidden_size': 32, 'num_classes': 5,},
    }

    modelConfig = DictConfig(
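For reference, a quick sketch of the swapped-in decoder using the test's values (shapes are illustrative, assuming MultiLayerPerceptron's default arguments):

import torch
from nemo.collections.common.parts import MultiLayerPerceptron

decoder = MultiLayerPerceptron(hidden_size=32, num_classes=5)
# Maps per-frame encoder features to per-frame class scores:
# [batch, time, hidden] -> [batch, time, num_classes]
out = decoder(torch.randn(4, 10, 32))
print(out.shape)  # torch.Size([4, 10, 5])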