diff --git a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
index 5d0c9ea430..e5ceb36831 100755
--- a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
+++ b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
@@ -48,7 +48,8 @@
   --joiner-dim 512 \
   --causal True \
   --chunk-size 16 \
-  --left-context-frames 128
+  --left-context-frames 128 \
+  --fp16 True
 
 The --chunk-size in training is "16,32,64,-1", so we select one of them
 (excluding -1) during streaming export. The same applies to `--left-context`,
@@ -73,6 +74,7 @@
 import torch
 import torch.nn as nn
 from decoder import Decoder
+from onnxconverter_common import float16
 from onnxruntime.quantization import QuantType, quantize_dynamic
 from scaling_converter import convert_scaled_to_non_scaled
 from train import add_model_arguments, get_model, get_params
@@ -154,6 +156,13 @@ def get_parser():
         help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
     )
 
+    parser.add_argument(
+        "--fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to export models in fp16",
+    )
+
     add_model_arguments(parser)
 
     return parser
@@ -747,6 +756,24 @@ def main():
     )
     logging.info(f"Exported joiner to {joiner_filename}")
 
+    if params.fp16:
+        logging.info("Generate fp16 models")
+        # keep_io_types=True keeps the graph inputs/outputs in float32.
+        encoder = onnx.load(encoder_filename)
+        encoder_fp16 = float16.convert_float_to_float16(encoder, keep_io_types=True)
+        encoder_filename_fp16 = params.exp_dir / f"encoder-{suffix}.fp16.onnx"
+        onnx.save(encoder_fp16, encoder_filename_fp16)
+
+        decoder = onnx.load(decoder_filename)
+        decoder_fp16 = float16.convert_float_to_float16(decoder, keep_io_types=True)
+        decoder_filename_fp16 = params.exp_dir / f"decoder-{suffix}.fp16.onnx"
+        onnx.save(decoder_fp16, decoder_filename_fp16)
+
+        joiner = onnx.load(joiner_filename)
+        joiner_fp16 = float16.convert_float_to_float16(joiner, keep_io_types=True)
+        joiner_filename_fp16 = params.exp_dir / f"joiner-{suffix}.fp16.onnx"
+        onnx.save(joiner_fp16, joiner_filename_fp16)
+
     # Generate int8 quantization models
     # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
 
diff --git a/egs/librispeech/ASR/zipformer/export-onnx.py b/egs/librispeech/ASR/zipformer/export-onnx.py
index 3682f0b625..ed8a0ef0fd 100755
--- a/egs/librispeech/ASR/zipformer/export-onnx.py
+++ b/egs/librispeech/ASR/zipformer/export-onnx.py
@@ -48,8 +48,9 @@
   --joiner-dim 512 \
   --causal False \
   --chunk-size "16,32,64,-1" \
-  --left-context-frames "64,128,256,-1"
+  --left-context-frames "64,128,256,-1" \
+  --fp16 True
 
 It will generate the following 3 files inside $repo/exp:
 
   - encoder-epoch-99-avg-1.onnx
@@ -70,6 +71,7 @@
 import torch
 import torch.nn as nn
 from decoder import Decoder
+from onnxconverter_common import float16
 from onnxruntime.quantization import QuantType, quantize_dynamic
 from scaling_converter import convert_scaled_to_non_scaled
 from train import add_model_arguments, get_model, get_params
@@ -151,6 +153,13 @@ def get_parser():
         help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
     )
 
+    parser.add_argument(
+        "--fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to export models in fp16",
+    )
+
     add_model_arguments(parser)
 
     return parser
@@ -584,6 +593,24 @@ def main():
     )
     logging.info(f"Exported joiner to {joiner_filename}")
 
+    if params.fp16:
+        logging.info("Generate fp16 models")
+        # keep_io_types=True keeps the graph inputs/outputs in float32.
+        encoder = onnx.load(encoder_filename)
+        encoder_fp16 = float16.convert_float_to_float16(encoder, keep_io_types=True)
+        encoder_filename_fp16 = params.exp_dir / f"encoder-{suffix}.fp16.onnx"
+        onnx.save(encoder_fp16, encoder_filename_fp16)
+
+        decoder = onnx.load(decoder_filename)
+        decoder_fp16 = float16.convert_float_to_float16(decoder, keep_io_types=True)
+        decoder_filename_fp16 = params.exp_dir / f"decoder-{suffix}.fp16.onnx"
+        onnx.save(decoder_fp16, decoder_filename_fp16)
+
+        joiner = onnx.load(joiner_filename)
+        joiner_fp16 = float16.convert_float_to_float16(joiner, keep_io_types=True)
+        joiner_filename_fp16 = params.exp_dir / f"joiner-{suffix}.fp16.onnx"
+        onnx.save(joiner_fp16, joiner_filename_fp16)
+
     # Generate int8 quantization models
     # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
 
diff --git a/requirements.txt b/requirements.txt
index 226adaba1b..d97263142c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ onnx>=1.15.0
 onnxruntime>=1.16.3
 onnxoptimizer
 onnxsim
+onnxconverter_common
 
 # style check session:
 black==22.3.0