Add optimize and quantize command CLI #700

@@ -28,6 +28,40 @@ explains the main concepts that you will be using when performing quantization w…

</Tip>
## Quantizing a model with Optimum's CLI

The Optimum ONNX Runtime quantization tool can be used through the Optimum command-line interface:

```bash
optimum-cli onnxruntime quantize --help
usage: optimum-cli <command> [<args>] onnxruntime quantize [-h] --onnx_model ONNX_MODEL [-o OUTPUT] (--arm64 | --avx2 | --avx512 | --avx512_vnni | --tensorrt)

options:
  -h, --help            show this help message and exit
  --arm64               Quantization for the ARM64 architecture.
  --avx2                Quantization with AVX-2 instructions.
  --avx512              Quantization with AVX-512 instructions.
  --avx512_vnni         Quantization with AVX-512 and VNNI instructions.
  --tensorrt            Quantization for NVIDIA TensorRT optimizer.
  -c, --config          `ORTConfig` file to use to quantize the model.

Required arguments:
  --onnx_model ONNX_MODEL
                        Path indicating where the ONNX models to quantize are located.

Optional arguments:
  -o OUTPUT, --output OUTPUT
                        Path indicating the directory where to store the generated ONNX model. (Defaults to the --onnx_model value.)
```

> **Review comment:** Add `--config` details here.

Quantizing an ONNX model can be done as follows:

```bash
optimum-cli onnxruntime quantize --onnx_model onnx_model_location/ --avx512
```

This quantizes all the ONNX files in `onnx_model_location` with AVX-512 instructions. By default, the quantized models are created in the same directory, unless the `--output` argument is specified.
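
For reference, a rough programmatic equivalent of this command, pieced together from the implementation added in this PR (the directory name is a placeholder):

```python
from pathlib import Path

from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

model_dir = Path("onnx_model_location")  # placeholder: directory containing *.onnx files

# One quantizer per ONNX file found in the directory, as the CLI does.
quantizers = [
    ORTQuantizer.from_pretrained(model_dir, file_name=model.name)
    for model in model_dir.glob("*.onnx")
]

# Dynamic quantization targeting AVX-512, mirroring the --avx512 flag.
qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=False)

for quantizer in quantizers:
    quantizer.quantize(save_dir=model_dir, quantization_config=qconfig)
```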

## Creating an `ORTQuantizer`

The [`~optimum.onnxruntime.ORTQuantizer`] class is used to quantize your ONNX model. The class can be initialized using…

@@ -0,0 +1,33 @@
```python
from argparse import ArgumentParser

from .. import BaseOptimumCLICommand
from .optimize import ONNXRuntimeOptimizeCommand, parse_args_onnxruntime_optimize
from .quantize import ONNXRuntimeQuantizeCommand, parse_args_onnxruntime_quantize


def onnxruntime_optimize_factory(args):
    return ONNXRuntimeOptimizeCommand(args)


def onnxruntime_quantize_factory(args):
    return ONNXRuntimeQuantizeCommand(args)


class ONNXRuntimeCommand(BaseOptimumCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        onnxruntime_parser = parser.add_parser("onnxruntime", help="ONNX Runtime optimize and quantize utilities.")
        onnxruntime_sub_parsers = onnxruntime_parser.add_subparsers()

        optimize_parser = onnxruntime_sub_parsers.add_parser("optimize", help="Optimize ONNX models.")
        quantize_parser = onnxruntime_sub_parsers.add_parser("quantize", help="Dynamic quantization for ONNX models.")

        parse_args_onnxruntime_optimize(optimize_parser)
        parse_args_onnxruntime_quantize(quantize_parser)

        optimize_parser.set_defaults(func=onnxruntime_optimize_factory)
        quantize_parser.set_defaults(func=onnxruntime_quantize_factory)

    def run(self):
        raise NotImplementedError()
```
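
To make the dispatch pattern concrete, here is a self-contained toy sketch of how a top-level CLI can consume such `register_subcommand` hooks and `func` factories. The names below are illustrative only, not the actual `optimum-cli` entry point:

```python
from argparse import ArgumentParser


class EchoCommand:
    """Toy command standing in for ONNXRuntimeOptimizeCommand/ONNXRuntimeQuantizeCommand."""

    def __init__(self, args):
        self.args = args

    def run(self):
        print(f"would process: {self.args.onnx_model}")


def parse_args_echo(parser):
    parser.add_argument("--onnx_model", required=True)


def main():
    parser = ArgumentParser("demo-cli")
    subparsers = parser.add_subparsers()

    echo_parser = subparsers.add_parser("echo")
    parse_args_echo(echo_parser)
    # The factory stored via set_defaults is looked up as args.func after parsing.
    echo_parser.set_defaults(func=lambda args: EchoCommand(args))

    args = parser.parse_args()
    args.func(args).run()


if __name__ == "__main__":
    main()
```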

@@ -0,0 +1,80 @@
```python
from pathlib import Path

from ...onnxruntime.configuration import AutoOptimizationConfig, ORTConfig
from ...onnxruntime.optimization import ORTOptimizer


def parse_args_onnxruntime_optimize(parser):
    required_group = parser.add_argument_group("Required arguments")
    required_group.add_argument(
        "--onnx_model",
        type=Path,
        required=True,
        help="Path to the repository where the ONNX models to optimize are located.",
    )
```

> **Review:** Why not make that just a regular (positional) argument? That way we can do `optimum-cli onnxruntime optimize path_to_my_model -O2 my_output`, which seems less heavy IMO. @fxmarty, maybe we should do the same for exporters, wdyt?
>
> **Reply:** Yes, we can do that for the …
>
> **Reply:** I think having several unnamed arguments makes things less readable and more error-prone. I'm not in favor personally, but it's a matter of taste.
>
> **Reply:** Fair enough, let's keep it like that then!
```python
    optional_group = parser.add_argument_group("Optional arguments")
    optional_group.add_argument(
        "-o",
        "--output",
        type=Path,
        help="Path to the directory where to store the generated ONNX model. (Defaults to the --onnx_model value.)",
    )
```

> **Review:** Same comment.
>
> **Reply:** The output is now optional, and it is not possible to make it regular like for …
>
> **Reply:** Let's keep it like that then!

> **Review:** Being able to provide predefined optimization configs is great.
>
> **Reply:** I can add this, yes.
>
> **Reply:** I think it would allow more custom usage!
>
> **Reply:** @michaelbenayoun Do you mean …
>
> **Reply:** Done!
```python
    level_group = parser.add_mutually_exclusive_group(required=True)
    level_group.add_argument(
        "-O1",
        action="store_true",
        help="Basic general optimizations (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).",
    )
    level_group.add_argument(
        "-O2",
        action="store_true",
        help="Basic and extended general optimizations, transformers-specific fusions (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).",
    )
    level_group.add_argument(
        "-O3",
        action="store_true",
        help="Same as O2 with Gelu approximation (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).",
    )
    level_group.add_argument(
        "-O4",
        action="store_true",
        help="Same as O3 with mixed precision (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).",
    )
    level_group.add_argument(
        "-c",
        "--config",
        type=Path,
        help="`ORTConfig` file to use to optimize the model.",
    )
```
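
Both subcommands fall back to an `ORTConfig` file via `-c/--config`. As a minimal sketch of producing such a file, assuming `ORTConfig` follows the usual `save_pretrained`/`from_pretrained` round trip that the `run()` methods below rely on (the directory name is a placeholder):

```python
from optimum.onnxruntime.configuration import (
    AutoOptimizationConfig,
    AutoQuantizationConfig,
    ORTConfig,
)

# Bundle an optimization and a quantization config into a single ORTConfig.
ort_config = ORTConfig(
    optimization=AutoOptimizationConfig.O2(),
    quantization=AutoQuantizationConfig.avx512(is_static=False, per_channel=False),
)

# Assumption: ORTConfig supports the usual save_pretrained round trip;
# the CLI then loads it back with ORTConfig.from_pretrained(path).
ort_config.save_pretrained("ort_config_dir")
```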
```python
class ONNXRuntimeOptimizeCommand:
    def __init__(self, args):
        self.args = args

    def run(self):
        if not self.args.output:
            save_dir = self.args.onnx_model
        else:
            save_dir = self.args.output

        file_names = [model.name for model in self.args.onnx_model.glob("*.onnx")]

        optimizer = ORTOptimizer.from_pretrained(self.args.onnx_model, file_names)

        if self.args.O1:
            optimization_config = AutoOptimizationConfig.O1()
        elif self.args.O2:
            optimization_config = AutoOptimizationConfig.O2()
        elif self.args.O3:
            optimization_config = AutoOptimizationConfig.O3()
        elif self.args.O4:
            optimization_config = AutoOptimizationConfig.O4()
        else:
            optimization_config = ORTConfig.from_pretrained(self.args.config).optimization

        optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config)
```

> **Review:** Suggested change: …
>
> **Reply:** Same answer as below.

> **Review:** Suggested change: …
>
> **Reply:** It is an exclusive group: at least one option is mandatory, and if none is given an error is raised. So this part is not useful.
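
For comparison with the CLI flags, a rough programmatic equivalent of `optimum-cli onnxruntime optimize --onnx_model onnx_model_location/ -O2`, pieced together from the `run()` method above (the directory name is a placeholder):

```python
from pathlib import Path

from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import AutoOptimizationConfig

model_dir = Path("onnx_model_location")  # placeholder: directory containing *.onnx files

# Optimize every ONNX file found in the directory, as the CLI does.
file_names = [model.name for model in model_dir.glob("*.onnx")]
optimizer = ORTOptimizer.from_pretrained(model_dir, file_names)

# -O2: basic and extended general optimizations plus transformers-specific fusions.
optimizer.optimize(save_dir=model_dir, optimization_config=AutoOptimizationConfig.O2())
```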

@@ -0,0 +1,77 @@
```python
from pathlib import Path

from ...onnxruntime.configuration import AutoQuantizationConfig, ORTConfig
from ...onnxruntime.quantization import ORTQuantizer


def parse_args_onnxruntime_quantize(parser):
    required_group = parser.add_argument_group("Required arguments")
    required_group.add_argument(
        "--onnx_model",
        type=Path,
        required=True,
        help="Path to the repository where the ONNX models to quantize are located.",
    )
```

> **Review:** Same comment.
```python
    optional_group = parser.add_argument_group("Optional arguments")
    optional_group.add_argument(
        "-o",
        "--output",
        type=Path,
        help="Path to the directory where to store the generated ONNX model. (Defaults to the --onnx_model value.)",
    )
    optional_group.add_argument(
        "--per_channel",
        action="store_true",
        help="Compute the quantization parameters on a per-channel basis.",
    )
```

> **Review:** Same comment.
```python
    level_group = parser.add_mutually_exclusive_group(required=True)
    level_group.add_argument("--arm64", action="store_true", help="Quantization for the ARM64 architecture.")
    level_group.add_argument("--avx2", action="store_true", help="Quantization with AVX-2 instructions.")
    level_group.add_argument("--avx512", action="store_true", help="Quantization with AVX-512 instructions.")
    level_group.add_argument(
        "--avx512_vnni", action="store_true", help="Quantization with AVX-512 and VNNI instructions."
    )
    level_group.add_argument("--tensorrt", action="store_true", help="Quantization for NVIDIA TensorRT optimizer.")
    level_group.add_argument(
        "-c",
        "--config",
        type=Path,
        help="`ORTConfig` file to use to quantize the model.",
    )
```

> **Review:** Same comment as for the optimization level; maybe we could add the possibility to specify a path to an `ORTConfig` file.
```python
class ONNXRuntimeQuantizeCommand:
    def __init__(self, args):
        self.args = args

    def run(self):
        if not self.args.output:
            save_dir = self.args.onnx_model
        else:
            save_dir = self.args.output

        quantizers = [
            ORTQuantizer.from_pretrained(self.args.onnx_model, file_name=model.name)
            for model in self.args.onnx_model.glob("*.onnx")
        ]

        if self.args.arm64:
            qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=self.args.per_channel)
        elif self.args.avx2:
            qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=self.args.per_channel)
        elif self.args.avx512:
            qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=self.args.per_channel)
        elif self.args.avx512_vnni:
            qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=self.args.per_channel)
        elif self.args.tensorrt:
            qconfig = AutoQuantizationConfig.tensorrt(is_static=False, per_channel=self.args.per_channel)
        else:
            qconfig = ORTConfig.from_pretrained(self.args.config).quantization

        for q in quantizers:
            q.quantize(save_dir=save_dir, quantization_config=qconfig)
```
@@ -25,17 +25,55 @@ def test_helps_no_raise(self):

```python
            "optimum-cli export --help",
            "optimum-cli export onnx --help",
            "optimum-cli env --help",
            "optimum-cli onnxruntime quantize --help",
            "optimum-cli onnxruntime optimize --help",
        ]

        for command in commands:
            subprocess.run(command, shell=True, check=True)

    def test_env_commands(self):
        subprocess.run("optimum-cli env", shell=True, check=True)

    def test_export_commands(self):
        with tempfile.TemporaryDirectory() as tempdir:
            command = f"optimum-cli export onnx --model hf-internal-testing/tiny-random-vision_perceiver_conv --task image-classification {tempdir}"
            subprocess.run(command, shell=True, check=True)

    def test_optimize_commands(self):
        with tempfile.TemporaryDirectory() as tempdir:
            # First export a tiny encoder, a decoder-only and an encoder-decoder model
            export_commands = [
                f"optimum-cli export onnx --model hf-internal-testing/tiny-random-BertModel {tempdir}/encoder",
                f"optimum-cli export onnx --model hf-internal-testing/tiny-random-gpt2 {tempdir}/decoder",
                f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder",
            ]
            optimize_commands = [
                f"optimum-cli onnxruntime optimize --onnx_model {tempdir}/encoder -O1",
                f"optimum-cli onnxruntime optimize --onnx_model {tempdir}/decoder -O1",
                f"optimum-cli onnxruntime optimize --onnx_model {tempdir}/encoder-decoder -O1",
            ]

            for export, optimize in zip(export_commands, optimize_commands):
                subprocess.run(export, shell=True, check=True)
                subprocess.run(optimize, shell=True, check=True)

    def test_quantize_commands(self):
        with tempfile.TemporaryDirectory() as tempdir:
            # First export a tiny encoder, a decoder-only and an encoder-decoder model
            export_commands = [
                f"optimum-cli export onnx --model hf-internal-testing/tiny-random-BertModel {tempdir}/encoder",
                f"optimum-cli export onnx --model hf-internal-testing/tiny-random-gpt2 {tempdir}/decoder",
                f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder",
            ]
            quantize_commands = [
                f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder --avx2",
                f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/decoder --avx2",
                f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2",
            ]

            for export, quantize in zip(export_commands, quantize_commands):
                subprocess.run(export, shell=True, check=True)
                subprocess.run(quantize, shell=True, check=True)
```

> **Review:** Maybe add a test with a custom config here.
>
> **Reply:** Do you already have an `ORTConfig` file for testing usage somewhere?
>
> **Reply:** @fxmarty @michaelbenayoun I have an issue for testing the …
>
> **Reply:** I could not find one, so forget about it!

> **Review:** Same comments for the quantize tests.
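
The config-based test discussed above did not land in this PR. Purely as a hypothetical sketch of what it might look like, under the assumptions that `ORTConfig` supports `save_pretrained` and that the saved file is named `ort_config.json` (neither is confirmed by this diff):

```python
import subprocess
import tempfile
import unittest

from optimum.onnxruntime.configuration import AutoQuantizationConfig, ORTConfig


class TestCLIConfig(unittest.TestCase):
    def test_quantize_commands_with_config(self):
        # Hypothetical: sketches the config-driven quantize test discussed in review.
        with tempfile.TemporaryDirectory() as tempdir:
            # Assumption: ORTConfig supports save_pretrained and writes an
            # ort_config.json file into the target directory.
            qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)
            ORTConfig(quantization=qconfig).save_pretrained(tempdir)

            export = f"optimum-cli export onnx --model hf-internal-testing/tiny-random-BertModel {tempdir}/encoder"
            quantize = f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder -c {tempdir}/ort_config.json"
            subprocess.run(export, shell=True, check=True)
            subprocess.run(quantize, shell=True, check=True)
```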