diff --git a/onnxscript/_internal/version_utils.py b/onnxscript/_internal/version_utils.py index 390f7ee378..2b43c54f49 100644 --- a/onnxscript/_internal/version_utils.py +++ b/onnxscript/_internal/version_utils.py @@ -43,26 +43,6 @@ def transformers_older_than(version: str) -> bool | None: ) -def is_onnxruntime_training() -> bool: - """Returns True if the onnxruntime is onnxruntime-training.""" - try: - from onnxruntime import training # pylint: disable=import-outside-toplevel - - assert training - except ImportError: - # onnxruntime not training - return False - - try: - from onnxruntime.capi.onnxruntime_pybind11_state import ( # pylint: disable=import-outside-toplevel - OrtValueVector, - ) - except ImportError: - return False - - return hasattr(OrtValueVector, "push_back_batch") - - def onnxruntime_older_than(version: str) -> bool: """Returns True if the onnxruntime version is older than the given version.""" import onnxruntime # pylint: disable=import-outside-toplevel diff --git a/onnxscript/tools/benchmark/__init__.py b/onnxscript/tools/benchmark/__init__.py deleted file mode 100644 index 8f1b6f4d3e..0000000000 --- a/onnxscript/tools/benchmark/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -from onnxscript.tools.benchmark.benchmark_helpers import ( - common_export, - get_parsed_args, - make_configs, - make_dataframe_from_benchmark_data, - multi_run, - run_inference, - run_onnx_inference, -) - -__all__ = [ - "get_parsed_args", - "common_export", - "make_configs", - "multi_run", - "make_dataframe_from_benchmark_data", - "run_inference", - "run_onnx_inference", -] diff --git a/onnxscript/tools/benchmark/benchmark_helpers.py b/onnxscript/tools/benchmark/benchmark_helpers.py deleted file mode 100644 index 09ff39843f..0000000000 --- a/onnxscript/tools/benchmark/benchmark_helpers.py +++ /dev/null @@ -1,784 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# pylint: disable=import-outside-toplevel, no-else-raise, consider-using-with, consider-using-enumerate - -from __future__ import annotations - -import argparse -import itertools -import multiprocessing -import os -import platform -import re -import subprocess -import sys -import time -from typing import Any, Sequence - -import numpy as np -import onnx -import onnx.inliner - -import onnxscript.optimizer -import onnxscript.rewriter -import onnxscript.rewriter.llama_rule_sets as rules -import onnxscript.rewriter.ort_fusions as ort_rules -import onnxscript.rewriter.pattern as orp -from onnxscript import ir -from onnxscript.optimizer import remove_unused_nodes - - -def get_parsed_args( - name: str, - description: str | None = None, - epilog: str | None = None, - new_args: list[str] | None = None, - **kwargs: tuple[Any, str], -) -> dict[str, Any]: - """ - Returns parsed arguments for examples in this package. - - Args: - name: script name - scenarios: list of available scenarios - description: parser description - epilog: text at the end of the parser - number: default value for number parameter - repeat: default value for repeat parameter - warmup: default value for warmup parameter - sleep: default value for sleep parameter - expose: if empty, keeps all the parameters, - if not None, only publish kwargs contains, otherwise the list - of parameters to publish separated by a comma - new_args: args to consider or None to take `sys.args` - kwargs: additional parameters, - example: `n_trees=(10, "number of trees to train")` - - Returns: - interpreted parameters in a dictionary - """ - parser = argparse.ArgumentParser( - prog=name, - description=description or f"Available options for {name}.py.", - epilog=epilog or "", - ) - for k, v in kwargs.items(): - parser.add_argument( - f"--{k}", - help=f"{v[1]}, default is {v[0]}", - type=type(v[0]), - default=v[0], - ) - - parsed = parser.parse_args(args=new_args) - return {k: getattr(parsed, k) for k in kwargs} - - -class BenchmarkError(RuntimeError): - pass - - -def get_machine() -> dict[str, Any]: - """Returns the machine specification.""" - cpu: dict[str, Any] = dict( - machine=str(platform.machine()), - processor=str(platform.processor()), - version=str(sys.version), - cpu=int(multiprocessing.cpu_count()), - executable=str(sys.executable), - ) - try: - import torch.cuda - except ImportError: - return cpu - - cpu["has_cuda"] = bool(torch.cuda.is_available()) - if cpu["has_cuda"]: - cpu["capability"] = torch.cuda.get_device_capability(0) - cpu["device_name"] = str(torch.cuda.get_device_name(0)) - return cpu - - -def _cmd_line(script_name: str, **kwargs: dict[str, Any]) -> list[str]: - args = [sys.executable, "-m", script_name] - for k, v in kwargs.items(): - args.append(f"--{k}") - args.append(str(v)) - return args - - -def _extract_metrics(text: str) -> dict[str, str]: - reg = re.compile(r":(.*?),(.*.?);") - res = reg.findall(text) - if len(res) == 0: - return {} - return dict(res) - - -def _make_prefix(script_name: str, index: int) -> str: - name = os.path.splitext(script_name)[0] - return f"{name}_dort_c{index}_" - - -def run_benchmark( - script_name: str, - configs: list[dict[str, Any]], - verbose: int = 0, - stop_if_exception: bool = True, - dump: bool = False, -) -> list[dict[str, Any]]: - """ - Runs a script multiple times and extract information from the output - following the pattern ``:,;``. - - Args: - script_name: python script to run - configs: list of execution to do - stop_if_exception: stop if one experiment failed, otherwise continue - verbose: use tqdm to follow the progress - dump: dump onnx file - - Returns: - values - """ - if verbose: - from tqdm import tqdm - - loop = tqdm(configs) - else: - loop = configs - - data: list[dict[str, Any]] = [] - for i, config in enumerate(loop): - cmd = _cmd_line(script_name, **config) - - if dump: - os.environ["ONNXRT_DUMP_PATH"] = _make_prefix(script_name, i) - else: - os.environ["ONNXRT_DUMP_PATH"] = "" - if verbose > 3: - print(f"[run_benchmark] cmd={cmd if isinstance(cmd, str) else ' '.join(cmd)}") - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - res = p.communicate() - out, err = res - sout = out.decode("utf-8", errors="ignore") - serr = err.decode("utf-8", errors="ignore") - - if "ONNXRuntimeError" in serr or "ONNXRuntimeError" in sout: - if stop_if_exception: - raise RuntimeError( - f"Unable to continue with config {config} due to the " - f"following error\n{serr}" - f"\n----OUTPUT--\n{sout}" - ) - - metrics = _extract_metrics(sout) - if len(metrics) == 0: - if stop_if_exception: - raise BenchmarkError( - f"Unable (2) to continue with config {config}, no metric was " - f"collected.\n--ERROR--\n{serr}\n--OUTPUT--\n{sout}" - ) - else: - metrics = {} - metrics.update(config) - metrics["ERROR"] = serr - metrics["OUTPUT"] = sout - metrics["CMD"] = f"[{' '.join(cmd)}]" - data.append(metrics) - if verbose > 5: - print("--------------- ERROR") - print(serr) - if verbose >= 10: - print("--------------- OUTPUT") - print(sout) - - return data - - -def measure_discrepancies( - expected: list[tuple[Any, ...]], - outputs: list[tuple[Any, ...]], -) -> tuple[float, float]: - """ - Computes the discrepancies. - - Args: - expected: list of outputs coming from a torch model - outputs: list of outputs coming from an onnx model - - Returns: - max absolute errors, max relative errors - """ - - def _flatten(outputs): - flat = [] - for tensor in outputs: - if isinstance(tensor, tuple): - flat.extend(_flatten(tensor)) - else: - flat.append(tensor) - return tuple(flat) - - abs_errs = [] - rel_errs = [] - for torch_outputs_mixed_types, onnx_outputs in zip(expected, outputs): - torch_outputs = _flatten(torch_outputs_mixed_types) - assert len(torch_outputs) == len(onnx_outputs), ( - f"Length mismatch {len(torch_outputs)} != {len(onnx_outputs)}" - ) - for torch_tensor, onnx_tensor in zip(torch_outputs, onnx_outputs): - assert torch_tensor.dtype == onnx_tensor.dtype, ( - f"Type mismatch {torch_tensor.dtype} != {onnx_tensor.dtype}" - ) - assert torch_tensor.shape == onnx_tensor.shape, ( - f"Type mismatch {torch_tensor.shape} != {onnx_tensor.shape}" - ) - diff = torch_tensor - onnx_tensor - abs_err = float(diff.abs().max()) - rel_err = float((diff.abs() / torch_tensor).max()) - abs_errs.append(abs_err) - rel_errs.append(rel_err) - return max(abs_errs), max(rel_errs) - - -def common_export( - model: Any, - inputs: Sequence[Any], - exporter: str = "dynamo", - target_opset: int = 18, - folder: str = "", - filename: str = "model.onnx", - dynamic_shapes: Any | None = None, - verbose: int = 0, - optimization: str | None = None, - stats: dict[str, Any] | None = None, -): - """ - Exports a model into a folder. - - Args: - model: model - exporter: script, dynamo - folder: folder to export into - filename: onnx filename - inputs: inputs - dynamic_shapes: dynamic shapes - target_opset: target opset - optimization: optimization scenario, '/' separated values - verbose: verbosity - stats: if not None, populates this - dictionary with statistics about time - - Returns: - onnx proto - - """ - import torch.onnx - - if folder: - if not os.path.exists(folder): - os.mkdir(folder) - filename = os.path.join(folder, filename) - - if verbose: - print(f"[common_export] start exporting with {exporter!r} in {filename!r}") - begin = time.perf_counter() - if exporter == "script": - torch.onnx.export( - model, - inputs, # type: ignore[arg-type] - filename, - do_constant_folding=False, - input_names=[f"input{i}" for i in range(len(inputs))], - opset_version=target_opset, - dynamic_axes=dynamic_shapes, - ) - elif exporter == "dynamo": - assert dynamic_shapes is None, ( - f"dynamic_shapes={dynamic_shapes} is not implemented yet" - ) - with torch.no_grad(): - prog = torch.onnx.dynamo_export(model, *inputs) - onnx.save(prog.model_proto, filename) - else: - raise ValueError(f"Unknown exporter {exporter!r}") - - if stats is not None: - stats["export_time"] = time.perf_counter() - begin - stats["filesize"] = os.stat(filename).st_size - - if verbose: - print(f"[common_export] exporter done in {time.perf_counter() - begin}s") - print(f"[common_export] size of the export: {os.stat(filename).st_size / 2**20} Mb") - - with open(filename, "rb") as f: - onx = onnx.load(f) - - if optimization: - if verbose: - print(f"[common_export] start optimization with {optimization!r}") - begin = time.perf_counter() - optimized_model = optimize_model_proto(onx, optimization, verbose=verbose, stats=stats) - end = time.perf_counter() - begin - if stats is not None: - stats["optimization_time"] = end - if verbose: - print(f"[common_export] optimization done in {end}") - print(f"[common_export] saves the model in {filename!r}") - begin = time.perf_counter() - - onnx.save(optimized_model, filename) - if verbose: - print(f"[common_export] done saving in {time.perf_counter() - begin}") - - return onx - - -def apply_rule_sets( - model_proto: onnx.ModelProto, - rule_sets: list[str], - stats: dict[str, Any] | None = None, - verbose: int = 0, -): - """ - Applies set of patterns on a model to optimizes. - - Args: - model_proto: model - rule_sets: sets ot apply - stats: add statistics if not empty - verbose: verbosity - - Returns: - optimized model - """ - assert rule_sets, "No need to call apply_rule_sets for an empty set." - if verbose: - print(f"[apply_rule_sets] deserialize model before {rule_sets}") - begin = time.perf_counter() - ir_model = ir.serde.deserialize_model(model_proto) - end = time.perf_counter() - begin - if stats is not None: - stats["deserialize_time"] = end - if verbose: - print(f"[apply_rule_sets] deserialize done in {end}") - - for rule_set_name in rule_sets: - if verbose: - print(f"[apply_rule_sets] applies {rule_set_name!r}") - - if rule_set_name == "llama0": - rule_set = rules.llama_p0_rule_set() - elif rule_set_name == "onnxruntime": - rule_set = orp.RewriteRuleSet(ort_rules.ORT_PATTERN_REWRITE_RULES) - else: - raise ValueError(f"Unexpected rule_set name {rule_set_name!r}") - - begin = time.perf_counter() - rule_set.apply_to_model(ir_model) - remove_unused_nodes(ir_model) - end = time.perf_counter() - begin - if stats is not None: - stats[f"opt_rule_{rule_set_name}_time"] = end - if verbose: - print(f"[apply_rule_sets] {rule_set_name} done in {end}") - - if verbose: - print("[apply_rule_sets] serialize model") - begin = time.perf_counter() - rewritten_model = ir.serde.serialize_model(ir_model) - end = time.perf_counter() - begin - if stats is not None: - stats["serialize_time"] = end - if verbose: - print(f"[apply_rule_sets] serialize done in {end}") - - if verbose: - print("[apply_rule_sets] remove unused") - begin = time.perf_counter() - - remove_unused_nodes(rewritten_model) - - end = time.perf_counter() - begin - if stats is not None: - stats["opt_remove_unused_time"] = end - if verbose: - print(f"[apply_rule_sets] remove unused done in {end}") - - return rewritten_model - - -def optimize_model_proto( - model_proto: onnx.ModelProto, - optimization: str | None = None, - verbose: int = 0, - stats: dict[str, Any] | None = None, -): - """ - Optimizes a model given some scenarios. - - Args: - model_proto: ModelProto - optimization: '/' separated value - verbose: verbosity - stats: if not None, populates this dictionary with statistics - - Returns: - optmized model - """ - if not optimization: - return model_proto - - known_rule_sets = {"llama0", "onnxruntime"} - - rule_sets: list[str] = [] - for value in optimization.split("/"): - if value in known_rule_sets: - rule_sets.append(value) - continue - if value not in known_rule_sets and rule_sets: - model_proto = apply_rule_sets(model_proto, rule_sets, stats=stats, verbose=verbose) - del rule_sets[:] - continue - - if verbose: - print(f"[optimize_model_proto] start {value}") - - n_nodes = len(model_proto.graph.node) - n_functions = len(model_proto.functions) - begin = time.perf_counter() - - if value == "optimize": - model_ir = onnxscript.optimizer.optimize( - ir.from_proto(model_proto), - num_iterations=2, - onnx_shape_inference=False, - ) - model_proto = ir.to_proto(model_ir) - - elif value == "rewrite": - model_proto = onnxscript.rewriter.rewrite(model_proto) - - elif value == "inline": - model_proto = onnx.inliner.inline_local_functions(model_proto) - - else: - raise AssertionError( - f"Optimization step {value!r} is not implemented in {optimization!r}" - ) - - end = time.perf_counter() - begin - delta = len(model_proto.graph.node) - n_nodes - deltaf = len(model_proto.functions) - n_functions - if stats: - stats[f"opt_{value}_time"] = end - stats[f"opt_{value}_dnodes"] = delta - stats[f"opt_{value}_dfunctions"] = deltaf - if verbose: - print( - f"[optimize_model_proto] {value} done in {end} " - f"with +/- {delta} nodes, +/- {deltaf} functions" - ) - if rule_sets: - model_proto = apply_rule_sets(model_proto, rule_sets, stats=stats, verbose=verbose) - - return model_proto - - -def run_inference( - model: Any, - example_inputs: Sequence[Any], - warmup: int = 5, - repeat: int = 5, - verbose: int = 0, -) -> dict[str, Any]: - """ - Runs multiple times the same inference. - - Args: - model: torch model to run - example_inputs: dummy inputs - warmup: number of iterations to warmup - repeat: number of iterations to repeat - verbose: verbosity - - Returns: - statistcs - """ - if verbose: - print(f"[run_inference] start {warmup} warmup iterations") - - stats: dict[str, Any] = {} - iterations: list[float] = [] - begin = time.perf_counter() - for i in range(warmup): - t0 = time.perf_counter() - model(*example_inputs[i % len(example_inputs)]) - iterations.append(time.perf_counter() - t0) - end = time.perf_counter() - begin - stats["warmup"] = warmup - stats["warmup_time"] = end - stats["warmup_iter"] = iterations - - if verbose: - print(f"[run_inference] warmup done in {time.perf_counter() - begin}") - print(f"[run_inference] start {repeat} iterations") - - iterations = [] - begin = time.perf_counter() - for i in range(warmup): - t0 = time.perf_counter() - model(*example_inputs[i % len(example_inputs)]) - iterations.append(time.perf_counter() - t0) - end = time.perf_counter() - begin - stats["repeat"] = repeat - stats["repeat_time"] = end - stats["repeat_iter"] = iterations - - if verbose: - print(f"[run_inference] measure done in {time.perf_counter() - begin}") - - return stats - - -class WrapInferenceSessionForTorch: - def __init__(self, sess: Any): - # onnxruntime is importing when needed as it takes a couple of seconds if it contains CUDA EP. - import onnxruntime - import torch - from onnxruntime.capi import _pybind_state as ORTC # noqa: N812 - - self.sess = sess - self.input_names = [i.name for i in sess.get_inputs()] - self.output_names = [i.name for i in sess.get_outputs()] - self.bind = onnxruntime.SessionIOBinding(sess._sess) - self.OrtValue = ORTC.OrtValue - self.ORTC = ORTC - self.torch = torch - self.run_options = onnxruntime.RunOptions() - - self.TORCH_DTYPE_TO_NUMPY_DTYPE = { - torch.float16: np.float16, - torch.float32: np.float32, - torch.float64: np.float64, - torch.uint8: np.uint8, - torch.int8: np.int8, - torch.int16: np.int16, - torch.int32: np.int32, - torch.int64: np.int64, - torch.bool: np.bool_, - } - - DEVICES = { - -1: ORTC.OrtDevice(ORTC.OrtDevice.cpu(), ORTC.OrtDevice.default_memory(), 0) - } - - if torch.cuda.is_available(): - for i in range(torch.cuda.device_count()): - DEVICES[i] = ORTC.OrtDevice( - ORTC.OrtDevice.cuda(), ORTC.OrtDevice.default_memory(), i - ) - - self.DEVICES = DEVICES - - def _get_ortvalues_from_torch_tensors( - self, - tensors: tuple[Any, ...], # tuple["torch.Tensor", ...], - n_outputs: int, - ) -> tuple[Any, Any]: # tuple[tuple["torch.Tensor", ...], tuple["OrtDevice", ...]]: - ortvalues = self.ORTC.OrtValueVector() - ortvalues.reserve(len(tensors)) - dtypes = [] - shapes = [] - data_ptrs = [] - devices = [] - - max_device = -1 - assert isinstance(max_device, int), f"unexpected type for device={max_device!r}" - assert tensors is not None, "tensors cannot be None" - new_tensors = [] - for tensor in tensors: - assert isinstance(tensor, self.torch.Tensor), f"Unexpected type {type(tensor)}" - dtypes.append(self.TORCH_DTYPE_TO_NUMPY_DTYPE[tensor.dtype]) - shapes.append(tensor.size()) - data_ptrs.append(tensor.data_ptr()) - d = tensor.get_device() - devices.append(self.DEVICES[d]) - new_tensors.append(tensor) - max_device = max(max_device, tensor.get_device()) - - ortvalues.push_back_batch(new_tensors, data_ptrs, dtypes, shapes, devices) - output_devices = [] - for _ in range(n_outputs): - dev = self.DEVICES[max_device] - output_devices.append(dev) - - return ortvalues, output_devices - - def _ortvalues_to_torch_tensor( - self, - ortvalues: Any, # "onnxruntime.OrtValueVector", - ) -> tuple[Any, ...]: # tuple["torch.Tensor", ...]: - if len(ortvalues) == 0: - return tuple() - - from torch._C import _from_dlpack - - if all(map(lambda i: ortvalues[i].has_value(), range(len(ortvalues)))): # noqa: C417 - res = ortvalues.to_dlpacks(_from_dlpack) - else: - res = [] - for i in range(len(ortvalues)): - res.append( - _from_dlpack(ortvalues[i].to_dlpack()) - if ortvalues[i].has_value() - else None - ) - return tuple(res) - - def run(self, output_names, feeds): - inputs = [feeds[i] for i in self.input_names] - return self.run_dlpack(*inputs, output_names=output_names) - - def run_dlpack(self, *inputs, output_names=None): - if output_names is None: - output_names = self.output_names - ortvalues, output_devices = self._get_ortvalues_from_torch_tensors( - inputs, len(output_names) - ) - - ort_outputs = self.ORTC.OrtValueVector() - self.sess.run_with_ortvaluevector( - self.run_options, - self.input_names, - ortvalues, - output_names, - ort_outputs, - output_devices, - ) - pth_outputs = self._ortvalues_to_torch_tensor(ort_outputs) - return pth_outputs - - -def run_onnx_inference( - model: onnx.ModelProto, - example_inputs: Sequence[Any], - warmup: int = 5, - repeat: int = 5, - verbose: int = 0, - ort_optimize: bool = True, - torch_model: Any | None = None, -) -> dict[str, Any]: - """ - Runs multiple times the same inference with onnxruntime. - - Args: - model: torch model to run - example_inputs: dummy inputs - warmup: number of iterations to warmup - repeat: number of iterations to repeat - verbose: verbosity - ort_optimize: enable, disable onnxruntime optimizations - torch_model: if not empty, measure the discrepancies - - Returns: - statistcs - """ - stats: dict[str, Any] = {} - device = example_inputs[0][0].get_device() - providers = ( - ["CUDAExecutionProvider", "CPUExecutionProvider"] - if device >= 0 - else ["CPUExecutionProvider"] - ) - stats["providers"] = ",".join(providers) - if verbose: - print(f"[run_inference] create session with providers {providers!r}") - - begin = time.perf_counter() - # onnxruntime is importing when needed as it takes a couple of seconds if it contains CUDA EP. - import onnxruntime - - so = onnxruntime.SessionOptions() - if ort_optimize: - so.add_session_config_entry("session.disable_aot_function_inlining", "0") - so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - else: - so.add_session_config_entry("session.disable_aot_function_inlining", "1") - so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL - - sess = onnxruntime.InferenceSession(model.SerializeToString(), so, providers) - wrapped_session = WrapInferenceSessionForTorch(sess) - - end = time.perf_counter() - begin - stats["ort_session_create_time"] = end - if verbose: - print(f"[run_inference] created session in {end}") - print(f"[run_inference] start {warmup} warmup iterations") - - if torch_model: - expected = [ - torch_model(*example_inputs[i % len(example_inputs)]) for i in range(warmup) - ] - - got = [] - iterations = [] - begin = time.perf_counter() - for i in range(warmup): - t0 = time.perf_counter() - got.append(wrapped_session.run_dlpack(*example_inputs[i % len(example_inputs)])) - iterations.append(time.perf_counter() - t0) - end = time.perf_counter() - begin - stats["warmup"] = warmup - stats["warmup_time"] = end / warmup - stats["warmup_iter"] = iterations - if torch_model: - abs_err, rel_err = measure_discrepancies(expected, got) - stats["discrepancies_abs"] = abs_err - stats["discrepancies_rel"] = rel_err - - if verbose: - print(f"[run_inference] warmup done in {time.perf_counter() - begin}") - print(f"[run_inference] start {repeat} iterations") - - iterations = [] - begin = time.perf_counter() - for i in range(repeat): - t0 = time.perf_counter() - wrapped_session.run_dlpack(*example_inputs[i % len(example_inputs)]) - iterations.append(time.perf_counter() - t0) - end = time.perf_counter() - begin - stats["repeat"] = repeat - stats["repeat_time"] = end / repeat - stats["repeat_iter"] = iterations - - if verbose: - print(f"[run_inference] measure done in {time.perf_counter() - begin}") - - return stats - - -def multi_run(kwargs: dict[str, Any]) -> bool: - """Checks if multiple values were sent for one argument.""" - return any(isinstance(v, str) and "," in v for v in kwargs.values()) - - -def make_configs(kwargs: dict[str, Any]) -> list[dict[str, Any]]: - """Creates all the configurations based on the command line arguments.""" - print(kwargs) - args = [] - for k, v in kwargs.items(): - if isinstance(v, str): - args.append([(k, s) for s in v.split(",")]) - else: - args.append([(k, v)]) - configs = list(itertools.product(*args)) - return [dict(c) for c in configs] - - -def make_dataframe_from_benchmark_data(data: list[dict]) -> Any: - """Creates a dataframe from the received data.""" - import pandas - - return pandas.DataFrame(data) diff --git a/onnxscript/tools/benchmark/benchmark_helpers_test.py b/onnxscript/tools/benchmark/benchmark_helpers_test.py deleted file mode 100644 index ec88ffd9e1..0000000000 --- a/onnxscript/tools/benchmark/benchmark_helpers_test.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -import unittest - -import onnxscript.tools.benchmark.benchmark_helpers as bh - - -class BenchmarkHelperTest(unittest.TestCase): - def test_make_configs(self): - value = { - "warmup": 5, - "model": "llama,phi", - "device": "cpu,cuda", - "config": "medium", - "dump_folder": "", - } - self.assertTrue(bh.multi_run(value)) - configs = bh.make_configs(value) - expected = [ - { - "warmup": 5, - "model": "llama", - "device": "cpu", - "config": "medium", - "dump_folder": "", - }, - { - "warmup": 5, - "model": "llama", - "device": "cuda", - "config": "medium", - "dump_folder": "", - }, - { - "warmup": 5, - "model": "phi", - "device": "cpu", - "config": "medium", - "dump_folder": "", - }, - { - "warmup": 5, - "model": "phi", - "device": "cuda", - "config": "medium", - "dump_folder": "", - }, - ] - self.assertEqual(expected, configs) - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/onnxscript/tools/benchmark/benchmark_run.py b/onnxscript/tools/benchmark/benchmark_run.py deleted file mode 100644 index f961b9b320..0000000000 --- a/onnxscript/tools/benchmark/benchmark_run.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# pylint: disable=consider-using-with,import-outside-toplevel -from __future__ import annotations - -import multiprocessing -import os -import platform -import re -import subprocess -import sys - - -class BenchmarkError(RuntimeError): - pass - - -def get_machine() -> dict[str, str | int | float | tuple[int, int]]: - """Returns the machine specification.""" - config: dict[str, str | int | float | tuple[int, int]] = dict( - machine=str(platform.machine()), - processor=str(platform.processor()), - version=str(sys.version), - config=int(multiprocessing.cpu_count()), - executable=str(sys.executable), - ) - try: - import torch.cuda - except ImportError: - return config - - config["has_cuda"] = bool(torch.cuda.is_available()) - if config["has_cuda"]: - config["capability"] = torch.cuda.get_device_capability(0) - config["device_name"] = str(torch.cuda.get_device_name(0)) - return config - - -def _cmd_line(script_name: str, **kwargs: dict[str, str | int | float]) -> list[str]: - args = [sys.executable, "-m", script_name] - for k, v in kwargs.items(): - args.append(f"--{k}") - args.append(str(v)) - return args - - -def _extract_metrics(text: str) -> dict[str, str]: - reg = re.compile(r":(.*?),(.*.?);") - res = reg.findall(text) - if len(res) == 0: - return {} - return dict(res) - - -def _make_prefix(script_name: str, index: int) -> str: - name = os.path.splitext(script_name)[0] - return f"{name}_dort_c{index}_" - - -def run_benchmark( - script_name: str, - configs: list[dict[str, str | int | float]], - verbose: int = 0, - stop_if_exception: bool = True, - dort_dump: bool = False, -) -> list[dict[str, str | int | float | tuple[int, int]]]: - """ - Runs a script multiple times and extract information from the output - following the pattern ``:,;``. - - :param script_name: python script to run - :param configs: list of execution to do - :param stop_if_exception: stop if one experiment failed, otherwise continue - :param verbose: use tqdm to follow the progress - :param dort_dump: dump onnx file if dort is used - :return: values - """ - if verbose: - try: - from tqdm import tqdm - - loop = tqdm(configs) - except ImportError: - loop = configs - else: - loop = configs - - data: list[dict[str, str | int | float | tuple[int, int]]] = [] - for i, config in enumerate(loop): - cmd = _cmd_line(script_name, **config) - - if dort_dump: - os.environ["ONNXRT_DUMP_PATH"] = _make_prefix(script_name, i) - else: - os.environ["ONNXRT_DUMP_PATH"] = "" - if verbose > 3: - print(f"[run_benchmark] cmd={cmd if isinstance(cmd, str) else ' '.join(cmd)}") - - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - try: - res = p.communicate(timeout=30) - out, err = res - serr = err.decode("utf-8", errors="ignore") - except subprocess.TimeoutExpired as e: - p.kill() - res = p.communicate() - out, err = res - serr = f"{e}\n:timeout,1;{err.decode('utf-8', errors='ignore')}" - sout = out.decode("utf-8", errors="ignore") - - if "ONNXRuntimeError" in serr or "ONNXRuntimeError" in sout: - if stop_if_exception: # pylint: disable=no-else-raise - raise RuntimeError( - f"Unable to continue with config {config} due to the " - f"following error\n{serr}" - f"\n----OUTPUT--\n{sout}" - ) - - metrics = _extract_metrics(sout) - if len(metrics) == 0: - if stop_if_exception: # pylint: disable=no-else-raise - raise BenchmarkError( - f"Unable (2) to continue with config {config}, no metric was " - f"collected.\n--ERROR--\n{serr}\n--OUTPUT--\n{sout}" - ) - else: - metrics = {} - metrics.update(config) - metrics["ERROR"] = serr - metrics["OUTPUT"] = sout - metrics["CMD"] = f"[{' '.join(cmd)}]" - data.append(metrics) # type: ignore[arg-type] - if verbose > 5: - print("--------------- ERROR") - print(serr) - if verbose >= 10: - print("--------------- OUTPUT") - print(sout) - - return data diff --git a/onnxscript/tools/benchmark/export_model.py b/onnxscript/tools/benchmark/export_model.py deleted file mode 100644 index b6bbc37fd6..0000000000 --- a/onnxscript/tools/benchmark/export_model.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# pylint: disable=import-outside-toplevel - -import hashlib -import pprint -import textwrap -import time -from typing import Any - - -def main(args=None): - import onnxscript.tools.benchmark - - kwargs: dict[str, Any] = onnxscript.tools.benchmark.get_parsed_args( - "export_model", - description=textwrap.dedent( - """Measures the inference time for a particular model. - This script can be used to quickly evaluate the improvment made by a pattern optimization - for a particular model. - - If one value contains ",", the script understand multiple commands - must be run. It computes all the possible configurations. - In that case, it produces a csv file (if output_data is not empty) with all the results. - - Example with a large phi model:: - - python -m onnxscript.tools.benchmark.export_model --model phi --device cuda --config large --num_hidden_layers=6 --dtype=float32 --dynamic=0 --verbose=1 --exporter=dynamo - - Example with a medium llama model:: - - python -m onnxscript.tools.benchmark.export_model --model llama --device cuda --config medium --num_hidden_layers=1 --dtype=float32 --dynamic=0 --verbose=1 --exporter=dynamo --optimization=rewrite/optimize/inline/llama0/onnxruntime - """ - ), - repeat=(10, "number of inferences to measure"), - warmup=(5, "number of inferences to warm"), - model=("phi", "model to measure, llama, mistral, phi, ..."), - exporter=("dynamo", "script, dynamo"), - device=("cpu", "'cpu' or 'cuda'"), - target_opset=(18, "opset to convert into, use with backend=custom"), - config=("small", "default, medium, or small to test"), - verbose=(0, "verbosity"), - dump_folder=("", "if not empty, dump the model in that folder"), - dump_ort=(1, "produce the model optimized by onnxruntime"), - ort_optimize=(1, "enable or disable onnxruntime optimization"), - dtype=("default", "cast the model and the inputs into this type"), - dynamic=(0, "use dynamic shapes"), - num_hidden_layers=(1, "number of hidden layers"), - with_mask=(1, "with or without mask, dynamo may fail with a mask"), - optimization=( - "", - "optimization scenario, comma separated value, optimize, rewrite, " - "inline, set of patterns (default, onnxruntime, customops)", - ), - implementation=("eager", "eager or sdpa"), - memory_peak=(0, "measure the memory peak during conversion"), - output_data=( - "export_model.csv", - "produces a csv file with the data if multiple configurations are tested", - ), - new_args=args, - ) - if onnxscript.tools.benchmark.multi_run(kwargs): - import onnxscript.tools.benchmark.benchmark_run - - configs = onnxscript.tools.benchmark.make_configs(kwargs) - data = onnxscript.tools.benchmark.benchmark_run.run_benchmark( - "onnxscript.tools.benchmark.export_model", - configs, - kwargs["verbose"], - stop_if_exception=False, - ) - if kwargs["verbose"] > 2: - pprint.pprint(data if kwargs["verbose"] > 3 else data[:2]) - if kwargs["output_data"]: - df = onnxscript.tools.benchmark.make_dataframe_from_benchmark_data(data) - df.to_csv(kwargs["output_data"], index=False) - df.to_excel(kwargs["output_data"] + ".xlsx", index=False) - if kwargs["verbose"]: - print(df) - else: - print("-------------------") - print("[export_model]") - pprint.pprint(kwargs) - print("-------------------") - - # Import is delayed so that help is being display faster (without having to import heavy packages). - import onnxscript.tools - import onnxscript.tools.memory_peak - import onnxscript.tools.transformers_models - - print( - f"[export_model] create the model and inputs for {kwargs['model']!r} and config {kwargs['config']!r}" - ) - begin = time.perf_counter() - model, example_inputs, dynamic_shapes = ( - onnxscript.tools.transformers_models.get_model_and_inputs( - warmup=kwargs["warmup"], - repeat=kwargs["repeat"], - model=kwargs["model"], - config=kwargs["config"], - dynamic_shapes=kwargs["dynamic"], - device=kwargs["device"], - num_hidden_layers=kwargs["num_hidden_layers"], - with_mask=kwargs["with_mask"], - implementation=kwargs["implementation"], - dtype=kwargs["dtype"], - ) - ) - print(f"[export_model] model created in {time.perf_counter() - begin}") - if kwargs["dynamic"]: - print(f"[export_model] dynamic_shapes={dynamic_shapes}") - msg = [tuple(i.shape for i in inp) for inp in example_inputs] - print(f"[export_model] input_shapes={msg}") - conversion: dict[str, Any] = {} - memory_stats: dict[str, float] = {} - - if kwargs["exporter"] == "eager": - print("[export_model] start benchmark") - begin = time.perf_counter() - result = onnxscript.tools.benchmark.run_inference( - model, - example_inputs, - warmup=kwargs["warmup"], - repeat=kwargs["repeat"], - verbose=kwargs["verbose"], - ) - print(f"[export_model] benchmark done in {time.perf_counter() - begin}") - else: - print( - f"[export_model] export to onnx with exporter={kwargs['exporter']!r} " - f"and optimization={kwargs['optimization']!r}" - ) - begin = time.perf_counter() - if kwargs["optimization"]: - m = hashlib.sha256() - m.update(kwargs["optimization"].encode()) - so = m.hexdigest()[:5] - else: - so = "" - name = "_".join( - [ - kwargs["model"], - kwargs["exporter"], - "dynamic" if kwargs["dynamic"] else "static", - kwargs["dtype"].replace("float", "fp"), - kwargs["device"], - kwargs["config"], - f"h{kwargs['num_hidden_layers']}", - so, - ], - ) - filename = f"em_{name}.onnx" - - memory_session = ( - onnxscript.tools.memory_peak.start_spying_on(cuda=kwargs["device"] == "cuda") - if kwargs["memory_peak"] - else None - ) - print(f"[export_model] start memory peak monitoring {memory_session}") - proto = onnxscript.tools.benchmark.common_export( - model=model, - inputs=example_inputs[0], - exporter=kwargs["exporter"], - target_opset=kwargs["target_opset"], - folder=kwargs["dump_folder"], - filename=filename, - dynamic_shapes=dynamic_shapes if kwargs["dynamic"] else None, - optimization=kwargs["optimization"], - verbose=kwargs["verbose"], - stats=conversion, - ) - print(f"[export_model] export to onnx done in {time.perf_counter() - begin}") - if memory_session is not None: - memory_results = memory_session.stop() - print(f"[export_model] ends memory monitoring {memory_results}") - memory_stats = onnxscript.tools.memory_peak.flatten( - memory_results, prefix="memory_" - ) - else: - memory_stats = {} - - result = onnxscript.tools.benchmark.run_onnx_inference( - proto, - example_inputs, - warmup=kwargs["warmup"], - repeat=kwargs["repeat"], - verbose=kwargs["verbose"], - ort_optimize=kwargs["ort_optimize"], - torch_model=model, - ) - - print("[export_model] end") - print("------------------------------") - for k, v in sorted(kwargs.items()): - print(f":{k},{v};") - for k, v in sorted(conversion.items()): - print(f":{k},{v};") - if memory_stats: - for k, v in memory_stats.items(): - print(f":{k},{v};") - for k, v in sorted(result.items()): - print(f":{k},{v};") - - -if __name__ == "__main__": - main() diff --git a/onnxscript/tools/benchmark/export_model_batch.py b/onnxscript/tools/benchmark/export_model_batch.py deleted file mode 100644 index 8dff49e0c9..0000000000 --- a/onnxscript/tools/benchmark/export_model_batch.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# pylint: disable=import-outside-toplevel - -from __future__ import annotations - -import pprint -import textwrap -from typing import Any - -import onnxscript.tools.benchmark - - -def main(args: list[str] | None = None): - kwargs: dict[str, Any] = onnxscript.tools.benchmark.get_parsed_args( - "export_model", - description=textwrap.dedent( - """Measures the inference time for a particular model. - It runs export_model to compare several optimization settings. - - Example:: - - python -m onnxscript.tools.benchmark.export_model_batch --model phi --device cuda --config medium --num_hidden_layers=1 --dtype=float32 --dynamic=0 --verbose=1 - """ - ), - repeat=(10, "number of inferences to measure"), - warmup=(5, "number of inferences to warm"), - model=("phi", "model to measure, llama, mistral, phi, ..."), - device=("cpu", "'cpu' or 'cuda'"), - target_opset=(18, "opset to convert into, use with backend=custom"), - config=("small", "default, medium, or small to test"), - verbose=(0, "verbosity"), - dtype=("default", "cast the model and the inputs into this type"), - dynamic=(0, "use dynamic shapes"), - num_hidden_layers=(1, "number of hidden layers"), - with_mask=(1, "with or without mask, dynamo may fail with a mask"), - implementation=("eager", "eager or sdpa"), - new_args=args, - ) - - print("-------------------") - print("[export_model]") - pprint.pprint(kwargs) - print("-------------------") - - import pandas - - try: - import openpyxl - except ImportError: - openpyxl = None - - from onnxscript.tools.benchmark.benchmark_helpers import ( - BenchmarkError, - run_benchmark, - ) - - script_name = "onnxscript.tools.benchmark.export_model" - - configs: list[dict[str, Any]] = [ - dict(exporter="eager"), - dict(ort_optimize=1, exporter="script"), - dict(ort_optimize=1, optimization="optimize/rewrite/inline", exporter="script"), - dict(ort_optimize=0, optimization="optimize/rewrite/inline", exporter="script"), - dict(ort_optimize=1, optimization="", exporter="dynamo"), - dict(ort_optimize=1, optimization="optimize/rewrite/inline", exporter="dynamo"), - dict(ort_optimize=0, optimization="optimize/rewrite/inline", exporter="dynamo"), - ] - common_kwargs: dict[str, Any] = kwargs.copy() - common_kwargs["verbose"] = max(common_kwargs["verbose"] - 1, 0) - for c in configs: - c.update(common_kwargs) - - if kwargs["verbose"]: - for i, cf in enumerate(configs): - print(f"[export_common_batch] config {i + 1}: {cf}") - - ################################ - # Running configuration. - - try: - data = run_benchmark( - script_name, - configs, - verbose=kwargs["verbose"], - stop_if_exception=False, - ) - data_collected = True - except BenchmarkError as e: - if kwargs["verbose"]: - print(e) - data_collected = False - - prefix = "_".join( - [ - "emb_", - kwargs["model"], - "dynamic" if kwargs["dynamic"] else "static", - kwargs["dtype"].replace("float", "fp"), - kwargs["device"], - kwargs["config"], - f"h{kwargs['num_hidden_layers']}", - ], - ) - - if data_collected: - df = pandas.DataFrame(data) - df = df.drop(["OUTPUT", "ERROR"], axis=1) - df["repeat_time"] = df["repeat_time"].astype(float) - df_eager = df[(df["implementation"] == "eager") & (df["exporter"] == "eager")][ - "repeat_time" - ].dropna() - if df_eager.shape[0] > 0: - min_eager = df_eager.min() - df["increase"] = df["repeat_time"] / min_eager - 1 - filename = f"{prefix}_with_cmd.csv" - df.to_csv(filename, index=False) - - df = df.drop(["CMD"], axis=1) - filename = f"{prefix}.csv" - df.to_csv(filename, index=False) - df = pandas.read_csv(filename) # to cast type - print(df) - - # summary - cs = [ - c - for c in ["exporter", "optimization", "warmup_time", "repeat_time", "increase"] - if c in df.columns - ] - dfs = df[cs] - if openpyxl: - filename = f"{prefix}_summary.xlsx" - dfs.to_excel(filename, index=False) - filename = f"{prefix}_summary.csv" - dfs.to_csv(filename, index=False) - print(dfs) - - ######################## - # First lines. - - print(df.head(2).T) - - -if __name__ == "__main__": - main() diff --git a/onnxscript/tools/benchmark/export_model_test.py b/onnxscript/tools/benchmark/export_model_test.py deleted file mode 100644 index 55698be67f..0000000000 --- a/onnxscript/tools/benchmark/export_model_test.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -import contextlib -import io -import unittest - -import onnxscript.tools.benchmark.export_model -import onnxscript.tools.transformers_models.phi3 -from onnxscript._internal.version_utils import ( - has_transformers, - is_onnxruntime_training, - torch_older_than, -) - -has_phi3 = onnxscript.tools.transformers_models.phi3.has_phi3 - - -class BenchmarkTest(unittest.TestCase): - @unittest.skipIf(not has_transformers(), reason="transformers missing") - def test_export_model_phi_cpu_eager(self): - args = [ - "--verbose", - "1", - "--config", - "medium", - "--dtype", - "float32", - "--device", - "cpu", - "--exporter", - "eager", - "--model", - "phi", - ] - f = io.StringIO() - with contextlib.redirect_stdout(f): - onnxscript.tools.benchmark.export_model.main(args) - - out = f.getvalue() - self.assertIn(":repeat_time,", out) - - @unittest.skipIf(not has_transformers(), reason="transformers missing") - @unittest.skipIf(torch_older_than("2.4"), reason="fails to export") - @unittest.skipIf(not is_onnxruntime_training(), reason="onnxruntime-training is needed") - def test_export_model_mistral_cpu_dynamo_llama0(self): - args = [ - "--verbose", - "1", - "--config", - "medium", - "--dtype", - "float32", - "--device", - "cpu", - "--exporter", - "dynamo", - "--optimization", - "rewrite/optimize/inline/llama0", - "--model", - "mistral", - ] - f = io.StringIO() - with contextlib.redirect_stdout(f): - onnxscript.tools.benchmark.export_model.main(args) - - out = f.getvalue() - self.assertIn(":repeat_time,", out) - - @unittest.skipIf(not has_transformers(), reason="transformers missing") - def test_export_model_llama_cpu_eager(self): - args = [ - "--verbose", - "1", - "--config", - "medium", - "--dtype", - "float32", - "--device", - "cpu", - "--exporter", - "eager", - "--model", - "llama", - ] - f = io.StringIO() - with contextlib.redirect_stdout(f): - onnxscript.tools.benchmark.export_model.main(args) - - out = f.getvalue() - self.assertIn(":repeat_time,", out) - - @unittest.skipIf(not has_transformers(), reason="transformers missing") - @unittest.skipIf(not is_onnxruntime_training(), reason="onnxruntime-training is needed") - @unittest.skipIf( - torch_older_than("2.4"), - reason="TypeError: _functionalize_sync(): " - "argument 't' (position 1) must be Tensor, not NoneType", - ) - def test_export_model_phi_cpu_dynamo(self): - args = [ - "--verbose", - "1", - "--config", - "medium", - "--dtype", - "float32", - "--device", - "cpu", - "--exporter", - "dynamo", - "--model", - "phi", - ] - f = io.StringIO() - with contextlib.redirect_stdout(f): - onnxscript.tools.benchmark.export_model.main(args) - - out = f.getvalue() - self.assertIn(":repeat_time,", out) - - @unittest.skipIf(not has_transformers(), reason="transformers missing") - @unittest.skipIf(not is_onnxruntime_training(), reason="onnxruntime-training is needed") - def test_export_model_phi_cpu_script(self): - args = [ - "--verbose", - "1", - "--config", - "medium", - "--dtype", - "float32", - "--device", - "cpu", - "--exporter", - "script", - "--model", - "phi", - ] - f = io.StringIO() - with contextlib.redirect_stdout(f): - onnxscript.tools.benchmark.export_model.main(args) - - out = f.getvalue() - self.assertIn(":repeat_time,", out) - - @unittest.skipIf(not has_transformers(), reason="transformers missing") - @unittest.skipIf(torch_older_than("2.4"), reason="fails to export") - @unittest.skipIf(not is_onnxruntime_training(), reason="onnxruntime-training is needed") - def test_export_model_phi_cpu_dynamo_llama0(self): - args = [ - "--verbose", - "1", - "--config", - "medium", - "--dtype", - "float32", - "--device", - "cpu", - "--exporter", - "dynamo", - "--optimization", - "rewrite/optimize/inline/llama0/onnxruntime", - "--model", - "phi", - ] - f = io.StringIO() - with contextlib.redirect_stdout(f): - onnxscript.tools.benchmark.export_model.main(args) - - out = f.getvalue() - self.assertIn(":repeat_time,", out) - - @unittest.skipIf(not has_transformers(), reason="transformers missing") - @unittest.skipIf(torch_older_than("2.4"), reason="Fails to export with torch<2.4") - @unittest.skipIf(not is_onnxruntime_training(), reason="onnxruntime-training is needed") - @unittest.skipIf( - not has_phi3(), reason="transformers is not recent enough to contain the phi3 model" - ) - def test_export_model_phi3_cpu_dynamo_llama0(self): - args = [ - "--verbose", - "1", - "--config", - "medium", - "--dtype", - "float32", - "--device", - "cpu", - "--exporter", - "dynamo", - "--optimization", - "rewrite/optimize/inline/llama0", - "--model", - "phi3", - ] - f = io.StringIO() - with contextlib.redirect_stdout(f): - onnxscript.tools.benchmark.export_model.main(args) - - out = f.getvalue() - self.assertIn(":repeat_time,", out) - - -if __name__ == "__main__": - unittest.main(verbosity=2)