diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py
index a18535c10591d..bad5d4f89b7f0 100644
--- a/onnxruntime/python/tools/transformers/optimizer.py
+++ b/onnxruntime/python/tools/transformers/optimizer.py
@@ -60,6 +60,7 @@ def optimize_by_onnxruntime(
     optimized_model_path: Optional[str] = None,
     opt_level: Optional[int] = 99,
     disabled_optimizers=[],
+    verbose=False,
 ) -> str:
     """
     Use onnxruntime to optimize model.
@@ -98,6 +99,10 @@ def optimize_by_onnxruntime(

     sess_options.optimized_model_filepath = optimized_model_path

+    if verbose:
+        print("Using onnxruntime to optimize model - log severity level set to verbose")
+        sess_options.log_severity_level = 0
+
     kwargs = {}
     if disabled_optimizers:
         kwargs["disabled_optimizers"] = disabled_optimizers
@@ -114,7 +119,6 @@ def optimize_by_onnxruntime(
         elif torch_version.hip:
             gpu_ep.append("MIGraphXExecutionProvider")
             gpu_ep.append("ROCMExecutionProvider")
-            session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=gpu_ep, **kwargs)
         assert not set(onnxruntime.get_available_providers()).isdisjoint(
             ["CUDAExecutionProvider", "ROCMExecutionProvider", "MIGraphXExecutionProvider"]
         )
@@ -189,6 +193,7 @@ def optimize_model(
     opt_level: Optional[int] = None,
     use_gpu: bool = False,
     only_onnxruntime: bool = False,
+    verbose=False,
 ):
     """Optimize Model by OnnxRuntime and/or python fusion logic.

@@ -260,6 +265,7 @@ def optimize_model(
             use_gpu=use_gpu,
             opt_level=opt_level,
             disabled_optimizers=disabled_optimizers,
+            verbose=verbose,
         )
     elif opt_level == 1:
         # basic optimizations (like constant folding and cast elimination) are not specified to execution provider.
@@ -269,6 +275,7 @@ def optimize_model(
             use_gpu=False,
             opt_level=1,
             disabled_optimizers=disabled_optimizers,
+            verbose=verbose,
         )

     if only_onnxruntime and not temp_model_path:
diff --git a/onnxruntime/test/python/transformers/parity_utilities.py b/onnxruntime/test/python/transformers/parity_utilities.py
index b61f9fbcf2b61..2443bbfd760ce 100644
--- a/onnxruntime/test/python/transformers/parity_utilities.py
+++ b/onnxruntime/test/python/transformers/parity_utilities.py
@@ -3,7 +3,7 @@
 # Licensed under the MIT License. See License.txt in the project root for
 # license information.
 # --------------------------------------------------------------------------
-
+import argparse
 import os
 import sys

@@ -11,6 +11,36 @@
 import torch


+def parse_arguments(namespace_filter=None):
+
+    parser = argparse.ArgumentParser()
+
+    # useful for EPs that don't require the use of optimizer.py
+    parser.add_argument(
+        "-n",
+        "--no_optimize",
+        required=False,
+        action="store_false",
+        default=True,
+        dest="optimize",
+        help="Turn off onnxruntime optimizers (default: optimizers ON)",
+    )
+
+    # useful for debugging and viewing state during test runs
+    parser.add_argument(
+        "-l",
+        "--log_verbose",
+        required=False,
+        action="store_true",
+        default=False,
+        help="Set onnxruntime log_severity_level=0 (VERBOSE)",
+    )
+
+    args, remaining_args = parser.parse_known_args(namespace=namespace_filter)
+
+    return args, sys.argv[:1] + remaining_args
+
+
 def find_transformers_source(sub_dir_paths=[]):
     source_dir = os.path.join(
         os.path.dirname(__file__),
@@ -74,13 +104,16 @@ def optimize_onnx(
     expected_op=None,
     use_gpu=False,
     opt_level=None,
+    verbose=False,
 ):
     if find_transformers_source():
         from optimizer import optimize_model
     else:
         from onnxruntime.transformers.optimizer import optimize_model

-    onnx_model = optimize_model(input_onnx_path, model_type="gpt2", use_gpu=use_gpu, opt_level=opt_level)
+    onnx_model = optimize_model(
+        input_onnx_path, model_type="gpt2", use_gpu=use_gpu, opt_level=opt_level, verbose=verbose
+    )
     onnx_model.save_model_to_file(optimized_onnx_path)

     if expected_op is not None:
@@ -130,21 +163,26 @@ def compare_outputs(torch_outputs, ort_outputs, atol=1e-06, verbose=True):
     return is_all_close, max(max_abs_diff)


-def create_ort_session(onnx_model_path, use_gpu=True):
+def create_ort_session(onnx_model_path, use_gpu=True, optimized=True, verbose=False):
     from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
-    from onnxruntime import __version__ as onnxruntime_version

     sess_options = SessionOptions()
     sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
     sess_options.intra_op_num_threads = 2
     sess_options.log_severity_level = 2
+
+    if verbose:
+        sess_options.log_severity_level = 0
+
     execution_providers = ["CPUExecutionProvider"]
     if use_gpu:
         if torch.version.cuda:
             execution_providers.append("CUDAExecutionProvider")
         elif torch.version.hip:
-            execution_providers.append("MIGraphXExecutionProvider")
+            if not optimized:
+                execution_providers.append("MIGraphXExecutionProvider")
+            execution_providers.append("ROCMExecutionProvider")

     return InferenceSession(onnx_model_path, sess_options, providers=execution_providers)
@@ -172,7 +210,7 @@ def run_parity(
     passed_cases = 0
     max_diffs = []
     printed = False  # print only one sample
-    ort_session = create_ort_session(onnx_model_path, device.type == "cuda")
+    ort_session = create_ort_session(onnx_model_path, device.type == "cuda", optimized=optimized, verbose=verbose)

     for i in range(test_cases):
         input_hidden_states = create_inputs(batch_size, sequence_length, hidden_size, float16, device)
diff --git a/onnxruntime/test/python/transformers/test_parity_gelu.py b/onnxruntime/test/python/transformers/test_parity_gelu.py
index 7fe42dc76f193..4da7c2f36a999 100644
--- a/onnxruntime/test/python/transformers/test_parity_gelu.py
+++ b/onnxruntime/test/python/transformers/test_parity_gelu.py
@@ -85,6 +85,7 @@ def run(
     formula=0,
     sequence_length=2,
     fp32_gelu_op=True,
+    verbose=False,
 ):
     test_name = f"device={device}, float16={float16}, optimized={optimized}, batch_size={batch_size}, sequence_length={sequence_length}, hidden_size={hidden_size}, formula={formula}, fp32_gelu_op={fp32_gelu_op}"
     print(f"\nTesting: {test_name}")
@@ -108,6 +109,7 @@ def run(
             Gelu.get_fused_op(formula),
             use_gpu=use_gpu,
             opt_level=2 if use_gpu else None,
+            verbose=verbose,
         )
         onnx_path = optimized_onnx_path
     else:
@@ -123,7 +125,7 @@ def run(
         device,
         optimized,
         test_cases,
-        verbose=False,
+        verbose,
     )

     # clean up onnx file
@@ -135,8 +137,10 @@ def run(


 class TestGeluParity(unittest.TestCase):
+    verbose = False
+    optimized = True
+
     def setUp(self):
-        self.optimized = True  # Change it to False if you want to test parity of non optimized ONNX
         self.test_cases = 100  # Number of test cases per test run
         self.sequence_length = 2
         self.hidden_size = 768
@@ -159,6 +163,7 @@ def run_test(
         formula,
         enable_assert=True,
         fp32_gelu_op=True,
+        verbose=False,
     ):
         if float16 and device.type == "cpu":  # CPU does not support FP16
             return
@@ -172,11 +177,12 @@ def run_test(
             formula,
             self.sequence_length,
             fp32_gelu_op,
+            verbose,
         )
         if enable_assert:
             self.assertTrue(num_failure == 0, "Failed: " + test_name)

-    def run_one(self, optimized, device, hidden_size=768, formula=0):
+    def run_one(self, optimized, device, hidden_size=768, formula=0, verbose=False):
         for batch_size in [4]:
             self.run_test(
                 batch_size,
@@ -186,6 +192,7 @@ def run_one(self, optimized, device, hidden_size=768, formula=0):
                 device=device,
                 formula=formula,
                 enable_assert=formula in self.formula_must_pass,
+                verbose=verbose,
             )

             self.run_test(
@@ -197,6 +204,7 @@ def run_one(self, optimized, device, hidden_size=768, formula=0):
                 formula=formula,
                 enable_assert=formula in self.formula_must_pass,
                 fp32_gelu_op=True,
+                verbose=verbose,
             )

             self.run_test(
@@ -208,12 +216,13 @@ def run_one(self, optimized, device, hidden_size=768, formula=0):
                 formula=formula,
                 enable_assert=formula in self.formula_must_pass,
                 fp32_gelu_op=False,
+                verbose=verbose,
             )

     def test_cpu(self):
         cpu = torch.device("cpu")
         for i in self.formula_to_test:
-            self.run_one(self.optimized, cpu, hidden_size=self.hidden_size, formula=i)
+            self.run_one(self.optimized, cpu, hidden_size=self.hidden_size, formula=i, verbose=self.verbose)

     def test_cuda(self):
         if not torch.cuda.is_available():
@@ -223,8 +232,13 @@ def test_cuda(self):
         else:
             gpu = torch.device("cuda")
             for i in self.formula_to_test:
-                self.run_one(self.optimized, gpu, hidden_size=self.hidden_size, formula=i)
+                self.run_one(self.optimized, gpu, hidden_size=self.hidden_size, formula=i, verbose=self.verbose)


 if __name__ == "__main__":
-    unittest.main()
+    args, remaining_args = parse_arguments(namespace_filter=unittest)
+
+    TestGeluParity.verbose = args.log_verbose
+    TestGeluParity.optimized = args.optimize
+
+    unittest.main(argv=remaining_args)
diff --git a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py
index c29cf969734c4..33ff8079a1af3 100644
--- a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py
+++ b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py
@@ -9,7 +9,6 @@
 # Licensed under the MIT License. See License.txt in the project root for
 # license information.
 # -------------------------------------------------------------------------
-
 import os
 import random
 import unittest
@@ -19,7 +18,7 @@
 import pytest
 import torch
 from onnx import helper
-from parity_utilities import compare_outputs, create_ort_session, diff_outputs
+from parity_utilities import compare_outputs, create_ort_session, parse_arguments
 from torch import nn
 from transformers.modeling_utils import Conv1D

@@ -308,6 +307,7 @@ def verify_attention(
     padding_length,
     optimized,
     test_cases=100,
+    verbose=False,
 ):
     print(
         f"optimized={optimized}, batch_size={batch_size}, hidden_size={hidden_size}, num_attention_heads={num_attention_heads}, sequence_length={sequence_length}, past_sequence_length={past_sequence_length}, float16={float16}, padding_length={padding_length}, device={device}"
     )
@@ -315,7 +315,7 @@ def verify_attention(
     passed_cases = 0
     max_diffs = []
-    ort_session = create_ort_session(onnx_model_path, device.type == "cuda")
+    ort_session = create_ort_session(onnx_model_path, device.type == "cuda", verbose=verbose)
     for i in range(test_cases):
         input_hidden_states, attention_mask, layer_past = create_inputs(
             batch_size,
@@ -350,7 +350,7 @@ def verify_attention(
     return test_cases - passed_cases


-def run(batch_size, float16, optimized, hidden_size, num_attention_heads, device, test_cases):
+def run(batch_size, float16, optimized, hidden_size, num_attention_heads, device, test_cases, verbose=False):
     test_name = f"batch_size={batch_size}, float16={float16}, optimized={optimized}, hidden_size={hidden_size}, num_attention_heads={num_attention_heads}"
     print(f"\nTesting ONNX parity: {test_name}")

@@ -392,6 +392,7 @@ def run(batch_size, float16, optimized, hidden_size, num_attention_heads, device
         padding_length,
         optimized,
         test_cases,
+        verbose,
     )

     # Test Case: with past state and padding last 2 words
@@ -411,6 +412,7 @@ def run(batch_size, float16, optimized, hidden_size, num_attention_heads, device
         padding_length,
         optimized,
         test_cases,
+        verbose,
     )

     # Test Case: random mask one word
@@ -430,6 +432,7 @@ def run(batch_size, float16, optimized, hidden_size, num_attention_heads, device
         padding_length,
         optimized,
         test_cases,
+        verbose,
     )

     # clean up onnx file
@@ -441,11 +444,13 @@ def run(batch_size, float16, optimized, hidden_size, num_attention_heads, device


 class TestGptAttentionHuggingfaceParity(unittest.TestCase):
+    verbose = False
+    optimized = True
+
     def setUp(self):
-        self.optimized = True  # Change it to False if you want to test parity of non optimized ONNX
         self.test_cases = 10  # Number of test cases per test run

-    def run_test(self, batch_size, float16, optimized, hidden_size, num_attention_heads, device):
+    def run_test(self, batch_size, float16, optimized, hidden_size, num_attention_heads, device, verbose=False):
         if float16 and device.type == "cpu":  # CPU does not support FP16
             return
         num_failure, test_name = run(
@@ -456,10 +461,11 @@ def run_test(self, batch_size, float16, optimized, hidden_size, num_attention_he
             num_attention_heads,
             device,
             self.test_cases,
+            verbose=verbose,
         )
         self.assertTrue(num_failure == 0, test_name)

-    def run_small(self, optimized, device):
+    def run_small(self, optimized, device, verbose=False):
         for batch_size in [64]:
             self.run_test(
                 batch_size,
@@ -468,6 +474,7 @@ def run_small(self, optimized, device):
                 hidden_size=768,
                 num_attention_heads=12,
                 device=device,
+                verbose=verbose,
             )
             self.run_test(
                 batch_size,
@@ -476,9 +483,10 @@ def run_small(self, optimized, device):
                 hidden_size=768,
                 num_attention_heads=12,
                 device=device,
+                verbose=verbose,
             )

-    def run_large(self, optimized, device):
+    def run_large(self, optimized, device, verbose=False):
         for batch_size in [2]:
             self.run_test(
                 batch_size,
@@ -487,6 +495,7 @@ def run_large(self, optimized, device):
                 hidden_size=4096,
                 num_attention_heads=32,
                 device=device,
+                verbose=verbose,
             )
             self.run_test(
                 batch_size,
@@ -495,11 +504,12 @@ def run_large(self, optimized, device):
                 hidden_size=4096,
                 num_attention_heads=32,
                 device=device,
+                verbose=verbose,
             )

     def test_cpu(self):
         cpu = torch.device("cpu")
-        self.run_small(self.optimized, cpu)
+        self.run_small(self.optimized, cpu, verbose=self.verbose)

     def test_cuda(self):
         if not torch.cuda.is_available():
@@ -508,7 +518,7 @@ def test_cuda(self):
             pytest.skip("test requires GPU and torch+cuda")
         else:
             gpu = torch.device("cuda")
-            self.run_small(self.optimized, gpu)
+            self.run_small(self.optimized, gpu, verbose=self.verbose)

     @pytest.mark.slow
     def test_large_cuda(self):
@@ -518,8 +528,13 @@ def test_large_cuda(self):
             pytest.skip("test requires GPU and torch+cuda")
         else:
             gpu = torch.device("cuda")
-            self.run_large(self.optimized, gpu)
+            self.run_large(self.optimized, gpu, verbose=self.verbose)


 if __name__ == "__main__":
-    unittest.main()
+    args, remaining_args = parse_arguments(namespace_filter=unittest)
+
+    TestGptAttentionHuggingfaceParity.verbose = args.log_verbose
+    TestGptAttentionHuggingfaceParity.optimized = args.optimize
+
+    unittest.main(argv=remaining_args)
diff --git a/onnxruntime/test/python/transformers/test_parity_layernorm.py b/onnxruntime/test/python/transformers/test_parity_layernorm.py
index 01122b4830bfa..48190fee150a9 100644
--- a/onnxruntime/test/python/transformers/test_parity_layernorm.py
+++ b/onnxruntime/test/python/transformers/test_parity_layernorm.py
@@ -155,11 +155,7 @@ def run(
     if optimized:
         optimized_onnx_path = "./temp/layer_norm_{}_formula{}_opt.onnx".format("fp16" if float16 else "fp32", formula)
         if (not float16) or cast_fp16:
-            optimize_onnx(
-                onnx_model_path,
-                optimized_onnx_path,
-                expected_op=LayerNorm.get_fused_op(),
-            )
+            optimize_onnx(onnx_model_path, optimized_onnx_path, expected_op=LayerNorm.get_fused_op(), verbose=verbose)
         else:
             if cast_onnx_only:
                 optimize_fp16_onnx_with_cast(onnx_model_path, optimized_onnx_path, epsilon=epsilon)
@@ -180,7 +176,7 @@ def run(
         device,
         optimized,
         test_cases,
-        verbose=verbose,
+        verbose,
     )

     # clean up onnx file
@@ -192,12 +188,13 @@ def run(


 class TestLayerNormParity(unittest.TestCase):
+    verbose = False
+    optimized = True
+
     def setUp(self):
-        self.optimized = True  # Change it to False if you want to test parity of non optimized ONNX
         self.test_cases = 100  # Number of test cases per test run
         self.sequence_length = 2
         self.hidden_size = 768
-        self.verbose = False

     def run_test(
         self,
@@ -211,6 +208,7 @@ def run_test(
         formula=0,
         epsilon=0.00001,
         enable_assert=True,
+        verbose=False,
     ):
         if float16 and device.type == "cpu":  # CPU does not support FP16
             return
@@ -227,12 +225,12 @@ def run_test(
             cast_fp16,
             cast_onnx_only,
             formula,
-            verbose=self.verbose,
+            verbose=verbose,
         )
         if enable_assert:
             self.assertTrue(num_failure == 0, "Failed: " + test_name)

-    def run_one(self, optimized, device, hidden_size=768, run_extra_tests=False):
+    def run_one(self, optimized, device, hidden_size=768, run_extra_tests=False, verbose=False):
         for batch_size in [4]:
             for formula in [0, 1]:
                 for epsilon in [1e-5]:  # [1e-5, 1e-12]
@@ -244,6 +242,7 @@ def run_one(self, optimized, device, hidden_size=768, run_extra_tests=False):
                         device=device,
                         formula=formula,
                         epsilon=epsilon,
+                        verbose=verbose,
                     )

                     self.run_test(
@@ -257,6 +256,7 @@ def run_one(self, optimized, device, hidden_size=768, run_extra_tests=False):
                         formula=formula,
                         epsilon=epsilon,
                         enable_assert=False,  # This setting has small chance to exceed tollerance threshold 0.001
+                        verbose=verbose,
                     )

                     if not run_extra_tests:
@@ -274,6 +274,7 @@ def run_one(self, optimized, device, hidden_size=768, run_extra_tests=False):
                         formula=formula,
                         epsilon=epsilon,
                         enable_assert=False,  # This setting cannot pass tollerance threshold
+                        verbose=verbose,
                     )

                     self.run_test(
@@ -287,11 +288,12 @@ def run_one(self, optimized, device, hidden_size=768, run_extra_tests=False):
                         formula=formula,
                         epsilon=epsilon,
                         enable_assert=False,  # This setting cannot pass tollerance threshold
+                        verbose=verbose,
                     )

     def test_cpu(self):
         cpu = torch.device("cpu")
-        self.run_one(self.optimized, cpu, hidden_size=self.hidden_size)
+        self.run_one(self.optimized, cpu, hidden_size=self.hidden_size, verbose=self.verbose)

     def test_cuda(self):
         if not torch.cuda.is_available():
@@ -300,8 +302,13 @@ def test_cuda(self):
             pytest.skip("test requires GPU and torch+cuda")
         else:
             gpu = torch.device("cuda")
-            self.run_one(self.optimized, gpu, hidden_size=self.hidden_size, run_extra_tests=True)
+            self.run_one(self.optimized, gpu, hidden_size=self.hidden_size, run_extra_tests=True, verbose=self.verbose)


 if __name__ == "__main__":
-    unittest.main()
+    args, remaining_args = parse_arguments(namespace_filter=unittest)
+
+    TestLayerNormParity.verbose = args.log_verbose
+    TestLayerNormParity.optimized = args.optimize
+
+    unittest.main(argv=remaining_args)
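
Usage note (a sketch, not part of the diff above): parse_arguments() consumes the new options before unittest sees argv, and parse_known_args() returns everything it does not recognize in remaining_args, which each test hands back to unittest.main(argv=...), so regular unittest test selection keeps working. Assuming the tests are invoked from onnxruntime/test/python/transformers, typical invocations would look roughly like:

    python test_parity_gelu.py -l                 # --log_verbose: run sessions with onnxruntime log_severity_level=0 (VERBOSE)
    python test_parity_gelu.py -n                 # --no_optimize: check parity against the non-optimized ONNX model
    python test_parity_layernorm.py -n -l TestLayerNormParity.test_cpu   # flags combined with a standard unittest test name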