diff --git a/.github/workflows/docker_builder.yml b/.github/workflows/docker_builder.yml index 817bc87c82..99b6efe53e 100644 --- a/.github/workflows/docker_builder.yml +++ b/.github/workflows/docker_builder.yml @@ -6,6 +6,7 @@ on: branches: - main - nightly + - release/2.1 # If pushes to main are made in rapid succession, # cancel existing docker builds and use newer commits diff --git a/tools/perf/README.md b/tools/perf/README.md index 452621e939..4d4579efb4 100644 --- a/tools/perf/README.md +++ b/tools/perf/README.md @@ -3,9 +3,10 @@ This is a comprehensive Python benchmark suite to run perf runs using different supported backends. Following backends are supported: 1. Torch -2. Torch-TensorRT -3. FX-TRT -4. TensorRT +2. Torch-TensorRT [Torchscript] +3. Torch-TensorRT [Dynamo] +4. Torch-TensorRT [torch_compile] +5. TensorRT Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package. @@ -22,9 +23,6 @@ Benchmark scripts depends on following Python packages in addition to requiremen ``` ./ -├── config -│ ├── vgg16_trt.yml -│ └── vgg16.yml ├── models ├── perf_run.py ├── hub.py @@ -35,10 +33,8 @@ Benchmark scripts depends on following Python packages in addition to requiremen ``` - -* `config` - Directory which contains sample yaml configuration files for VGG network. * `models` - Model directory -* `perf_run.py` - Performance benchmarking script which supports torch, torch_tensorrt, fx2trt, tensorrt backends +* `perf_run.py` - Performance benchmarking script which supports torch, ts_trt, torch_compile, dynamo, tensorrt backends * `hub.py` - Script to download torchscript models for VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT * `custom_models.py` - Script which includes custom models other than torchvision and timm (eg: HF BERT) * `utils.py` - utility functions script @@ -46,76 +42,11 @@ Benchmark scripts depends on following Python packages in addition to requiremen ## Usage -There are two ways you can run a performance benchmark. - -### Using YAML config files - -To run the benchmark for a given configuration file: - -```python -python perf_run.py --config=config/vgg16.yml -``` - -There are two sample configuration files added. - -* vgg16.yml demonstrates a configuration with all the supported backends (Torch, Torch-TensorRT, TensorRT) -* vgg16_trt.yml demonstrates how to use an external TensorRT serialized engine file directly. - - -### Supported fields - -| Name | Supported Values | Description | -| ----------------- | ------------------------------------ | ------------------------------------------------------------ | -| backend | all, torchscript, fx2trt, torch, torch_tensorrt, tensorrt | Supported backends for inference. "all" implies the last four methods in the list at left, and "torchscript" implies the last three (excludes fx path) | -| input | - | Input binding names. Expected to list shapes of each input bindings | -| model | - | Configure the model filename and name | -| model_torch | - | Name of torch model file and name (used for fx2trt) (optional) | -| filename | - | Model file name to load from disk. | -| name | - | Model name | -| runtime | - | Runtime configurations | -| device | 0 | Target device ID to run inference. Range depends on available GPUs | -| precision | fp32, fp16 or half, int8 | Target precision to run inference. 
int8 cannot be used with 'all' backend |
-| calibration_cache | - | Calibration cache file expected for torch_tensorrt runtime in int8 precision |
-
-Additional sample use case:
-
-```
-backend:
-  - torch
-  - torch_tensorrt
-  - tensorrt
-  - fx2trt
-input:
-  input0:
-    - 3
-    - 224
-    - 224
-  num_inputs: 1
-model:
-  filename: model.plan
-  name: vgg16
-model_torch:
-  filename: model_torch.pt
-  name: vgg16
-runtime:
-  device: 0
-  precision:
-    - fp32
-    - fp16
-```
-
-Note:
-
-1. Please note that measuring INT8 performance is only supported via a `calibration cache` file or QAT mode for `torch_tensorrt` backend.
-2. TensorRT engine filename should end with `.plan` otherwise it will be treated as Torchscript module.
-
-### Using CompileSpec options via CLI
-
 Here are the list of `CompileSpec` options that can be provided directly to compile the pytorch module
 
-* `--backends` : Comma separated string of backends. Eg: torch,torch_tensorrt,tensorrt,fx2trt
-* `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)). If the backend is `fx2trt`, the input should be a Pytorch module (instead of a torchscript module) and the options for model are (`vgg16` | `resnet50` | `efficientnet_b0`)
-* `--model_torch` : Name of the PyTorch model file (optional, only necessary if fx2trt is a chosen backend)
+* `--backends` : Comma separated string of backends. Eg: torch, ts_trt, dynamo, torch_compile, inductor, tensorrt
+* `--model` : Name of the model file (can be a torchscript module or a TensorRT engine ending in the `.plan` extension). If the backend is `dynamo` or `torch_compile`, the input should be a PyTorch module (instead of a torchscript module).
+* `--model_torch` : Name of the PyTorch model file (optional, only necessary if `dynamo` or `torch_compile` is a chosen backend)
 * `--inputs` : List of input shapes & dtypes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT
 * `--batch_size` : Batch size
 * `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16
@@ -131,10 +62,15 @@ Eg:
   --model_torch ${MODELS_DIR}/vgg16_torch.pt \
   --precision fp32,fp16 --inputs="(1, 3, 224, 224)@fp32" \
   --batch_size 1 \
-  --backends torch,torch_tensorrt,tensorrt,fx2trt \
+  --backends torch,ts_trt,dynamo,torch_compile,tensorrt \
   --report "vgg_perf_bs1.txt"
 ```
 
+Note:
+
+1. Measuring INT8 performance is only supported via a `calibration cache` file or QAT mode for the Torch-TensorRT (`ts_trt`) backend.
+2. The TensorRT engine filename should end with `.plan`; otherwise it will be treated as a Torchscript module.
+
 ### Example models
 
 This tool benchmarks any pytorch model or torchscript module. As an example, we provide VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT models in `hub.py` that we internally test for performance.
diff --git a/tools/perf/benchmark.sh b/tools/perf/benchmark.sh index fe7400945d..28094c7c3c 100644 --- a/tools/perf/benchmark.sh +++ b/tools/perf/benchmark.sh @@ -6,17 +6,33 @@ MODELS_DIR="models" python hub.py batch_sizes=(1 2 4 8 16 32 64 128 256) +large_model_batch_sizes=(1 2 4 8 16 32 64) -#Benchmark VGG16 model + +# Benchmark VGG16 model echo "Benchmarking VGG16 model" for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/vgg16_scripted.jit.pt \ - --model_torch ${MODELS_DIR}/vgg16_pytorch.pt \ + --model_torch vgg16 \ --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ --batch_size ${bs} \ - --backends torch,torch_tensorrt,tensorrt,fx2trt \ - --report "vgg_perf_bs${bs}.txt" + --truncate \ + --backends torch,ts_trt,dynamo,torch_compile,inductor \ + --report "vgg16_perf_bs${bs}.txt" +done + +# Benchmark AlexNet model +echo "Benchmarking AlexNet model" +for bs in ${batch_sizes[@]} +do + python perf_run.py --model ${MODELS_DIR}/alexnet_scripted.jit.pt \ + --model_torch alexnet \ + --precision fp32,fp16 --inputs="(${bs}, 3, 227, 227)" \ + --batch_size ${bs} \ + --truncate \ + --backends torch,ts_trt,dynamo,torch_compile,inductor \ + --report "alexnet_perf_bs${bs}.txt" done # Benchmark Resnet50 model @@ -24,11 +40,12 @@ echo "Benchmarking Resnet50 model" for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/resnet50_scripted.jit.pt \ - --model_torch ${MODELS_DIR}/resnet50_pytorch.pt \ + --model_torch resnet50 \ --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ --batch_size ${bs} \ - --backends torch,torch_tensorrt,tensorrt,fx2trt \ - --report "rn50_perf_bs${bs}.txt" + --truncate \ + --backends torch,ts_trt,dynamo,torch_compile,inductor \ + --report "resnet50_perf_bs${bs}.txt" done # Benchmark VIT model @@ -36,22 +53,50 @@ echo "Benchmarking VIT model" for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/vit_scripted.jit.pt \ + --model_torch vit \ --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ --batch_size ${bs} \ - --backends torch,torch_tensorrt,tensorrt \ + --truncate \ + --backends torch,ts_trt,dynamo,torch_compile,inductor \ --report "vit_perf_bs${bs}.txt" done +# Benchmark VIT Large model +echo "Benchmarking VIT Large model" +for bs in ${large_model_batch_sizes[@]} +do + python perf_run.py --model ${MODELS_DIR}/vit_large_scripted.jit.pt \ + --model_torch vit_large \ + --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ + --truncate \ + --batch_size ${bs} \ + --backends torch,ts_trt,dynamo,torch_compile,inductor \ + --report "vit_large_perf_bs${bs}.txt" +done + # Benchmark EfficientNet-B0 model echo "Benchmarking EfficientNet-B0 model" for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/efficientnet_b0_scripted.jit.pt \ - --model_torch ${MODELS_DIR}/efficientnet_b0_pytorch.pt \ + --model_torch efficientnet_b0 \ --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ --batch_size ${bs} \ - --backends torch,torch_tensorrt,tensorrt,fx2trt \ - --report "eff_b0_perf_bs${bs}.txt" + --truncate \ + --backends torch,ts_trt,dynamo,torch_compile,inductor \ + --report "efficientnet_b0_perf_bs${bs}.txt" +done + +# Benchmark Stable Diffusion UNet model +echo "Benchmarking SD UNet model" +for bs in ${large_model_batch_sizes[@]} +do + python perf_run.py --model_torch sd_unet \ + --precision fp32,fp16 --inputs="(${bs}, 4, 128, 128)@fp16;(${bs})@fp16;(${bs}, 1, 768)@fp16" \ + --batch_size ${bs} \ + --backends torch,dynamo,torch_compile,inductor \ + --truncate \ + --report "sd_unet_perf_bs${bs}.txt" done # 
Benchmark BERT model @@ -59,9 +104,10 @@ echo "Benchmarking Huggingface BERT base model" for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/bert_base_uncased_traced.jit.pt \ + --model_torch "bert_base_uncased" \ --precision fp32 --inputs="(${bs}, 128)@int32;(${bs}, 128)@int32" \ --batch_size ${bs} \ - --backends torch,torch_tensorrt \ + --backends torch,ts_trt,dynamo,torch_compile,inductor \ --truncate \ --report "bert_base_perf_bs${bs}.txt" done diff --git a/tools/perf/config/vgg16.yml b/tools/perf/config/vgg16.yml deleted file mode 100755 index d88d489458..0000000000 --- a/tools/perf/config/vgg16.yml +++ /dev/null @@ -1,19 +0,0 @@ ---- -backend: - - all -input: - input0: - - 1 - - 3 - - 224 - - 224 - num_inputs: 1 - batch_size: 1 -model: - filename: models/vgg16_scripted.jit.pt - name: vgg16 -runtime: - device: 0 - precision: - - fp32 - - fp16 diff --git a/tools/perf/config/vgg16_trt.yml b/tools/perf/config/vgg16_trt.yml deleted file mode 100755 index 9392e5c24a..0000000000 --- a/tools/perf/config/vgg16_trt.yml +++ /dev/null @@ -1,20 +0,0 @@ ---- -backend: - - tensorrt -input: - input0: - - 3 - - 224 - - 224 - num_inputs: 1 -model: - filename: models/model.plan - name: vgg16 -calibration_cache: - - vgg16.cache -runtime: - device: 0 - precision: - - fp32 - - fp16 - - int8 diff --git a/tools/perf/custom_models.py b/tools/perf/custom_models.py index a8b8a5dae0..0f85957e1e 100644 --- a/tools/perf/custom_models.py +++ b/tools/perf/custom_models.py @@ -1,10 +1,18 @@ import torch -import torch.nn as nn -from transformers import BertModel, BertTokenizer, BertConfig -import torch.nn.functional as F def BertModule(): + from transformers import BertModel + + model_name = "bert-base-uncased" + model = BertModel.from_pretrained(model_name, torchscript=True) + model.eval() + return model + + +def BertInputs(): + from transformers import BertTokenizer + model_name = "bert-base-uncased" enc = BertTokenizer.from_pretrained(model_name) text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" @@ -15,16 +23,13 @@ def BertModule(): segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] tokens_tensor = torch.tensor([indexed_tokens]) segments_tensors = torch.tensor([segments_ids]) - config = BertConfig( - vocab_size_or_config_json_file=32000, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - torchscript=True, + return [tokens_tensor, segments_tensors] + + +def StableDiffusionUnet(): + from diffusers import DiffusionPipeline + + pipe = DiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16 ) - model = BertModel(config) - model.eval() - model = BertModel.from_pretrained(model_name, torchscript=True) - traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) - return traced_model + return pipe.unet diff --git a/tools/perf/hub.py b/tools/perf/hub.py index 4cedaabded..22b685d1fc 100644 --- a/tools/perf/hub.py +++ b/tools/perf/hub.py @@ -1,12 +1,7 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchvision.models as models -import timm -from transformers import BertModel, BertTokenizer, BertConfig -import os import json -import custom_models as cm +import os + +import torch torch.hub._validate_not_a_forked_repo = lambda a, b, c: True @@ -25,25 +20,7 @@ VALID_PATHS = ("script", "trace", "torchscript", "pytorch", "all") # Key models selected for benchmarking with their respective paths -BENCHMARK_MODELS = { - "vgg16": { - "model": models.vgg16(weights=models.VGG16_Weights.DEFAULT), - "path": ["script", "pytorch"], - }, - "resnet50": { - "model": models.resnet50(weights=None), - "path": ["script", "pytorch"], - }, - "efficientnet_b0": { - "model": timm.create_model("efficientnet_b0", pretrained=True), - "path": ["script", "pytorch"], - }, - "vit": { - "model": timm.create_model("vit_base_patch16_224", pretrained=True), - "path": "script", - }, - "bert_base_uncased": {"model": cm.BertModule(), "path": "trace"}, -} +from utils import BENCHMARK_MODELS def get(n, m, manifest): @@ -51,42 +28,38 @@ def get(n, m, manifest): traced_filename = "models/" + n + "_traced.jit.pt" script_filename = "models/" + n + "_scripted.jit.pt" pytorch_filename = "models/" + n + "_pytorch.pt" - x = torch.ones((1, 3, 300, 300)).cuda() - if n == "bert_base_uncased": - traced_model = m["model"] - torch.jit.save(traced_model, traced_filename) + + m["model"] = m["model"].eval().cuda() + + # Get all desired model save specifications as list + paths = [m["path"]] if isinstance(m["path"], str) else m["path"] + + # Depending on specified model save specifications, save desired model formats + if any(path in ("all", "torchscript", "trace") for path in paths): + # (TorchScript) Traced model + trace_model = torch.jit.trace(m["model"], [inp.cuda() for inp in m["inputs"]]) + torch.jit.save(trace_model, traced_filename) manifest.update({n: [traced_filename]}) - else: - m["model"] = m["model"].eval().cuda() - - # Get all desired model save specifications as list - paths = [m["path"]] if isinstance(m["path"], str) else m["path"] - - # Depending on specified model save specifications, save desired model formats - if any(path in ("all", "torchscript", "trace") for path in paths): - # (TorchScript) Traced model - trace_model = torch.jit.trace(m["model"], [x]) - torch.jit.save(trace_model, traced_filename) - manifest.update({n: [traced_filename]}) - if any(path in ("all", "torchscript", "script") for path in paths): - # (TorchScript) Scripted 
model - script_model = torch.jit.script(m["model"]) - torch.jit.save(script_model, script_filename) - if n in manifest.keys(): - files = list(manifest[n]) if type(manifest[n]) != list else manifest[n] - files.append(script_filename) - manifest.update({n: files}) - else: - manifest.update({n: [script_filename]}) - if any(path in ("all", "pytorch") for path in paths): - # (PyTorch Module) model - torch.save(m["model"], pytorch_filename) - if n in manifest.keys(): - files = list(manifest[n]) if type(manifest[n]) != list else manifest[n] - files.append(script_filename) - manifest.update({n: files}) - else: - manifest.update({n: [script_filename]}) + if any(path in ("all", "torchscript", "script") for path in paths): + # (TorchScript) Scripted model + script_model = torch.jit.script(m["model"]) + torch.jit.save(script_model, script_filename) + if n in manifest.keys(): + files = list(manifest[n]) if type(manifest[n]) != list else manifest[n] + files.append(script_filename) + manifest.update({n: files}) + else: + manifest.update({n: [script_filename]}) + if any(path in ("all", "pytorch") for path in paths): + # (PyTorch Module) model + torch.save(m["model"], pytorch_filename) + if n in manifest.keys(): + files = list(manifest[n]) if type(manifest[n]) != list else manifest[n] + files.append(script_filename) + manifest.update({n: files}) + else: + manifest.update({n: [script_filename]}) + return manifest @@ -94,9 +67,25 @@ def download_models(version_matches, manifest): # Download all models if torch version is different than model version if not version_matches: for n, m in BENCHMARK_MODELS.items(): + # Ensure all specified desired model formats exist and are valid + assert all( + (path in VALID_PATHS) + for path in ( + m["path"] if isinstance(m["path"], (list, tuple)) else [m["path"]] + ) + ), "Not all 'path' attributes in BENCHMARK_MODELS are valid" + manifest = get(n, m, manifest) else: for n, m in BENCHMARK_MODELS.items(): + # Ensure all specified desired model formats exist and are valid + assert all( + (path in VALID_PATHS) + for path in ( + m["path"] if isinstance(m["path"], (list, tuple)) else [m["path"]] + ) + ), "Not all 'path' attributes in BENCHMARK_MODELS are valid" + scripted_filename = "models/" + n + "_scripted.jit.pt" traced_filename = "models/" + n + "_traced.jit.pt" pytorch_filename = "models/" + n + "_pytorch.pt" @@ -174,12 +163,4 @@ def main(): if __name__ == "__main__": - # Ensure all specified desired model formats exist and are valid - paths = [ - [m["path"]] if isinstance(m["path"], str) else m["path"] - for m in BENCHMARK_MODELS.values() - ] - assert all( - (path in VALID_PATHS) for path_list in paths for path in path_list - ), "Not all 'path' attributes in BENCHMARK_MODELS are valid" main() diff --git a/tools/perf/models/.gitkeep b/tools/perf/models/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 65729008aa..d4d57c6d3f 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -1,68 +1,48 @@ -from __future__ import print_function -from __future__ import absolute_import -from __future__ import division +from __future__ import absolute_import, division, print_function +# Config parsers and report generations +import argparse +import logging +import os import time import timeit import warnings -import numpy as np -import torch.backends.cudnn as cudnn -# Config parsers and report generations -import argparse -import yaml -import os +import numpy as np import pandas as pd +import 
tensorrt as trt # Importing supported Backends import torch -import torch_tensorrt as torchtrt - -# from torch_tensorrt.fx.lower import compile -# from torch_tensorrt.fx.utils import LowerPrecision - -import tensorrt as trt +import torch.backends.cudnn as cudnn from utils import ( - parse_inputs, + BENCHMARK_MODELS, parse_backends, - precision_to_dtype, + parse_inputs, parse_precisions, - BENCHMARK_MODELS, + precision_to_dtype, ) +import torch_tensorrt as torchtrt + WARMUP_ITER = 10 results = [] -# YAML Parser class for parsing the run configurations -class ConfigParser: - def __init__(self, config_file): - self.parser = None - self.config = config_file - self.params = None - - # Reads and loads the yaml file - def read_config(self): - with open(self.config, "r") as stream: - try: - self.params = yaml.safe_load(stream) - except yaml.YAMLError as exc: - print(exc) - return self.params - - # Retrieves the value from the configuration else uses default values - def get(self, key, default_value=None): - if not key in self.params: - if not default_value: - raise ValueError( - "Key {} is not present and default_value is not configured. Please run it with default value", - key, - ) - self.params[key] = default_value - return self.params[key] +def run_with_try_except(func): + def wrapper_func(*args, **kwargs): + try: + return func(*args, **kwargs) + except KeyboardInterrupt: + raise + except: + logging.warning(f"Running {func} failed", exc_info=True) + + return wrapper_func # Runs inference using Torch backend +@run_with_try_except def run_torch(model, input_tensors, params, precision, batch_size): print("Running Torch for precision: ", precision, " batch_size : ", batch_size) iters = params.get("iterations", 20) @@ -88,9 +68,8 @@ def run_torch(model, input_tensors, params, precision, batch_size): # Runs inference using Torch-TensorRT backend -def run_torch_tensorrt( - model, input_tensors, params, precision, truncate_long_and_double, batch_size -): +@run_with_try_except +def run_ts_trt(model, input_tensors, params, precision, batch_size): print( "Running Torch-TensorRT for precision: ", precision, @@ -101,16 +80,16 @@ def run_torch_tensorrt( compile_settings = { "inputs": input_tensors, "enabled_precisions": {precision_to_dtype(precision)}, - "truncate_long_and_double": truncate_long_and_double, + "truncate_long_and_double": params.get("truncate", False), } if precision == "int8": compile_settings.update({"calib": params.get("calibration_cache")}) start_compile = time.time_ns() - model = torchtrt.compile(model, **compile_settings) + model = torchtrt.compile(model, ir="ts", **compile_settings) end_compile = time.time_ns() - compile_time_ms = (end_compile - start_compile) / 1e6 + compile_time_s = (end_compile - start_compile) / 1e9 iters = params.get("iterations", 20) # Warm up @@ -130,27 +109,83 @@ def run_torch_tensorrt( meas_time = end_time - start_time timings.append(meas_time) - recordStats("Torch-TensorRT", timings, precision, batch_size, compile_time_ms) - + recordStats( + "Torch-TensorRT [Torchscript]", timings, precision, batch_size, compile_time_s + ) -# Runs inference using FX2TRT backend -def run_fx2trt(model, input_tensors, params, precision, batch_size): - print("Running FX2TRT for precision: ", precision, " batch_size : ", batch_size) - if precision == "fp16": - model.half() - input_tensors = [tensor.half() for tensor in input_tensors] - # Run lowering eager mode benchmark +@run_with_try_except +def run_dynamo(model, input_tensors, params, precision, batch_size): + """ + Compile the 
given model using Torch-TensorRT dynamo frontend and record performance stats + """ + print( + "Running Torch-TensorRT [dynamo] for precision: ", + precision, + " batch_size : ", + batch_size, + ) start_compile = time.time_ns() model = torchtrt.compile( model, - ir="fx", inputs=input_tensors, - enabled_precisions={torch.float16 if precision == "fp16" else torch.float32}, + ir="dynamo", + enabled_precisions={precision_to_dtype(precision)}, + min_block_size=params.get("min_block_size", 1), + debug=False, + truncate_long_and_double=params.get("truncate", False), ) end_compile = time.time_ns() - compile_time_ms = (end_compile - start_compile) / 1e6 + compile_time_s = (end_compile - start_compile) / 1e9 + iters = params.get("iterations", 20) + # Warm up + with torch.no_grad(): + for _ in range(WARMUP_ITER): + features = model(*input_tensors) + torch.cuda.synchronize() + + timings = [] + with torch.no_grad(): + for i in range(iters): + start_time = timeit.default_timer() + features = model(*input_tensors) + torch.cuda.synchronize() + end_time = timeit.default_timer() + meas_time = end_time - start_time + timings.append(meas_time) + + recordStats( + "Torch-TensorRT [Dynamo]", timings, precision, batch_size, compile_time_s + ) + + +@run_with_try_except +def run_torch_compile(model, input_tensors, params, precision, batch_size): + """ + Compile the given model using Torch-TensorRT torch.compile frontend and record performance stats + """ + torch._dynamo.reset() + + print( + "Running Torch-TensorRT [torch_compile] for precision: ", + precision, + " batch_size : ", + batch_size, + ) + compile_spec = { + "inputs": input_tensors, + "enabled_precisions": {precision_to_dtype(precision)}, + "truncate_long_and_double": params.get("truncate", False), + "min_block_size": params.get("min_block_size", 1), + } + start_compile = time.time_ns() + model = torch.compile( + model, backend="tensorrt", dynamic=False, options=compile_spec + ) + model(*input_tensors) + end_compile = time.time_ns() + compile_time_s = (end_compile - start_compile) / 1e9 iters = params.get("iterations", 20) # Warm up with torch.no_grad(): @@ -168,70 +203,63 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): end_time = timeit.default_timer() meas_time = end_time - start_time timings.append(meas_time) + # Reset torch dynamo cache + torch._dynamo.reset() - recordStats("FX-TensorRT", timings, precision, batch_size, compile_time_ms) + recordStats( + "Torch-TensorRT [torch_compile]", + timings, + precision, + batch_size, + compile_time_s, + ) -def run_dynamo(model, input_tensors, params, precision, batch_size): - dynamo_backend = params["dynamo_backend"] +@run_with_try_except +def run_inductor(model, input_tensors, params, precision, batch_size): + """ + Compile the given model using torch inductor and record performance stats + """ + torch._dynamo.reset() + print( - "Running Dynamo with backend: ", - dynamo_backend, - " for precision: ", + "Running Torch [inductor] for precision: ", precision, " batch_size : ", batch_size, ) - if precision == "fp16": - input_tensors = [tensor.half() for tensor in input_tensors] - - fp16_mode = True if precision == "fp16" else False - # dynamo_backend_params = {"fp16_mode" : fp16_mode} - # model = torch.compile( - # model, - # mode="default", - # dynamic=False, - # fullgraph=False, - # backend=dynamo_backend, - # # **dynamo_backend_params - # ) - import torch._dynamo as dynamo - - model = dynamo.optimize(dynamo_backend, nopython=True)(model) - # Compile and measure the time + start_compile = 
time.time_ns() + model = torch.compile(model, backend="inductor", dynamic=False, mode="max-autotune") + model(*input_tensors) + end_compile = time.time_ns() + compile_time_s = (end_compile - start_compile) / 1e9 + iters = params.get("iterations", 20) + # Warm up with torch.no_grad(): - start_compile = time.time_ns() - features = model(*input_tensors) - end_compile = time.time_ns() - compile_time_ms = (end_compile - start_compile) / 1e6 - iters = params.get("iterations", 20) - # import pdb; pdb.set_trace() - print("============= DONE 0 ==================") - - print("============= DONE 1 ==================") - # Warm up - model = torch._dynamo.run(model) - # import pdb; pdb.set_trace() - - exported_model, _ = torch._dynamo.export(model, *input_tensors) - for i in range(WARMUP_ITER): - print("==== ITER: ", i) - features = exported_model(*input_tensors) + for _ in range(WARMUP_ITER): + features = model(*input_tensors) - torch.cuda.synchronize() - print("============= DONE 2 ==================") - timings = [] + torch.cuda.synchronize() + + timings = [] + with torch.no_grad(): for i in range(iters): start_time = timeit.default_timer() - features = exported_model(*input_tensors) + features = model(*input_tensors) torch.cuda.synchronize() end_time = timeit.default_timer() meas_time = end_time - start_time timings.append(meas_time) + # Reset torch dynamo cache + torch._dynamo.reset() recordStats( - "Dynamo-" + dynamo_backend, timings, precision, batch_size, compile_time_ms + "Torch [inductor]", + timings, + precision, + batch_size, + compile_time_s, ) @@ -259,12 +287,12 @@ def torch_device_from_trt(device): return TypeError("%s is not supported by torch" % device) +@run_with_try_except def run_tensorrt( model, input_tensors, params, precision, - truncate_long_and_double=False, is_trt_engine=False, batch_size=1, ): @@ -276,7 +304,7 @@ def run_tensorrt( compile_settings = { "inputs": input_tensors, "enabled_precisions": {precision_to_dtype(precision)}, - "truncate_long_and_double": truncate_long_and_double, + "truncate_long_and_double": params.get("truncate", False), } print("Converting method to TensorRT engine...") @@ -332,7 +360,6 @@ def run( input_tensors, params, precision, - truncate_long_and_double=False, batch_size=1, is_trt_engine=False, model_torch=None, @@ -347,13 +374,13 @@ def run( if ( backend == "all" - or backend == "torch_tensorrt" + or backend == "ts_trt" or params.get("calibration_cache", None) == None ): print("int8 precision expects calibration cache file for inference") return False - if (model is None) and (backend != "fx2trt"): + if (model is None) and (backend in ("tensorrt", "ts_trt", "all")): warnings.warn( f"Requested backend {backend} without specifying a TorchScript Model, " + "skipping this backend" @@ -369,12 +396,11 @@ def run( if backend == "all": run_torch(model, input_tensors, params, precision, batch_size) - run_torch_tensorrt( + run_ts_trt( model, input_tensors, params, precision, - truncate_long_and_double, batch_size, ) run_tensorrt( @@ -382,65 +408,46 @@ def run( input_tensors, params, precision, - truncate_long_and_double, is_trt_engine, batch_size, ) - run_fx2trt(model_torch, input_tensors, params, precision, batch_size) run_dynamo(model_torch, input_tensors, params, precision, batch_size) - elif backend == "torchscript": - run_torch(model, input_tensors, params, precision, batch_size) - run_torch_tensorrt( - model, - input_tensors, - params, - precision, - truncate_long_and_double, - batch_size, - ) - run_tensorrt( - model, - input_tensors, - params, - 
precision,
-            truncate_long_and_double,
-            is_trt_engine,
-            batch_size,
-        )
-
     elif backend == "torch":
-        run_torch(model, input_tensors, params, precision, batch_size)
+        run_torch(model_torch, input_tensors, params, precision, batch_size)

-    elif backend == "torch_tensorrt":
-        run_torch_tensorrt(
+    elif backend == "ts_trt":
+        run_ts_trt(
             model,
             input_tensors,
             params,
             precision,
-            truncate_long_and_double,
             batch_size,
         )
-
-    elif backend == "fx2trt":
-        run_fx2trt(model_torch, input_tensors, params, precision, batch_size)
-
     elif backend == "tensorrt":
         run_tensorrt(
             model,
             input_tensors,
             params,
             precision,
-            truncate_long_and_double,
             is_trt_engine,
             batch_size,
         )
     elif backend == "dynamo":
         run_dynamo(model_torch, input_tensors, params, precision, batch_size)

+    elif backend == "torch_compile":
+        run_torch_compile(model_torch, input_tensors, params, precision, batch_size)
+
+    elif backend == "inductor":
+        run_inductor(model_torch, input_tensors, params, precision, batch_size)
+

 # Generate report
-def recordStats(backend, timings, precision, batch_size=1, compile_time_ms=None):
+def recordStats(backend, timings, precision, batch_size=1, compile_time_s=None):
     times = np.array(timings)
     steps = len(times)
     speeds = batch_size / times
@@ -459,72 +466,21 @@ def recordStats(backend, timings, precision, batch_size=1, compile_time_ms=None)
         "Mean(FPS)": speed_mean,
         "Median-Latency(ms)": time_med * 1000,
         "Mean-Latency(ms)": time_mean * 1000,
-        "Compile Time(ms)": compile_time_ms,
+        "Latency-StdDev(ms)": time_std * 1000,
+        "Compile Time(s)": compile_time_s,
     }
     results.append(stats)


-def load_ts_model(params):
-    model = None
-    is_trt_engine = False
-
-    # No TorchScript Model Specified
-    if len(params.get("model", "")) == 0:
-        return None, None, is_trt_engine
-
-    # Load torch model traced/scripted
-    model_file = params.get("model").get("filename")
-    try:
-        model_name = params.get("model").get("name")
-    except:
-        model_name = model_file
-
-    print("Loading model: ", model_file)
-    if model_file.endswith(".plan"):
-        is_trt_engine = True
-        # Read the TensorRT engine file
-        with open(model_file, "rb") as fin:
-            model = fin.read()
-    else:
-        model = torch.jit.load(model_file).cuda()
-
-    return model, model_name, is_trt_engine
-
-
-def load_torch_model(params):
-    model = None
-
-    # No Torch Model Specified
-    if len(params.get("model_torch", "")) == 0:
-        return None, None
-
-    # Load torch model
-    model_file = params.get("model_torch").get("filename")
-    try:
-        model_name = params.get("model_torch").get("name")
-    except:
-        model_name = model_file
-
-    print("Loading Torch model: ", model_file)
-    model = torch.load(model_file).cuda()
-
-    return model, model_name
-
-
 if __name__ == "__main__":
     arg_parser = argparse.ArgumentParser(
         description="Run inference on a model with random input values"
     )
-    arg_parser.add_argument(
-        "--config",
-        type=str,
-        help="Load YAML based configuration file to run the inference. If this is used other params will be ignored",
-    )
     # The following options are manual user provided settings
     arg_parser.add_argument(
         "--backends",
         type=str,
-        help="Comma separated string of backends. Eg: torch,torch_tensorrt,fx2trt,tensorrt",
+        help="Comma separated string of backends. Eg: torch, ts_trt, dynamo, torch_compile, inductor, tensorrt",
     )
     arg_parser.add_argument(
         "--model", type=str, default="", help="Name of torchscript model file"
     )
@@ -533,7 +489,7 @@ def load_torch_model(params):
         "--model_torch",
         type=str,
         default="",
-        help="Name of torch model file (used for fx2trt)",
+        help="Name of torch model file",
     )
     arg_parser.add_argument(
         "--inputs",
@@ -553,22 +509,17 @@ def load_torch_model(params):
         "--calibration_cache", type=str, help="Name of the calibration cache file"
     )
     arg_parser.add_argument("--device", type=int, help="device id")
+    arg_parser.add_argument("--min_block_size", type=int, default=1, help="minimum block size")
     arg_parser.add_argument(
         "--truncate",
         action="store_true",
-        help="Truncate long and double weights in the network in Torch-TensorRT",
+        help="Truncate long and double weights in the network in Torch-TensorRT",
     )
     arg_parser.add_argument(
         "--is_trt_engine",
         action="store_true",
         help="Boolean flag to determine if the user provided model is a TRT engine or not",
     )
-    arg_parser.add_argument(
-        "--dynamo_backend",
-        type=str,
-        default="fx2trt",
-        help="List of backends to use in Torchdynamo. Select options: inductor|fx2trt",
-    )
     arg_parser.add_argument(
         "--report",
         type=str,
@@ -580,128 +531,66 @@ def load_torch_model(params):
     # Create random input tensor of certain size
     torch.manual_seed(12345)
     model_name = "Model"
-    if args.config:
-        parser = ConfigParser(args.config)
-        # Load YAML params
-        params = parser.read_config()
-        model, model_name, is_trt_engine = load_ts_model(params)
-        model_torch, model_name_torch = load_torch_model(params)
-
-        # If neither model type was provided
-        if (model is None) and (model_torch is None):
-            raise ValueError(
-                "No valid models specified. Please provide a torchscript model file or model name "
-                + "(among the following options vgg16|resnet50|efficientnet_b0|vit) "
-                + "or provide a torch model file"
-            )
-
-        # Default device is set to 0. Configurable using yaml config file.
-        torch.cuda.set_device(params.get("runtime").get("device", 0))
+    params = vars(args)
+    model_name = params["model"]
+    model = None

-        num_input = params.get("input").get("num_inputs")
-        truncate_long_and_double = params.get("runtime").get(
-            "truncate_long_and_double", False
+    model_name_torch = params["model_torch"]
+    model_torch = None
+
+    # Load TorchScript model, if provided
+    if os.path.exists(model_name):
+        print("Loading user provided torchscript model: ", model_name)
+        model = torch.jit.load(model_name).cuda().eval()
+    elif model_name in BENCHMARK_MODELS:
+        print("Loading torchscript model from BENCHMARK_MODELS for: ", model_name)
+        model = BENCHMARK_MODELS[model_name]["model"].eval().cuda()
+
+    # Load PyTorch Model, if provided
+    if len(model_name_torch) > 0 and os.path.exists(model_name_torch):
+        print("Loading user provided torch model: ", model_name_torch)
+        model_torch = torch.load(model_name_torch).eval().cuda()
+    elif model_name_torch in BENCHMARK_MODELS:
+        model_torch = BENCHMARK_MODELS[model_name_torch]["model"].eval().cuda()
+
+    # If neither model type was provided
+    if (model is None) and (model_torch is None):
+        raise ValueError(
+            "No valid models specified. 
Please provide a torchscript model file or model name " + + "(among the following options vgg16|resnet50|efficientnet_b0|vit) " + + "or provide a torch model file" ) - batch_size = params.get("input").get("batch_size", 1) - for precision in params.get("runtime").get("precision", "fp32"): - input_tensors = [] - num_input = params.get("input").get("num_inputs", 1) - for i in range(num_input): - inp_tensor = params.get("input").get("input" + str(i)) - input_tensors.append( - torch.randint( - 0, - 2, - tuple(d for d in inp_tensor), - dtype=precision_to_dtype(precision), - ).cuda() - ) - - if is_trt_engine: - print( - "Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results" - ) - - if not is_trt_engine and (precision == "fp16" or precision == "half"): - # If model is TensorRT serialized engine then model.half will report failure - if model is not None: - model = model.half() - if model_torch is not None: - model_torch = model_torch.half() - backends = params.get("backend") - # Run inference - status = run( - model, - backends, - input_tensors, - params, - precision, - truncate_long_and_double, - batch_size, - is_trt_engine, - model_torch, - ) - else: - params = vars(args) - model_name = params["model"] - model = None - - model_name_torch = params["model_torch"] - model_torch = None - - # Load TorchScript model, if provided - if os.path.exists(model_name): - print("Loading user provided torchscript model: ", model_name) - model = torch.jit.load(model_name).cuda().eval() - elif model_name in BENCHMARK_MODELS: - print("Loading torchscript model from BENCHMARK_MODELS for: ", model_name) - model = BENCHMARK_MODELS[model_name]["model"].eval().cuda() - - # Load PyTorch Model, if provided - if len(model_name_torch) > 0 and os.path.exists(model_name_torch): - print("Loading user provided torch model: ", model_name_torch) - model_torch = torch.load(model_name_torch).eval().cuda() - - # If neither model type was provided - if (model is None) and (model_torch is None): - raise ValueError( - "No valid models specified. Please provide a torchscript model file or model name " - + "(among the following options vgg16|resnet50|efficientnet_b0|vit) " - + "or provide a torch model file" - ) - - backends = parse_backends(params["backends"]) - if "dynamo" in backends and (model_torch is None): - raise ValueError( - "No Pytorch model (nn.Module) is provided for torchdynamo compilation. Please provide a pytorch model using --model_torch argument" - ) + backends = parse_backends(params["backends"]) + if ("dynamo" in backends or "torch_compile" in backends) and (model_torch is None): + raise ValueError( + "No Pytorch model (nn.Module) is provided for torchdynamo compilation. 
Please provide a pytorch model using --model_torch argument" + ) - truncate_long_and_double = params["truncate"] - batch_size = params["batch_size"] - is_trt_engine = params["is_trt_engine"] - precisions = parse_precisions(params["precision"]) + batch_size = params["batch_size"] + is_trt_engine = params["is_trt_engine"] + precisions = parse_precisions(params["precision"]) - for precision in precisions: - input_tensors = parse_inputs( - params["inputs"], precision_to_dtype(precision) - ) + for precision in precisions: + input_tensors = parse_inputs(params["inputs"], precision_to_dtype(precision)) - if not is_trt_engine and (precision == "fp16" or precision == "half"): - # If model is TensorRT serialized engine then model.half will report failure + if not is_trt_engine and (precision == "fp16" or precision == "half"): + # If model is TensorRT serialized engine then model.half will report failure + if model is not None: model = model.half() - - status = run( - model, - backends, - input_tensors, - params, - precision, - truncate_long_and_double, - batch_size, - is_trt_engine, - model_torch=model_torch, - ) + if model_torch is not None: + model_torch = model_torch.half() + + status = run( + model, + backends, + input_tensors, + params, + precision, + batch_size, + is_trt_engine, + model_torch=model_torch, + ) # Generate report print("Model Summary: ", model_name) diff --git a/tools/perf/requirements.txt b/tools/perf/requirements.txt index f9f8813feb..d204d3c335 100644 --- a/tools/perf/requirements.txt +++ b/tools/perf/requirements.txt @@ -1,5 +1,7 @@ -timeit numpy argparse -yaml +pyyaml +transformers==4.33.2 +diffusers==0.21.4 pandas==2.0.1 +timm==0.9.8 diff --git a/tools/perf/utils.py b/tools/perf/utils.py index 96a13ffbc2..41f49439a2 100644 --- a/tools/perf/utils.py +++ b/tools/perf/utils.py @@ -1,30 +1,83 @@ -import torch -import torch_tensorrt +from typing import Optional, Sequence, Union + import custom_models as cm -import torchvision.models as models import timm +import torch +import torchvision.models as models + +import torch_tensorrt -BENCHMARK_MODELS = { - "vgg16": { - "model": models.vgg16(weights=models.VGG16_Weights.DEFAULT), - "path": ["script", "pytorch"], - }, - "resnet50": { - "model": models.resnet50(weights=None), - "path": ["script", "pytorch"], - }, - "efficientnet_b0": { - "model": timm.create_model("efficientnet_b0", pretrained=True), - "path": ["script", "pytorch"], - }, - "vit": { - "model": timm.create_model("vit_base_patch16_224", pretrained=True), - "path": "script", - }, - "bert_base_uncased": {"model": cm.BertModule(), "path": "trace"}, +BENCHMARK_MODEL_NAMES = { + "vgg16", + "alexnet", + "resnet50", + "efficientnet_b0", + "vit", + "vit_large", + "bert_base_uncased", + "sd_unet", } +class ModelStorage: + def __contains__(self, name: str): + return name in BENCHMARK_MODEL_NAMES + + def __getitem__(self, name: str): + assert name in BENCHMARK_MODEL_NAMES + + if name == "vgg16": + return { + "model": models.vgg16(weights=models.VGG16_Weights.DEFAULT), + "path": ["script", "pytorch"], + } + elif name == "alexnet": + return { + "model": models.alexnet(weights=models.AlexNet_Weights.DEFAULT), + "path": ["script", "pytorch"], + } + elif name == "resnet50": + return { + "model": models.resnet50(weights=None), + "path": ["script", "pytorch"], + } + elif name == "efficientnet_b0": + return { + "model": timm.create_model("efficientnet_b0", pretrained=True), + "path": ["script", "pytorch"], + } + elif name == "vit": + return { + "model": 
timm.create_model("vit_base_patch16_224", pretrained=True), + "path": ["script", "pytorch"], + } + elif name == "vit_large": + return { + "model": timm.create_model("vit_giant_patch14_224", pretrained=False), + "path": ["script", "pytorch"], + } + elif name == "bert_base_uncased": + return { + "model": cm.BertModule(), + "inputs": cm.BertInputs(), + "path": ["trace", "pytorch"], + } + elif name == "sd_unet": + return { + "model": cm.StableDiffusionUnet(), + "path": "pytorch", + } + else: + raise AssertionError(f"Invalid model name {name}") + + def items(self): + for name in BENCHMARK_MODEL_NAMES: + yield name, self.__getitem__(name) + + +BENCHMARK_MODELS = ModelStorage() + + def precision_to_dtype(pr): if pr == "fp32": return torch.float @@ -51,7 +104,16 @@ def parse_inputs(user_inputs, dtype): ) for input_dim in input_shape_and_dtype[0][1:-1].split(","): input_shape.append(int(input_dim)) - torchtrt_inputs.append(torch.randint(0, 5, input_shape, dtype=dtype).cuda()) + + if input_shape != [1]: + if dtype == torch.int32: + torchtrt_inputs.append( + torch.randint(0, 5, input_shape, dtype=dtype).cuda() + ) + else: + torchtrt_inputs.append(torch.randn(input_shape, dtype=dtype).cuda()) + else: + torchtrt_inputs.append(torch.Tensor([1.0]).cuda()) return torchtrt_inputs