"""
Dynamo Compile Transformers Example
===================================

This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile`
workflow on a transformer-based model.
"""

# %%
# Imports and Model Definition
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

import torch
import torch_tensorrt
from transformers import BertModel

# %%

# Initialize model with float precision and sample inputs
model = BertModel.from_pretrained("bert-base-uncased").eval().to("cuda")
inputs = [
    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
]

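# The two sample inputs correspond to BertModel's first positional arguments
# (input_ids, attention_mask). A minimal sketch of building real inputs with a
# tokenizer instead (an illustrative addition, not part of the original example):
#
#     from transformers import BertTokenizer
#
#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     encoded = tokenizer("Paris is the capital of France.", return_tensors="pt")
#     inputs = [
#         encoded["input_ids"].to(torch.int32).to("cuda"),
#         encoded["attention_mask"].to(torch.int32).to("cuda"),
#     ]
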
# %%
# Optional Input Arguments to `torch_tensorrt.dynamo.compile`
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Enabled precisions for TensorRT optimization
enabled_precisions = {torch.float}

# Whether to print verbose logs
debug = True

# Workspace size for TensorRT (20 << 30 bytes == 20 GiB)
workspace_size = 20 << 30

# Minimum number of operators per TRT engine block
# (Lower values allow more graph segmentation)
min_block_size = 3

# Operations to run in Torch, regardless of converter support
torch_executed_ops = set()
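
# For example (an illustrative assumption, not a setting used by this script),
# specific ATen operations could be forced to run in Torch by name:
#
#     torch_executed_ops = {"torch.ops.aten.sub.Tensor"}
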
# %%
# Compilation with `torch_tensorrt.dynamo.compile`
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Build and compile the model with torch_tensorrt.dynamo.compile
optimized_model = torch_tensorrt.dynamo.compile(
    model,
    inputs,
    enabled_precisions=enabled_precisions,
    debug=debug,
    workspace_size=workspace_size,
    min_block_size=min_block_size,
    torch_executed_ops=torch_executed_ops,
)

# %%
# Equivalently, we could have run the above via the convenience frontend, like so:
# `torch_tensorrt.compile(model, ir="dynamo_compile", inputs=inputs, ...)`
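# Spelled out (a sketch reusing the arguments defined above; the trailing
# options are assumed to be forwarded to the dynamo_compile frontend):
#
#     optimized_model = torch_tensorrt.compile(
#         model,
#         ir="dynamo_compile",
#         inputs=inputs,
#         enabled_precisions=enabled_precisions,
#         debug=debug,
#         workspace_size=workspace_size,
#         min_block_size=min_block_size,
#         torch_executed_ops=torch_executed_ops,
#     )
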
# %%
# Inference
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Does not cause recompilation (same batch size as input)
new_inputs = [
    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
]
new_outputs = optimized_model(*new_inputs)

# %%

# Does cause recompilation (new batch size)
new_inputs = [
    torch.randint(0, 2, (4, 14), dtype=torch.int32).to("cuda"),
    torch.randint(0, 2, (4, 14), dtype=torch.int32).to("cuda"),
]
new_outputs = optimized_model(*new_inputs)

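# As an optional timing check (an illustrative addition, not part of the
# original workflow), CUDA events can measure a single optimized-model call:
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
new_outputs = optimized_model(*new_inputs)
end.record()
torch.cuda.synchronize()
print(f"Optimized model inference time: {start.elapsed_time(end):.2f} ms")
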
# %%
# Cleanup
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Finally, we use Torch utilities to clean up the workspace
torch._dynamo.reset()

with torch.no_grad():
    torch.cuda.empty_cache()