chore: example fixes #3176

Open · wants to merge 10 commits into main
33 changes: 33 additions & 0 deletions docker/Dockerfile.lab
@@ -0,0 +1,33 @@
# syntax=docker/dockerfile:1

# Base image starts with CUDA
ARG BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04
FROM ${BASE_IMG} as base
ENV BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04

ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}

ARG USE_CXX11_ABI
ENV USE_CXX11=${USE_CXX11_ABI}
ENV DEBIAN_FRONTEND=noninteractive

# Install basic dependencies
RUN apt-get update
RUN apt-get install -y vim build-essential manpages-dev wget zlib1g software-properties-common git libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8

# Install PyEnv and desired Python version
ENV HOME="/root"
ENV PYENV_DIR="$HOME/.pyenv"
ENV PATH="$PYENV_DIR/shims:$PYENV_DIR/bin:$PATH"
RUN wget -q https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer &&\
    chmod 755 pyenv-installer &&\
    bash pyenv-installer &&\
    eval "$(pyenv init -)"

RUN pyenv install -v ${PYTHON_VERSION}
RUN pyenv global ${PYTHON_VERSION}

# Setup Bazel via Bazelisk
RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 -O /usr/bin/bazel &&\
    chmod a+x /usr/bin/bazel
2 changes: 1 addition & 1 deletion examples/dynamo/requirements.txt
@@ -1,4 +1,4 @@
cupy==13.1.0
triton==2.3.0
diffusers==0.30.3
transformers==4.44.2
transformers==4.44.2
100 changes: 100 additions & 0 deletions examples/dynamo/torch_compile_gpt2.py
@@ -0,0 +1,100 @@
"""
.. _torch_compile_gpt2:

Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend
======================================================================

This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model."""

# %%
# Imports and Model Definition
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
import torch
import torch_tensorrt
from transformers import AutoModelForCausalLM, AutoTokenizer

# %%

# Define the parameters
MAX_TOKENS = 32
DEVICE = torch.device("cuda:0")

# Define the GPT2 model from Hugging Face.
# kv_cache is not supported in Torch-TRT currently, so use_cache=False is set.
with torch.no_grad():
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = (
        AutoModelForCausalLM.from_pretrained(
            "gpt2",
            pad_token_id=tokenizer.eos_token_id,
            use_cache=False,
            attn_implementation="eager",
        )
        .eval()
        .cuda()
    )

# %%
# Tokenize a sample input prompt and get pytorch model outputs
prompt = "I enjoy walking with my cute dog"
model_inputs = tokenizer(prompt, return_tensors="pt")
input_ids = model_inputs["input_ids"].cuda()

# Auto-regressive generation loop for greedy search using PyTorch model.
pyt_gen_tokens = model.generate(
    input_ids,
    max_length=MAX_TOKENS,
    use_cache=False,
    pad_token_id=tokenizer.eos_token_id,
)

# %%
# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Compile the model and mark the input sequence length to be dynamic
torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023)
model.forward = torch.compile(
    model.forward,
    backend="tensorrt",
    dynamic=None,
    options={
        "enabled_precisions": {torch.float32},
        "disable_tf32": True,
        "min_block_size": 1,
        "debug": True,
    },
)

# Auto-regressive generation loop for greedy decoding using TensorRT model
# The first token generation compiles the model using TensorRT and the second token
# encounters recompilation
trt_gen_tokens = model.generate(
    inputs=input_ids,
    max_length=MAX_TOKENS,
    use_cache=False,
    pad_token_id=tokenizer.eos_token_id,
)

# %%
# Decode the output sentences of PyTorch and TensorRT
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

print("=============================")
print(
    "Pytorch model generated text: ",
    tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),
)
print("=============================")
print(
    "TensorRT model generated text: ",
    tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),
)

# %%
# The output sentences should look like

# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll
# =============================
# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll
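
Beyond eyeballing the printed sentences, parity can be checked at the token level. The following is a minimal sketch, assuming the pyt_gen_tokens and trt_gen_tokens tensors from the script above are still in scope; it is not part of the example itself.

# Sketch: token-level comparison of the PyTorch and TensorRT generations.
min_len = min(pyt_gen_tokens.shape[1], trt_gen_tokens.shape[1])
if torch.equal(pyt_gen_tokens[0, :min_len].cpu(), trt_gen_tokens[0, :min_len].cpu()):
    print("PyTorch and TensorRT greedy generations match token-for-token.")
else:
    # Greedy decoding is deterministic, but small logit differences can flip an argmax.
    diff = (pyt_gen_tokens[0, :min_len].cpu() != trt_gen_tokens[0, :min_len].cpu()).nonzero()
    print(f"Generations diverge at token index {diff[0].item()}")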
89 changes: 89 additions & 0 deletions examples/dynamo/torch_compile_llama2.py
@@ -0,0 +1,89 @@
"""
.. _torch_compile_gpt2:

Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend
==========================================================

This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model."""

# %%
# Imports and Model Definition
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
import torch
import torch_tensorrt
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import generate

# %%

# Define the parameters
MAX_TOKENS = 32
DEVICE = torch.device("cuda:0")

# Define the Llama2 model from Hugging Face.
# kv_cache is not supported in Torch-TRT currently.
# The model is kept on CPU here so that GPU memory is reserved for TRT compilation.
llama_path = "meta-llama/Llama-2-7b-chat-hf"
with torch.no_grad():
    model = AutoModelForCausalLM.from_pretrained(
        llama_path, use_cache=False, attn_implementation="eager"
    ).eval()

tokenizer = AutoTokenizer.from_pretrained(llama_path)

# %%
# Tokenize a sample input prompt and get pytorch model outputs
prompt = "I enjoy walking with my cute dog"
model_inputs = tokenizer(prompt, return_tensors="pt")
input_ids = model_inputs["input_ids"].cuda()

# Auto-regressive generation loop for greedy search using the PyTorch model.
# We use a custom generate function which is very similar to the huggingface one.
# The eager model lives on CPU, so the CPU copy of the input ids is used here.
pyt_gen_tokens = generate(model, model_inputs["input_ids"], MAX_TOKENS, tokenizer.eos_token_id)

# %%
# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Compile the model and mark the input sequence length to be dynamic
with torch_tensorrt.logging.debug():
    torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023)
    model.forward = torch.compile(
        model.forward,
        backend="tensorrt",
        dynamic=None,
        options={
            "enabled_precisions": {torch.float32},
            "disable_tf32": True,
            "debug": True,
            # "use_python_runtime": True
        },
    )

# Auto-regressive generation loop for greedy decoding using TensorRT model
# We use a custom generate function which is very similar to the huggingface one.
# Move inputs to GPU
input_ids = input_ids.to(DEVICE)
trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)

# %%
# Decode the output sentences of PyTorch and TensorRT
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

print("=============================")
print(
    "Pytorch model generated text: ",
    tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),
)
print("=============================")
print(
    "TensorRT model generated text: ",
    tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),
)

# %%
# The output sentences should look like
#
#
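
To make the compilation cost visible when experimenting with this example, a rough timing sketch can help. It assumes the compiled model, input_ids, MAX_TOKENS, tokenizer, and the generate helper from the script above; numbers will vary by GPU and engine settings.

import time

def timed_generate(label):
    torch.cuda.synchronize()
    start = time.perf_counter()
    tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
    torch.cuda.synchronize()
    print(f"{label}: {time.perf_counter() - start:.1f} s")
    return tokens

# The first call includes TensorRT engine build time; the second should mostly
# reuse the compiled engine (modulo recompilation for new sequence lengths).
_ = timed_generate("first TensorRT generation")
_ = timed_generate("second TensorRT generation")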
9 changes: 8 additions & 1 deletion examples/dynamo/utils.py
@@ -51,7 +51,14 @@ def generate(model, input_seq, max_tokens, eos_token_id):
)

    while True:
        outputs = model(input_seq)
        outputs = model(
            input_seq,
            past_key_values=None,
            position_ids=None,
            attention_mask=None,
            use_cache=False,
            token_type_ids=None,
        )
        logits = outputs.logits
        next_token_logits = logits[:, -1, :]
        next_tokens = torch.argmax(next_token_logits, dim=-1)
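
For readers who do not have the full examples/dynamo/utils.py open, a hedged sketch of the greedy-decoding helper this hunk modifies is shown below; the real helper may differ in its stopping logic and bookkeeping.

import torch

def generate(model, input_seq, max_tokens, eos_token_id):
    # Greedy auto-regressive decoding without a KV cache (sketch).
    while True:
        outputs = model(
            input_seq,
            past_key_values=None,
            position_ids=None,
            attention_mask=None,
            use_cache=False,
            token_type_ids=None,
        )
        logits = outputs.logits
        next_token_logits = logits[:, -1, :]
        next_tokens = torch.argmax(next_token_logits, dim=-1)
        input_seq = torch.cat([input_seq, next_tokens[:, None]], dim=-1)
        # Stop once the token budget is reached or every sequence emitted EOS.
        if input_seq.shape[1] >= max_tokens or (next_tokens == eos_token_id).all():
            break
    return input_seq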
2 changes: 1 addition & 1 deletion py/requirements.txt
@@ -3,6 +3,6 @@ packaging
pybind11==2.6.2
--extra-index-url https://download.pytorch.org/whl/nightly/cu124
torch>=2.6.0.dev,<2.7.0
torchvision>=0.20.0.dev,<0.21.0
#torchvision>=0.20.0.dev,<0.21.0
--extra-index-url https://pypi.ngc.nvidia.com
pyyaml
1 change: 1 addition & 0 deletions py/torch_tensorrt/dynamo/_compiler.py
@@ -311,6 +311,7 @@ def compile(
trt_gm = compile_module(
gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache
)

return trt_gm


5 changes: 3 additions & 2 deletions py/torch_tensorrt/dynamo/backend/backends.py
@@ -80,7 +80,8 @@ def _pretraced_backend(
repair_input_aliasing(gm, settings)

# Remove sym_int placeholders and inputs
remove_sym_nodes(gm, settings)
remove_sym_nodes(gm, sample_inputs, settings)

torch_inputs = [
input for input in sample_inputs if isinstance(input, torch.Tensor)
]
@@ -91,7 +92,7 @@
# Invoke AOTAutograd to translate operators to aten
gm = aot_export_joint_simple(
gm,
torch_inputs,
sample_inputs,
trace_joint=False,
decompositions=get_decompositions(
settings.enable_experimental_decompositions
3 changes: 2 additions & 1 deletion py/torch_tensorrt/dynamo/lowering/_decompositions.py
@@ -3,7 +3,8 @@
from typing import Any, Callable, Dict, List, Optional

import torch
from torch._decomp import _decomp_table_to_post_autograd_aten, register_decomposition
from torch._decomp import register_decomposition
from torch._export.utils import _decomp_table_to_post_autograd_aten
from torch._ops import OpOverload
from torch_tensorrt.dynamo._defaults import default_device
from torch_tensorrt.dynamo.conversion.converter_utils import get_positive_dim
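
Since _decomp_table_to_post_autograd_aten is a private helper whose home module has moved between torch nightlies, a tolerant import is one way to keep the lowering code working on either side of the move. A hedged sketch, assuming these are the only two historical locations:

try:
    # Newer torch nightlies expose the helper here.
    from torch._export.utils import _decomp_table_to_post_autograd_aten
except ImportError:
    # Older nightlies kept it in torch._decomp.
    from torch._decomp import _decomp_table_to_post_autograd_aten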
14 changes: 9 additions & 5 deletions py/torch_tensorrt/dynamo/lowering/passes/remove_sym_nodes.py
@@ -1,4 +1,5 @@
import logging
from typing import Any, Sequence

import torch
from torch_tensorrt.dynamo._settings import CompilationSettings
@@ -7,15 +8,17 @@


def remove_sym_nodes(
gm: torch.fx.GraphModule, settings: CompilationSettings
gm: torch.fx.GraphModule,
sample_inputs: Sequence[Any],
settings: CompilationSettings,
) -> torch.fx.GraphModule:
"""Remove sym_int placeholders which get inserted due to torch.compile's
dynamic=True behavior
"""
# Extract SymInt placeholder nodes and their positional indices
placeholder_sym_ints = [
node
for node in gm.graph.nodes
placeholder_idx_sym_ints = [
(idx, node)
for idx, node in enumerate(gm.graph.nodes)
if (
node.op == "placeholder"
and isinstance(node.type, type)
@@ -24,8 +27,9 @@ def remove_sym_nodes(
)
]

for node in placeholder_sym_ints:
for idx, node in reversed(placeholder_idx_sym_ints):  # reverse so earlier indices stay valid after each pop
gm.graph.erase_node(node)
sample_inputs.pop(idx)

gm.graph.lint()
gm.recompile()
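
Read together with the backends.py hunk above, the intended call pattern is roughly the sketch below: the pass erases the SymInt placeholders from the graph and pops the matching entries out of sample_inputs, so the pruned inputs stay index-aligned with the remaining placeholders when handed to AOTAutograd. Import paths here are an assumption based on the file layout in this diff, and the real _pretraced_backend wraps these calls in additional handling.

from torch._functorch.aot_autograd import aot_export_joint_simple
from torch_tensorrt.dynamo.lowering import get_decompositions
from torch_tensorrt.dynamo.lowering.passes.remove_sym_nodes import remove_sym_nodes

# gm, sample_inputs, and settings come from the torch.compile backend entry point.
remove_sym_nodes(gm, sample_inputs, settings)  # mutates gm and sample_inputs in place

gm = aot_export_joint_simple(
    gm,
    sample_inputs,  # only torch.Tensor inputs remain at this point
    trace_joint=False,
    decompositions=get_decompositions(settings.enable_experimental_decompositions),
)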
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -8,7 +8,7 @@ requires = [
"cffi>=1.15.1",
"typing-extensions>=4.7.0",
"future>=0.18.3",
"tensorrt-cu12==10.3.0",
#"tensorrt-cu12==10.3.0",
"torch>=2.6.0.dev,<2.7.0",
"pybind11==2.6.2",
"numpy",
@@ -55,9 +55,9 @@ keywords = [
]
dependencies = [
"torch>=2.6.0.dev,<2.7.0",
"tensorrt-cu12==10.3.0",
"tensorrt-cu12-bindings==10.3.0",
"tensorrt-cu12-libs==10.3.0",
#"tensorrt-cu12==10.3.0",
#"tensorrt-cu12-bindings==10.3.0",
#"tensorrt-cu12-libs==10.3.0",
"packaging>=23",
"numpy",
"typing-extensions>=4.7.0",
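
With the tensorrt-cu12 pins commented out here, installing the wheel no longer pulls TensorRT in automatically. A small hedged check like the following could be dropped into the examples to fail fast with a clearer message; which tensorrt-cu12 build to install depends on the CUDA toolkit being targeted.

try:
    import tensorrt as trt
    print(f"Found TensorRT {trt.__version__}")
except ImportError as exc:
    raise SystemExit(
        "TensorRT Python bindings were not found; install a tensorrt-cu12 "
        "build that matches your CUDA toolkit before running the examples."
    ) from exc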