From 231bf2c8eaf5590d391c0e0e2fd08c8e11888d23 Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Mon, 29 Sep 2025 13:15:39 -0700 Subject: [PATCH 1/5] test windows access violation issue --- .../build-test-linux-aarch64-jetpack.yml | 2 +- .../workflows/build-test-linux-aarch64.yml | 2 +- .github/workflows/build-test-linux-x86_64.yml | 2 +- .../workflows/build-test-linux-x86_64_rtx.yml | 2 +- .github/workflows/build-test-windows.yml | 444 ++++++++++-------- .github/workflows/build-test-windows_rtx.yml | 346 +++++++------- 6 files changed, 417 insertions(+), 381 deletions(-) diff --git a/.github/workflows/build-test-linux-aarch64-jetpack.yml b/.github/workflows/build-test-linux-aarch64-jetpack.yml index ff827ad3f6..0c92e207bb 100644 --- a/.github/workflows/build-test-linux-aarch64-jetpack.yml +++ b/.github/workflows/build-test-linux-aarch64-jetpack.yml @@ -1,7 +1,7 @@ name: Build and test Linux aarch64 wheels for Jetpack on: - pull_request: + #pull_request: push: branches: - main diff --git a/.github/workflows/build-test-linux-aarch64.yml b/.github/workflows/build-test-linux-aarch64.yml index 2604d18f92..fc3441ffd2 100644 --- a/.github/workflows/build-test-linux-aarch64.yml +++ b/.github/workflows/build-test-linux-aarch64.yml @@ -1,7 +1,7 @@ name: Build and test Linux aarch64 wheels on: - pull_request: + #pull_request: push: branches: - main diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml index 6d94546177..5f95f7d209 100644 --- a/.github/workflows/build-test-linux-x86_64.yml +++ b/.github/workflows/build-test-linux-x86_64.yml @@ -1,7 +1,7 @@ name: Build and test Linux x86_64 wheels on: - pull_request: + #pull_request: push: branches: - main diff --git a/.github/workflows/build-test-linux-x86_64_rtx.yml b/.github/workflows/build-test-linux-x86_64_rtx.yml index 34f9d00568..469b8278aa 100644 --- a/.github/workflows/build-test-linux-x86_64_rtx.yml +++ b/.github/workflows/build-test-linux-x86_64_rtx.yml @@ -1,7 +1,7 @@ name: RTX - Build and test Linux x86_64 wheels on: - pull_request: + #pull_request: push: branches: - main diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index c62515cec4..3d637a1826 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -85,69 +85,6 @@ jobs: trigger-event: ${{ github.event_name }} timeout: 120 - tests-py-torchscript-fe: - name: Test torchscript frontend [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-torchscript-fe - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/modules - python hub.py - popd - pushd . 
- cd tests/py/ts - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/ - popd - - tests-py-dynamo-converters: - name: Test dynamo converters [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-converters - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - nvidia-smi - nvcc --version - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ - popd - tests-py-dynamo-fe: name: Test dynamo frontend [Python] needs: [substitute-runner, build] @@ -173,154 +110,253 @@ jobs: pushd . cd tests/py cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ - popd - - tests-py-dynamo-serde: - name: Test dynamo export serde [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-serde - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . 
- cd tests/py - cd dynamo + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models.xml --ir dynamo models/test_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dyn_models.xml --ir dynamo models/test_dyn_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models_export.xml --ir dynamo models/test_models_export.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ popd - tests-py-torch-compile-be: - name: Test torch compile backend [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-torch-compile-be - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py - ../../../packaging/vc_env_helper.bat python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py - popd - tests-py-dynamo-core: - name: Test dynamo core [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-core - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . 
- cd tests/py - cd dynamo - ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ - ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ - popd + # tests-py-torchscript-fe: + # name: Test torchscript frontend [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-torchscript-fe + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/modules + # python hub.py + # popd + # pushd . + # cd tests/py/ts + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/ + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/ + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/ + # popd - tests-py-dynamo-cudagraphs: - name: Test dynamo cudagraphs [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-cudagraphs - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py - popd + # tests-py-dynamo-converters: + # name: Test dynamo converters [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-converters + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # nvidia-smi + # nvcc --version + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . 
+ # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ + # popd - tests-py-core: - name: Test core [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-core - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py/core - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . - popd + # tests-py-dynamo-fe: + # name: Test dynamo frontend [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-fe + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ + + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ + # popd + + # tests-py-dynamo-serde: + # name: Test dynamo export serde [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-serde + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py + # popd + + # tests-py-torch-compile-be: + # name: Test torch compile backend [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-torch-compile-be + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . 
+ # cd tests/py + # cd dynamo + # python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py + # ../../../packaging/vc_env_helper.bat python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py + # popd + + # tests-py-dynamo-core: + # name: Test dynamo core [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-core + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ + # ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ + # popd + + # tests-py-dynamo-cudagraphs: + # name: Test dynamo cudagraphs [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-cudagraphs + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py + # popd + + # tests-py-core: + # name: Test core [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-core + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py/core + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . 
+ # popd concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }} diff --git a/.github/workflows/build-test-windows_rtx.yml b/.github/workflows/build-test-windows_rtx.yml index 9ee768b964..bf6eaef765 100644 --- a/.github/workflows/build-test-windows_rtx.yml +++ b/.github/workflows/build-test-windows_rtx.yml @@ -87,34 +87,34 @@ jobs: use-rtx: true timeout: 120 - tests-py-dynamo-converters: - name: Test dynamo converters [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-converters - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ - popd + # tests-py-dynamo-converters: + # name: Test dynamo converters [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-converters + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ + # popd tests-py-dynamo-fe: name: Test dynamo frontend [Python] @@ -146,155 +146,155 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ popd - tests-py-dynamo-serde: - name: Test dynamo export serde [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-serde - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . 
- cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py - popd + # tests-py-dynamo-serde: + # name: Test dynamo export serde [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-serde + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py + # popd - tests-py-torch-compile-be: - name: Test torch compile backend [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-torch-compile-be - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py - ../../../packaging/vc_env_helper.bat python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py - popd + # tests-py-torch-compile-be: + # name: Test torch compile backend [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-torch-compile-be + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . 
+ # cd tests/py + # cd dynamo + # python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py + # ../../../packaging/vc_env_helper.bat python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py + # popd - tests-py-dynamo-core: - name: Test dynamo core [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-core - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ - ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ - popd + # tests-py-dynamo-core: + # name: Test dynamo core [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-core + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . 
+ # cd tests/py + # cd dynamo + # ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ + # ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ + # popd - tests-py-dynamo-cudagraphs: - name: Test dynamo cudagraphs [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-cudagraphs - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py - popd + # tests-py-dynamo-cudagraphs: + # name: Test dynamo cudagraphs [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-cudagraphs + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py + # popd - tests-py-core: - name: Test core [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-core - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py/core - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . 
- popd + # tests-py-core: + # name: Test core [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-core + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py/core + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . + # popd concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-tensorrt-rtx-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }} From 49d53cf55cca0a920270fb365f1f11ad87d2c88d Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Mon, 29 Sep 2025 16:58:23 -0700 Subject: [PATCH 2/5] test --- .github/workflows/build-test-windows.yml | 16 +- tests/py/dynamo/models/test_models.py | 820 +++++++++++------------ 2 files changed, 423 insertions(+), 413 deletions(-) diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index 3d637a1826..70d66eab52 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -104,18 +104,28 @@ jobs: build-matrix: ${{ needs.substitute-runner.outputs.matrix }} pre-script: packaging/driver_upgrade.bat script: | - set -euo pipefail + + #set -euo pipefail + set -x export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . 
cd tests/py cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models.xml --ir dynamo models/test_models.py + + gdb --version + nvidia-smi + nvcc --version + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py + echo "test_dtype_support.xml passed" + + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models.xml --ir dynamo models/test_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dyn_models.xml --ir dynamo models/test_dyn_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models_export.xml --ir dynamo models/test_models_export.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ popd diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 3d5e2190d2..33f8d8d458 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -54,413 +54,413 @@ def test_resnet18(ir): torch._dynamo.reset() -@pytest.mark.unit -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_resnet18_cpu_offload(ir): - model = models.resnet18(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - "offload_module_to_cpu": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - if ir == "dynamo": - assertions.assertTrue( - get_model_device(model).type == "cpu", - msg="Model should be offloaded to CPU", - ) - model.cuda() - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), "torchvision not installed" -) -def test_resnet18_torch_exec_ops(ir): - model = models.resnet18(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - min_shape=(1, 3, 224, 224), - opt_shape=(8, 3, 224, 224), - max_shape=(16, 3, 224, 224), - dtype=torch.float32, - ) - ], - "ir": ir, - "enabled_precisions": {torch.float32, torch.float16}, - "min_block_size": 1, - "output_format": "exported_program", - "cache_built_engines": True, - "reuse_cached_engines": True, - "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_mobilenet_v2(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16") - - model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - - compile_spec = { - "inputs": [ - torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) - ], - "device": torchtrt.Device("cuda:0"), - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - pyt_output = model(input) - trt_output = trt_mod(input) - assert pyt_output.dtype == trt_output.dtype - assert pyt_output.dtype == dtype - cos_sim = cosine_similarity(pyt_output, trt_output) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), - "timm or torchvision not installed", -) -def test_efficientnet_b0(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") - - model = ( - timm.create_model("efficientnet_b0", pretrained=True) - .eval() - .to("cuda") - .to(dtype) - ) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - - compile_spec = { - "inputs": [ - torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) - ], - "device": torchtrt.Device("cuda:0"), - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - pyt_output = model(input) - trt_output = trt_mod(input) - assert pyt_output.dtype == trt_output.dtype - assert pyt_output.dtype == dtype - cos_sim = cosine_similarity(pyt_output, trt_output) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"EfficientNet-B0 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("transformers"), - "transformers is required to run this test", -) -def test_bert_base_uncased(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16") - - from transformers import BertModel - - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "truncate_double": True, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 15, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - trt_mod = torchtrt.compile(model, **compile_spec) - - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - for key in model_outputs.keys(): - out, trt_out = model_outputs[key], trt_model_outputs[key] - assert out.dtype == trt_out.dtype - assert out.dtype == dtype - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_bert_base_uncased_cpu_offload(ir): - from transformers import BertModel - - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "truncate_double": True, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 15, - "cache_built_engines": False, - "reuse_cached_engines": False, - "offload_module_to_cpu": True, - } - trt_mod = torchtrt.compile(model, **compile_spec) - if ir == "dynamo": - assertions.assertTrue( - get_model_device(model).type == "cpu", - msg="Model should be offloaded to CPU", - ) - model.cuda() - - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - for key in model_outputs.keys(): - out, trt_out = model_outputs[key], trt_model_outputs[key] - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_resnet18_half(ir): - model = models.resnet18(pretrained=True).eval().to("cuda").half() - input = torch.randn((1, 3, 224, 224)).to("cuda").half() - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.half, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.half}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - torchtrt.ENABLED_FEATURES.tensorrt_rtx, - "bf16 is not supported for tensorrt_rtx", -) -def test_bf16_model(ir): - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) - self.relu = torch.nn.ReLU() - - def forward(self, x): - out = self.conv(x) - out = self.relu(out) - return out - - model = MyModule().eval().cuda().to(torch.bfloat16) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.bfloat16, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float32}, - "ir": ir, - "pass_through_build_failures": True, - "min_block_size": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"BF16 model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - torchtrt.ENABLED_FEATURES.tensorrt_rtx, - "bf16 is not supported for tensorrt_rtx", -) -def test_bf16_fallback_model(ir): - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) - self.relu = torch.nn.ReLU() - self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) - - def forward(self, x): - out = self.conv(x) - out = self.relu(out) - out = self.conv2(out) - return out - - model = MyModule().eval().cuda().to(torch.bfloat16) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.bfloat16, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float32}, - "ir": ir, - "pass_through_build_failures": True, - "min_block_size": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - "torch_executed_ops": {"torch.ops.aten.relu.default"}, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"BF16 fallback model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() +# @pytest.mark.unit +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_resnet18_cpu_offload(ir): +# model = models.resnet18(pretrained=True).eval().to("cuda") +# input = torch.randn((1, 3, 224, 224)).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.float, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float}, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "offload_module_to_cpu": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# if ir == "dynamo": +# assertions.assertTrue( +# get_model_device(model).type == "cpu", +# msg="Model should be offloaded to CPU", +# ) +# model.cuda() +# cos_sim = cosine_similarity(model(input), trt_mod(input)) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), "torchvision not installed" +# ) +# def test_resnet18_torch_exec_ops(ir): +# model = models.resnet18(pretrained=True).eval().to("cuda") +# input = torch.randn((1, 3, 224, 224)).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# min_shape=(1, 3, 224, 224), +# opt_shape=(8, 3, 224, 224), +# max_shape=(16, 3, 224, 224), +# dtype=torch.float32, +# ) +# ], +# "ir": ir, +# "enabled_precisions": {torch.float32, torch.float16}, +# "min_block_size": 1, +# "output_format": "exported_program", +# "cache_built_engines": True, +# "reuse_cached_engines": True, +# "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_mobilenet_v2(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16") + +# model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) +# ], +# "device": torchtrt.Device("cuda:0"), +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 10, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# pyt_output = model(input) +# trt_output = trt_mod(input) +# assert pyt_output.dtype == trt_output.dtype +# assert pyt_output.dtype == dtype +# cos_sim = cosine_similarity(pyt_output, trt_output) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Mobilenet v2 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), +# "timm or torchvision not installed", +# ) +# def test_efficientnet_b0(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") + +# model = ( +# timm.create_model("efficientnet_b0", pretrained=True) +# .eval() +# .to("cuda") +# .to(dtype) +# ) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) +# ], +# "device": torchtrt.Device("cuda:0"), +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 10, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# pyt_output = model(input) +# trt_output = trt_mod(input) +# assert pyt_output.dtype == trt_output.dtype +# assert pyt_output.dtype == dtype +# cos_sim = cosine_similarity(pyt_output, trt_output) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("transformers"), +# "transformers is required to run this test", +# ) +# def test_bert_base_uncased(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16") + +# from transformers import BertModel + +# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) +# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") +# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# ], +# "device": torchtrt.Device("cuda:0"), +# "truncate_double": True, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 15, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } +# trt_mod = torchtrt.compile(model, **compile_spec) + +# model_outputs = model(input, input2) +# trt_model_outputs = trt_mod(input, input2) +# for key in model_outputs.keys(): +# out, trt_out = model_outputs[key], trt_model_outputs[key] +# assert out.dtype == trt_out.dtype +# assert out.dtype == dtype +# cos_sim = cosine_similarity(out, trt_out) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# def test_bert_base_uncased_cpu_offload(ir): +# from transformers import BertModel + +# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() +# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") +# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float}, +# "truncate_double": True, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 15, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "offload_module_to_cpu": True, +# } +# trt_mod = torchtrt.compile(model, **compile_spec) +# if ir == "dynamo": +# assertions.assertTrue( +# get_model_device(model).type == "cpu", +# msg="Model should be offloaded to CPU", +# ) +# model.cuda() + +# model_outputs = model(input, input2) +# trt_model_outputs = trt_mod(input, input2) +# for key in model_outputs.keys(): +# out, trt_out = model_outputs[key], trt_model_outputs[key] +# cos_sim = cosine_similarity(out, trt_out) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_resnet18_half(ir): +# model = models.resnet18(pretrained=True).eval().to("cuda").half() +# input = torch.randn((1, 3, 224, 224)).to("cuda").half() + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.half, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.half}, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Resnet18 Half TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# torchtrt.ENABLED_FEATURES.tensorrt_rtx, +# "bf16 is not supported for tensorrt_rtx", +# ) +# def test_bf16_model(ir): +# class MyModule(torch.nn.Module): +# def __init__(self): +# super().__init__() +# self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) +# self.relu = torch.nn.ReLU() + +# def forward(self, x): +# out = self.conv(x) +# out = self.relu(out) +# return out + +# model = MyModule().eval().cuda().to(torch.bfloat16) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float32}, +# "ir": ir, +# "pass_through_build_failures": True, +# "min_block_size": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) + +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# torchtrt.ENABLED_FEATURES.tensorrt_rtx, +# "bf16 is not supported for tensorrt_rtx", +# ) +# def test_bf16_fallback_model(ir): +# class MyModule(torch.nn.Module): +# def __init__(self): +# super().__init__() +# self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) +# self.relu = torch.nn.ReLU() +# self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) + +# def forward(self, x): +# out = self.conv(x) +# out = self.relu(out) +# out = self.conv2(out) +# return out + +# model = MyModule().eval().cuda().to(torch.bfloat16) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float32}, +# "ir": ir, +# "pass_through_build_failures": True, +# "min_block_size": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# "torch_executed_ops": {"torch.ops.aten.relu.default"}, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) + +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"BF16 fallback model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() From 19f428e4a6926de04f3bb20f336c83fc7624e49b Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Tue, 30 Sep 2025 08:19:22 -0700 Subject: [PATCH 3/5] debug --- .github/workflows/build-test-windows.yml | 10 +- tests/py/dynamo/models/test_models.py | 787 +++++++++++------------ 2 files changed, 385 insertions(+), 412 deletions(-) diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index 70d66eab52..ace1a595be 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -119,7 +119,15 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py echo "test_dtype_support.xml passed" - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models.xml --ir dynamo models/test_models.py + python -m pytest -k "lan_1" -rP models/test_models.py + python -m pytest -k "lan_2" -rP models/test_models.py + python -m pytest -k "lan_3" -rP models/test_models.py + python -m pytest -k "lan_4" -rP models/test_models.py + python -m pytest -k "lan_5" -rP models/test_models.py + python -m pytest -k "lan_6" -rP models/test_models.py + python -m pytest -k "lan_7" -rP models/test_models.py + python -m pytest -k "lan_8" -rP models/test_models.py + python -m pytest -k "lan_9" -rP models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dyn_models.xml --ir dynamo models/test_dyn_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models_export.xml --ir dynamo models/test_models_export.py diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 33f8d8d458..1ff465dc27 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -24,7 +24,7 @@ not importlib.util.find_spec("torchvision"), "torchvision is not installed", ) -def test_resnet18(ir): +def test_lan_1_resnet18_cpu_offload(ir): model = models.resnet18(pretrained=True).eval().to("cuda") input = torch.randn((1, 3, 224, 224)).to("cuda") @@ -41,9 +41,16 @@ def test_resnet18(ir): "optimization_level": 1, "cache_built_engines": False, "reuse_cached_engines": False, + "offload_module_to_cpu": True, } trt_mod = torchtrt.compile(model, **compile_spec) + if ir == "dynamo": + assertions.assertTrue( + get_model_device(model).type == "cpu", + msg="Model should be offloaded to CPU", + ) + model.cuda() cos_sim = cosine_similarity(model(input), trt_mod(input)) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, @@ -54,413 +61,371 @@ def test_resnet18(ir): torch._dynamo.reset() -# @pytest.mark.unit -# @unittest.skipIf( -# not importlib.util.find_spec("torchvision"), -# "torchvision is not installed", -# ) -# def test_resnet18_cpu_offload(ir): -# model = models.resnet18(pretrained=True).eval().to("cuda") -# input = torch.randn((1, 3, 224, 224)).to("cuda") - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, dtype=torch.float, format=torch.contiguous_format -# ) -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.float}, -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "offload_module_to_cpu": True, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# if ir == "dynamo": -# assertions.assertTrue( -# 
get_model_device(model).type == "cpu", -# msg="Model should be offloaded to CPU", -# ) -# model.cuda() -# cos_sim = cosine_similarity(model(input), trt_mod(input)) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @unittest.skipIf( -# not importlib.util.find_spec("torchvision"), "torchvision not installed" -# ) -# def test_resnet18_torch_exec_ops(ir): -# model = models.resnet18(pretrained=True).eval().to("cuda") -# input = torch.randn((1, 3, 224, 224)).to("cuda") - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# min_shape=(1, 3, 224, 224), -# opt_shape=(8, 3, 224, 224), -# max_shape=(16, 3, 224, 224), -# dtype=torch.float32, -# ) -# ], -# "ir": ir, -# "enabled_precisions": {torch.float32, torch.float16}, -# "min_block_size": 1, -# "output_format": "exported_program", -# "cache_built_engines": True, -# "reuse_cached_engines": True, -# "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# cos_sim = cosine_similarity(model(input), trt_mod(input)) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -# @unittest.skipIf( -# not importlib.util.find_spec("torchvision"), -# "torchvision is not installed", -# ) -# def test_mobilenet_v2(ir, dtype): -# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: -# pytest.skip("TensorRT-RTX does not support bfloat16") - -# model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) -# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - -# compile_spec = { -# "inputs": [ -# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) -# ], -# "device": torchtrt.Device("cuda:0"), -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "min_block_size": 10, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# pyt_output = model(input) -# trt_output = trt_mod(input) -# assert pyt_output.dtype == trt_output.dtype -# assert pyt_output.dtype == dtype -# cos_sim = cosine_similarity(pyt_output, trt_output) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"Mobilenet v2 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -# @unittest.skipIf( -# not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), -# "timm or torchvision not installed", -# ) -# def test_efficientnet_b0(ir, dtype): -# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: -# pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") - -# model = ( -# timm.create_model("efficientnet_b0", pretrained=True) -# .eval() -# .to("cuda") -# .to(dtype) -# ) -# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - -# compile_spec = { -# "inputs": [ -# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) -# ], -# "device": torchtrt.Device("cuda:0"), -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "min_block_size": 10, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# pyt_output = model(input) -# trt_output = trt_mod(input) -# assert pyt_output.dtype == trt_output.dtype -# assert pyt_output.dtype == dtype -# cos_sim = cosine_similarity(pyt_output, trt_output) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -# @unittest.skipIf( -# not importlib.util.find_spec("transformers"), -# "transformers is required to run this test", -# ) -# def test_bert_base_uncased(ir, dtype): -# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: -# pytest.skip("TensorRT-RTX does not support bfloat16") - -# from transformers import BertModel - -# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) -# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") -# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, -# dtype=input.dtype, -# format=torch.contiguous_format, -# ), -# torchtrt.Input( -# input.shape, -# dtype=input.dtype, -# format=torch.contiguous_format, -# ), -# ], -# "device": torchtrt.Device("cuda:0"), -# "truncate_double": True, -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "min_block_size": 15, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# } -# trt_mod = torchtrt.compile(model, **compile_spec) - -# model_outputs = model(input, input2) -# trt_model_outputs = trt_mod(input, input2) -# for key in model_outputs.keys(): -# out, trt_out = model_outputs[key], trt_model_outputs[key] -# assert out.dtype == trt_out.dtype -# assert out.dtype == dtype -# cos_sim = cosine_similarity(out, trt_out) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# def test_bert_base_uncased_cpu_offload(ir): -# from transformers import BertModel - -# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() -# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") -# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, -# dtype=input.dtype, -# format=torch.contiguous_format, -# ), -# torchtrt.Input( -# input.shape, -# dtype=input.dtype, -# format=torch.contiguous_format, -# ), -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.float}, -# "truncate_double": True, -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "min_block_size": 15, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "offload_module_to_cpu": True, -# } -# trt_mod = torchtrt.compile(model, **compile_spec) -# if ir == "dynamo": -# assertions.assertTrue( -# get_model_device(model).type == "cpu", -# msg="Model should be offloaded to CPU", -# ) -# model.cuda() - -# model_outputs = model(input, input2) -# trt_model_outputs = trt_mod(input, input2) -# for key in model_outputs.keys(): -# out, trt_out = model_outputs[key], trt_model_outputs[key] -# cos_sim = cosine_similarity(out, trt_out) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @unittest.skipIf( -# not importlib.util.find_spec("torchvision"), -# "torchvision is not installed", -# ) -# def test_resnet18_half(ir): -# model = models.resnet18(pretrained=True).eval().to("cuda").half() -# input = torch.randn((1, 3, 224, 224)).to("cuda").half() - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, dtype=torch.half, format=torch.contiguous_format -# ) -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.half}, -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# cos_sim = cosine_similarity(model(input), trt_mod(input)) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"Resnet18 Half TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @unittest.skipIf( -# torchtrt.ENABLED_FEATURES.tensorrt_rtx, -# "bf16 is not supported for tensorrt_rtx", -# ) -# def test_bf16_model(ir): -# class MyModule(torch.nn.Module): -# def __init__(self): -# super().__init__() -# self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) -# self.relu = torch.nn.ReLU() - -# def forward(self, x): -# out = self.conv(x) -# out = self.relu(out) -# return out - -# model = MyModule().eval().cuda().to(torch.bfloat16) -# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format -# ) -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.float32}, -# "ir": ir, -# "pass_through_build_failures": True, -# "min_block_size": 1, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# cos_sim = cosine_similarity(model(input), trt_mod(input)) - -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @unittest.skipIf( -# torchtrt.ENABLED_FEATURES.tensorrt_rtx, -# "bf16 is not supported for tensorrt_rtx", -# ) -# def test_bf16_fallback_model(ir): -# class MyModule(torch.nn.Module): -# def __init__(self): -# super().__init__() -# self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) -# self.relu = torch.nn.ReLU() -# self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) - -# def forward(self, x): -# out = self.conv(x) -# out = self.relu(out) -# out = self.conv2(out) -# return out - -# model = MyModule().eval().cuda().to(torch.bfloat16) -# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format -# ) -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.float32}, -# "ir": ir, -# "pass_through_build_failures": True, -# "min_block_size": 1, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# "torch_executed_ops": {"torch.ops.aten.relu.default"}, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# cos_sim = cosine_similarity(model(input), trt_mod(input)) - -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"BF16 fallback model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() +@unittest.skipIf( + not importlib.util.find_spec("torchvision"), "torchvision not installed" +) +def test_lan_2_resnet18_torch_exec_ops(ir): + model = models.resnet18(pretrained=True).eval().to("cuda") + input = torch.randn((1, 3, 224, 224)).to("cuda") + + compile_spec = { + "inputs": [ + torchtrt.Input( + min_shape=(1, 3, 224, 224), + opt_shape=(8, 3, 224, 224), + max_shape=(16, 3, 224, 224), + dtype=torch.float32, + ) + ], + "ir": ir, + "enabled_precisions": {torch.float32, torch.float16}, + "min_block_size": 1, + "output_format": "exported_program", + "cache_built_engines": True, + "reuse_cached_engines": True, + "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@unittest.skipIf( + not importlib.util.find_spec("torchvision"), + "torchvision is not installed", +) +def test_lan_3_mobilenet_v2(ir, dtype): + if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: + pytest.skip("TensorRT-RTX does not support bfloat16") + + model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + + compile_spec = { + "inputs": [ + torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) + ], + "device": torchtrt.Device("cuda:0"), + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 10, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + pyt_output = model(input) + trt_output = trt_mod(input) + assert pyt_output.dtype == trt_output.dtype + assert pyt_output.dtype == dtype + cos_sim = cosine_similarity(pyt_output, trt_output) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"Mobilenet v2 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@unittest.skipIf( + not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), + "timm or torchvision not installed", +) +def test_lan_4_efficientnet_b0(ir, dtype): + if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: + pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") + + model = ( + timm.create_model("efficientnet_b0", pretrained=True) + .eval() + .to("cuda") + .to(dtype) + ) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + + compile_spec = { + "inputs": [ + torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) + ], + "device": torchtrt.Device("cuda:0"), + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 10, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + pyt_output = model(input) + trt_output = trt_mod(input) + assert pyt_output.dtype == trt_output.dtype + assert pyt_output.dtype == dtype + cos_sim = cosine_similarity(pyt_output, trt_output) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@unittest.skipIf( + not importlib.util.find_spec("transformers"), + "transformers is required to run this test", +) +def test_lan_5_bert_base_uncased(ir, dtype): + if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: + pytest.skip("TensorRT-RTX does not support bfloat16") + + from transformers import BertModel + + model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) + input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, + dtype=input.dtype, + format=torch.contiguous_format, + ), + torchtrt.Input( + input.shape, + dtype=input.dtype, + format=torch.contiguous_format, + ), + ], + "device": torchtrt.Device("cuda:0"), + "truncate_double": True, + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 15, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + trt_mod = torchtrt.compile(model, **compile_spec) + + model_outputs = model(input, input2) + trt_model_outputs = trt_mod(input, input2) + for key in model_outputs.keys(): + out, trt_out = model_outputs[key], trt_model_outputs[key] + assert out.dtype == trt_out.dtype + assert out.dtype == dtype + cos_sim = cosine_similarity(out, trt_out) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +def test_lan_6_bert_base_uncased_cpu_offload(ir): + from transformers import BertModel + + model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() + input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, + dtype=input.dtype, + format=torch.contiguous_format, + ), + torchtrt.Input( + input.shape, + dtype=input.dtype, + format=torch.contiguous_format, + ), + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float}, + "truncate_double": True, + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 15, + "cache_built_engines": False, + "reuse_cached_engines": False, + "offload_module_to_cpu": True, + } + trt_mod = torchtrt.compile(model, **compile_spec) + if ir == "dynamo": + assertions.assertTrue( + get_model_device(model).type == "cpu", + msg="Model should be offloaded to CPU", + ) + model.cuda() + + model_outputs = model(input, input2) + trt_model_outputs = trt_mod(input, input2) + for key in model_outputs.keys(): + out, trt_out = model_outputs[key], trt_model_outputs[key] + cos_sim = cosine_similarity(out, trt_out) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@unittest.skipIf( + not importlib.util.find_spec("torchvision"), + "torchvision is not installed", +) +def test_lan_7_resnet18_half(ir): + model = models.resnet18(pretrained=True).eval().to("cuda").half() + input = torch.randn((1, 3, 224, 224)).to("cuda").half() + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.half, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.half}, + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"Resnet18 Half TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@unittest.skipIf( + torchtrt.ENABLED_FEATURES.tensorrt_rtx, + "bf16 is not supported for tensorrt_rtx", +) +def test_lan_8_bf16_model(ir): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) + self.relu = torch.nn.ReLU() + + def forward(self, x): + out = self.conv(x) + out = self.relu(out) + return out + + model = MyModule().eval().cuda().to(torch.bfloat16) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float32}, + "ir": ir, + "pass_through_build_failures": True, + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@unittest.skipIf( + torchtrt.ENABLED_FEATURES.tensorrt_rtx, + "bf16 is not supported for tensorrt_rtx", +) +def test_lan_9_bf16_fallback_model(ir): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) + self.relu = torch.nn.ReLU() + self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) + + def forward(self, x): + out = self.conv(x) + out = self.relu(out) + out = self.conv2(out) + return out + + model = MyModule().eval().cuda().to(torch.bfloat16) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float32}, + "ir": ir, + "pass_through_build_failures": True, + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + "torch_executed_ops": {"torch.ops.aten.relu.default"}, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"BF16 fallback model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() From 108f8231871836578cd2c853a17f5a4b7be3fe41 Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Tue, 30 Sep 2025 11:34:21 -0700 Subject: [PATCH 4/5] test --- .github/workflows/build-test-windows.yml | 19 +- .github/workflows/build-test-windows_rtx.yml | 2 +- py/torch_tensorrt/_compile.py | 2 + py/torch_tensorrt/dynamo/_compiler.py | 6 + tests/py/dynamo/models/test_models.py | 685 ++++++++++--------- 5 files changed, 369 insertions(+), 345 deletions(-) diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index ace1a595be..bed72e7274 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -116,25 +116,12 @@ jobs: gdb --version nvidia-smi nvcc --version - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py - echo "test_dtype_support.xml passed" - python -m pytest -k "lan_1" -rP models/test_models.py + echo "lan added finished lan_1" python -m pytest -k "lan_2" -rP models/test_models.py + echo "lan added finished lan_2" python -m pytest -k "lan_3" -rP models/test_models.py - python -m pytest -k "lan_4" -rP models/test_models.py - python -m pytest -k "lan_5" -rP models/test_models.py - python -m pytest -k "lan_6" -rP models/test_models.py - python -m pytest -k "lan_7" -rP models/test_models.py - python -m pytest -k "lan_8" -rP models/test_models.py - python -m pytest -k "lan_9" -rP models/test_models.py - - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dyn_models.xml --ir dynamo models/test_dyn_models.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models_export.xml --ir dynamo models/test_models_export.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py - - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ + echo "lan added finished lan_3" popd diff --git a/.github/workflows/build-test-windows_rtx.yml b/.github/workflows/build-test-windows_rtx.yml index bf6eaef765..4bd08deafd 100644 --- a/.github/workflows/build-test-windows_rtx.yml +++ b/.github/workflows/build-test-windows_rtx.yml @@ -1,7 +1,7 @@ name: RTX - Build and test Windows wheels on: - pull_request: + #pull_request: push: branches: - main diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 8d8c51cbfc..c26fff2cc0 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -227,6 +227,7 @@ def compile( module_type = _parse_module_type(module) target_ir = _get_target_fe(module_type, ir) + print(f"lan added {target_ir=}") if target_ir == _IRType.ts: ts_mod = module if module_type == _ModuleType.nn: @@ -307,6 +308,7 @@ def _fx_input_interface( exp_program = dynamo_trace( module, torchtrt_arg_inputs, kwarg_inputs=torchtrt_kwarg_inputs, **kwargs ) + print(f"lan added {str(exp_program.graph)=}") trt_graph_module = dynamo_compile( exp_program, arg_inputs=torchtrt_arg_inputs, diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 0dc4654db0..a24aeea75f 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -5,6 +5,7 @@ import os import platform import warnings 
+from functools import total_ordering from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union import torch @@ -809,6 +810,7 @@ def preserve_module_specs( # Partition module into components that can be TRT-accelerated fast_partitioner_failed = False + print(f"lan added {str(gm.graph)=}") # If specified, try using the fast partitioner and fall back to the global one on failure if settings.use_fast_partitioner: try: @@ -847,10 +849,14 @@ def preserve_module_specs( dryrun_tracker.to_run_in_torch.extend(parse_non_trt_nodes(partitioned_module)) submodule_node_dict = {} + print(f"lan added {list(partitioned_module.graph.nodes)=}") + print(f"lan added {total_ops=} {num_supported_ops=}") for node in partitioned_module.graph.nodes: if "_run_on_acc" not in node.name: + print(f"lan added skipped node{node.name=}") continue submodule_node_dict[node.name] = node + print(f"lan added added submodule{node.name=}") preserve_module_specs(original_in_spec, original_out_spec, partitioned_module) # Store TRT replicas of Torch subgraphs diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 1ff465dc27..dfd5b5aba5 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -19,52 +19,10 @@ import timm -@pytest.mark.unit -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_lan_1_resnet18_cpu_offload(ir): - model = models.resnet18(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - "offload_module_to_cpu": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - if ir == "dynamo": - assertions.assertTrue( - get_model_device(model).type == "cpu", - msg="Model should be offloaded to CPU", - ) - model.cuda() - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - @unittest.skipIf( not importlib.util.find_spec("torchvision"), "torchvision not installed" ) -def test_lan_2_resnet18_torch_exec_ops(ir): +def test_lan_1_resnet18_torch_exec_ops(ir): model = models.resnet18(pretrained=True).eval().to("cuda") input = torch.randn((1, 3, 224, 224)).to("cuda") @@ -97,335 +55,406 @@ def test_lan_2_resnet18_torch_exec_ops(ir): torch._dynamo.reset() -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_lan_3_mobilenet_v2(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16") - - model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - - compile_spec = { - "inputs": [ - torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) - ], - "device": torchtrt.Device("cuda:0"), - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - pyt_output = model(input) - trt_output = trt_mod(input) - assert pyt_output.dtype == trt_output.dtype - assert pyt_output.dtype == dtype - cos_sim = cosine_similarity(pyt_output, trt_output) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), - "timm or torchvision not installed", -) -def test_lan_4_efficientnet_b0(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") - - model = ( - timm.create_model("efficientnet_b0", pretrained=True) - .eval() - .to("cuda") - .to(dtype) - ) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - - compile_spec = { - "inputs": [ - torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) - ], - "device": torchtrt.Device("cuda:0"), - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - pyt_output = model(input) - trt_output = trt_mod(input) - assert pyt_output.dtype == trt_output.dtype - assert pyt_output.dtype == dtype - cos_sim = cosine_similarity(pyt_output, trt_output) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"EfficientNet-B0 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("transformers"), - "transformers is required to run this test", -) -def test_lan_5_bert_base_uncased(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16") - - from transformers import BertModel - - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "truncate_double": True, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 15, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - trt_mod = torchtrt.compile(model, **compile_spec) - - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - for key in model_outputs.keys(): - out, trt_out = model_outputs[key], trt_model_outputs[key] - assert out.dtype == trt_out.dtype - assert out.dtype == dtype - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_lan_6_bert_base_uncased_cpu_offload(ir): - from transformers import BertModel - - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "truncate_double": True, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 15, - "cache_built_engines": False, - "reuse_cached_engines": False, - "offload_module_to_cpu": True, - } - trt_mod = torchtrt.compile(model, **compile_spec) - if ir == "dynamo": - assertions.assertTrue( - get_model_device(model).type == "cpu", - msg="Model should be offloaded to CPU", - ) - model.cuda() - - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - for key in model_outputs.keys(): - out, trt_out = model_outputs[key], trt_model_outputs[key] - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit @unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", + not importlib.util.find_spec("torchvision"), "torchvision not installed" ) -def test_lan_7_resnet18_half(ir): - model = models.resnet18(pretrained=True).eval().to("cuda").half() - input = torch.randn((1, 3, 224, 224)).to("cuda").half() +def test_lan_2_resnet18_torch_exec_ops(ir): + model = models.resnet18(pretrained=True).eval().to("cuda") + input = torch.randn((1, 3, 224, 224)).to("cuda") compile_spec = { "inputs": [ torchtrt.Input( - input.shape, dtype=torch.half, format=torch.contiguous_format + min_shape=(1, 3, 224, 224), + opt_shape=(8, 3, 224, 224), + max_shape=(16, 3, 224, 224), + dtype=torch.float32, ) ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.half}, "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - torchtrt.ENABLED_FEATURES.tensorrt_rtx, - "bf16 is not supported for tensorrt_rtx", -) -def test_lan_8_bf16_model(ir): - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) - self.relu = torch.nn.ReLU() - - def forward(self, x): - out = self.conv(x) - out = self.relu(out) - return out - - model = MyModule().eval().cuda().to(torch.bfloat16) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.bfloat16, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), "enabled_precisions": {torch.float32}, - "ir": ir, - "pass_through_build_failures": True, "min_block_size": 1, + "output_format": "exported_program", "cache_built_engines": False, "reuse_cached_engines": False, - "use_explicit_typing": True, + "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, } trt_mod = torchtrt.compile(model, **compile_spec) cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) # Clean up model env torch._dynamo.reset() -@pytest.mark.unit @unittest.skipIf( - torchtrt.ENABLED_FEATURES.tensorrt_rtx, - "bf16 is not supported for tensorrt_rtx", + not importlib.util.find_spec("torchvision"), "torchvision not installed" ) -def test_lan_9_bf16_fallback_model(ir): - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) - self.relu = torch.nn.ReLU() - self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) - - def forward(self, x): - out = self.conv(x) - out = self.relu(out) - out = self.conv2(out) - return out - - model = MyModule().eval().cuda().to(torch.bfloat16) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) +def test_lan_3_resnet18_torch_exec_ops(ir): + model = models.resnet18(pretrained=True).eval().to("cuda") + input = torch.randn((1, 3, 224, 224)).to("cuda") compile_spec = { "inputs": [ torchtrt.Input( - input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + min_shape=(1, 3, 224, 224), + opt_shape=(8, 3, 224, 224), + max_shape=(16, 3, 224, 224), + dtype=torch.float32, ) ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float32}, "ir": ir, - "pass_through_build_failures": True, + "enabled_precisions": {torch.float32}, "min_block_size": 1, "cache_built_engines": False, "reuse_cached_engines": False, - "use_explicit_typing": True, - "torch_executed_ops": {"torch.ops.aten.relu.default"}, + "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, } trt_mod = torchtrt.compile(model, **compile_spec) cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"BF16 fallback model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) # Clean up model env torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_lan_3_mobilenet_v2(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16") + +# model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) +# ], +# "device": torchtrt.Device("cuda:0"), +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 10, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# pyt_output = model(input) +# trt_output = trt_mod(input) +# assert pyt_output.dtype == trt_output.dtype +# assert pyt_output.dtype == dtype +# cos_sim = cosine_similarity(pyt_output, trt_output) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Mobilenet v2 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), +# "timm or torchvision not installed", +# ) +# def test_lan_4_efficientnet_b0(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") + +# model = ( +# timm.create_model("efficientnet_b0", pretrained=True) +# .eval() +# .to("cuda") +# .to(dtype) +# ) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) +# ], +# "device": torchtrt.Device("cuda:0"), +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 10, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# pyt_output = model(input) +# trt_output = trt_mod(input) +# assert pyt_output.dtype == trt_output.dtype +# assert pyt_output.dtype == dtype +# cos_sim = cosine_similarity(pyt_output, trt_output) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("transformers"), +# "transformers is required to run this test", +# ) +# def test_lan_5_bert_base_uncased(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16") + +# from transformers import BertModel + +# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) +# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") +# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# ], +# "device": torchtrt.Device("cuda:0"), +# "truncate_double": True, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 15, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } +# trt_mod = torchtrt.compile(model, **compile_spec) + +# model_outputs = model(input, input2) +# trt_model_outputs = trt_mod(input, input2) +# for key in model_outputs.keys(): +# out, trt_out = model_outputs[key], trt_model_outputs[key] +# assert out.dtype == trt_out.dtype +# assert out.dtype == dtype +# cos_sim = cosine_similarity(out, trt_out) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# def test_lan_6_bert_base_uncased_cpu_offload(ir): +# from transformers import BertModel + +# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() +# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") +# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float}, +# "truncate_double": True, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 15, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "offload_module_to_cpu": True, +# } +# trt_mod = torchtrt.compile(model, **compile_spec) +# if ir == "dynamo": +# assertions.assertTrue( +# get_model_device(model).type == "cpu", +# msg="Model should be offloaded to CPU", +# ) +# model.cuda() + +# model_outputs = model(input, input2) +# trt_model_outputs = trt_mod(input, input2) +# for key in model_outputs.keys(): +# out, trt_out = model_outputs[key], trt_model_outputs[key] +# cos_sim = cosine_similarity(out, trt_out) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_lan_7_resnet18_half(ir): +# model = models.resnet18(pretrained=True).eval().to("cuda").half() +# input = torch.randn((1, 3, 224, 224)).to("cuda").half() + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.half, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.half}, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Resnet18 Half TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# torchtrt.ENABLED_FEATURES.tensorrt_rtx, +# "bf16 is not supported for tensorrt_rtx", +# ) +# def test_lan_8_bf16_model(ir): +# class MyModule(torch.nn.Module): +# def __init__(self): +# super().__init__() +# self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) +# self.relu = torch.nn.ReLU() + +# def forward(self, x): +# out = self.conv(x) +# out = self.relu(out) +# return out + +# model = MyModule().eval().cuda().to(torch.bfloat16) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float32}, +# "ir": ir, +# "pass_through_build_failures": True, +# "min_block_size": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) + +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# torchtrt.ENABLED_FEATURES.tensorrt_rtx, +# "bf16 is not supported for tensorrt_rtx", +# ) +# def test_lan_9_bf16_fallback_model(ir): +# class MyModule(torch.nn.Module): +# def __init__(self): +# super().__init__() +# self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) +# self.relu = torch.nn.ReLU() +# self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) + +# def forward(self, x): +# out = self.conv(x) +# out = self.relu(out) +# out = self.conv2(out) +# return out + +# model = MyModule().eval().cuda().to(torch.bfloat16) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float32}, +# "ir": ir, +# "pass_through_build_failures": True, +# "min_block_size": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# "torch_executed_ops": {"torch.ops.aten.relu.default"}, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) + +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"BF16 fallback model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() From 662afdef21d00a5a960da399baf8efa5144f317b Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Tue, 30 Sep 2025 13:54:28 -0700 Subject: [PATCH 5/5] test --- tests/py/dynamo/models/test_models.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index dfd5b5aba5..96b9ac9002 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -36,7 +36,7 @@ def test_lan_1_resnet18_torch_exec_ops(ir): ) ], "ir": ir, - "enabled_precisions": {torch.float32, torch.float16}, + "enabled_precisions": {torch.float32}, "min_block_size": 1, "output_format": "exported_program", "cache_built_engines": True, @@ -72,11 +72,11 @@ def test_lan_2_resnet18_torch_exec_ops(ir): ) ], "ir": ir, - "enabled_precisions": {torch.float32}, + "enabled_precisions": {torch.float32, torch.float16}, "min_block_size": 1, "output_format": "exported_program", - "cache_built_engines": False, - "reuse_cached_engines": False, + "cache_built_engines": True, + "reuse_cached_engines": True, "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, } @@ -108,8 +108,9 @@ def test_lan_3_resnet18_torch_exec_ops(ir): ) ], "ir": ir, - "enabled_precisions": {torch.float32}, + "enabled_precisions": {torch.float32, torch.float16}, "min_block_size": 1, + "output_format": "exported_program", "cache_built_engines": False, "reuse_cached_engines": False, "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"},
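
The per-test `-k` invocations introduced in these patches bisect a hard crash: a Windows access violation aborts the whole pytest process before any JUnit XML can be written, so each renamed `lan_N` test gets its own process and an `echo` marker records the last test that completed. Below is a minimal sketch of the same idea written as a loop rather than repeated commands; it assumes the renamed `test_lan_N_*` tests and the `tests/py/dynamo` layout from the patches above, and is an illustration of the bisection technique, not part of the patch series itself:

    set -euo pipefail
    cd tests/py/dynamo
    for i in 1 2 3 4 5 6 7 8 9; do
        # Each pytest run is a separate process. If test lan_<i> triggers the
        # access violation, this invocation dies, `set -e` stops the loop, and
        # the last "finished" marker in the CI log identifies the crashing test.
        python -m pytest -k "lan_${i}" -rP models/test_models.py
        echo "lan added finished lan_${i}"
    done

Running each test in a fresh interpreter also starts from clean CUDA and TensorRT state, which in-process cleanup such as torch._dynamo.reset() cannot guarantee after a native crash in a previous test.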