From 231bf2c8eaf5590d391c0e0e2fd08c8e11888d23 Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Mon, 29 Sep 2025 13:15:39 -0700 Subject: [PATCH 1/5] test windows access violation issue --- .../build-test-linux-aarch64-jetpack.yml | 2 +- .../workflows/build-test-linux-aarch64.yml | 2 +- .github/workflows/build-test-linux-x86_64.yml | 2 +- .../workflows/build-test-linux-x86_64_rtx.yml | 2 +- .github/workflows/build-test-windows.yml | 444 ++++++++++-------- .github/workflows/build-test-windows_rtx.yml | 346 +++++++------- 6 files changed, 417 insertions(+), 381 deletions(-) diff --git a/.github/workflows/build-test-linux-aarch64-jetpack.yml b/.github/workflows/build-test-linux-aarch64-jetpack.yml index ff827ad3f6..0c92e207bb 100644 --- a/.github/workflows/build-test-linux-aarch64-jetpack.yml +++ b/.github/workflows/build-test-linux-aarch64-jetpack.yml @@ -1,7 +1,7 @@ name: Build and test Linux aarch64 wheels for Jetpack on: - pull_request: + #pull_request: push: branches: - main diff --git a/.github/workflows/build-test-linux-aarch64.yml b/.github/workflows/build-test-linux-aarch64.yml index 2604d18f92..fc3441ffd2 100644 --- a/.github/workflows/build-test-linux-aarch64.yml +++ b/.github/workflows/build-test-linux-aarch64.yml @@ -1,7 +1,7 @@ name: Build and test Linux aarch64 wheels on: - pull_request: + #pull_request: push: branches: - main diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml index 6d94546177..5f95f7d209 100644 --- a/.github/workflows/build-test-linux-x86_64.yml +++ b/.github/workflows/build-test-linux-x86_64.yml @@ -1,7 +1,7 @@ name: Build and test Linux x86_64 wheels on: - pull_request: + #pull_request: push: branches: - main diff --git a/.github/workflows/build-test-linux-x86_64_rtx.yml b/.github/workflows/build-test-linux-x86_64_rtx.yml index 34f9d00568..469b8278aa 100644 --- a/.github/workflows/build-test-linux-x86_64_rtx.yml +++ b/.github/workflows/build-test-linux-x86_64_rtx.yml @@ -1,7 +1,7 @@ name: RTX - Build and test Linux x86_64 wheels on: - pull_request: + #pull_request: push: branches: - main diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index c62515cec4..3d637a1826 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -85,69 +85,6 @@ jobs: trigger-event: ${{ github.event_name }} timeout: 120 - tests-py-torchscript-fe: - name: Test torchscript frontend [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-torchscript-fe - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/modules - python hub.py - popd - pushd . 
- cd tests/py/ts - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/ - popd - - tests-py-dynamo-converters: - name: Test dynamo converters [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-converters - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - nvidia-smi - nvcc --version - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ - popd - tests-py-dynamo-fe: name: Test dynamo frontend [Python] needs: [substitute-runner, build] @@ -173,154 +110,253 @@ jobs: pushd . cd tests/py cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ - popd - - tests-py-dynamo-serde: - name: Test dynamo export serde [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-serde - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . 
- cd tests/py - cd dynamo + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models.xml --ir dynamo models/test_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dyn_models.xml --ir dynamo models/test_dyn_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models_export.xml --ir dynamo models/test_models_export.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ popd - tests-py-torch-compile-be: - name: Test torch compile backend [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-torch-compile-be - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py - ../../../packaging/vc_env_helper.bat python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py - popd - tests-py-dynamo-core: - name: Test dynamo core [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-core - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . 
- cd tests/py - cd dynamo - ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ - ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ - popd + # tests-py-torchscript-fe: + # name: Test torchscript frontend [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-torchscript-fe + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/modules + # python hub.py + # popd + # pushd . + # cd tests/py/ts + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/ + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/ + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/ + # popd - tests-py-dynamo-cudagraphs: - name: Test dynamo cudagraphs [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-cudagraphs - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py - popd + # tests-py-dynamo-converters: + # name: Test dynamo converters [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-converters + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # nvidia-smi + # nvcc --version + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . 
+ # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ + # popd - tests-py-core: - name: Test core [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-core - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py/core - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . - popd + # tests-py-dynamo-fe: + # name: Test dynamo frontend [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-fe + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ + + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ + # popd + + # tests-py-dynamo-serde: + # name: Test dynamo export serde [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-serde + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py + # popd + + # tests-py-torch-compile-be: + # name: Test torch compile backend [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-torch-compile-be + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . 
+ # cd tests/py + # cd dynamo + # python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py + # ../../../packaging/vc_env_helper.bat python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py + # popd + + # tests-py-dynamo-core: + # name: Test dynamo core [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-core + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ + # ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ + # popd + + # tests-py-dynamo-cudagraphs: + # name: Test dynamo cudagraphs [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-cudagraphs + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py + # popd + + # tests-py-core: + # name: Test core [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-core + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py/core + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . 
+ # popd concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }} diff --git a/.github/workflows/build-test-windows_rtx.yml b/.github/workflows/build-test-windows_rtx.yml index 9ee768b964..bf6eaef765 100644 --- a/.github/workflows/build-test-windows_rtx.yml +++ b/.github/workflows/build-test-windows_rtx.yml @@ -87,34 +87,34 @@ jobs: use-rtx: true timeout: 120 - tests-py-dynamo-converters: - name: Test dynamo converters [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-converters - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ - popd + # tests-py-dynamo-converters: + # name: Test dynamo converters [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-converters + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ + # popd tests-py-dynamo-fe: name: Test dynamo frontend [Python] @@ -146,155 +146,155 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ popd - tests-py-dynamo-serde: - name: Test dynamo export serde [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-serde - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . 
- cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py - popd + # tests-py-dynamo-serde: + # name: Test dynamo export serde [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-serde + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py + # popd - tests-py-torch-compile-be: - name: Test torch compile backend [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-torch-compile-be - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py - ../../../packaging/vc_env_helper.bat python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py - popd + # tests-py-torch-compile-be: + # name: Test torch compile backend [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-torch-compile-be + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . 
+ # cd tests/py + # cd dynamo + # python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py + # ../../../packaging/vc_env_helper.bat python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py + # popd - tests-py-dynamo-core: - name: Test dynamo core [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-core - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ - ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ - popd + # tests-py-dynamo-core: + # name: Test dynamo core [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-core + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . 
+ # cd tests/py + # cd dynamo + # ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ + # ../../../packaging/vc_env_helper.bat python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ + # popd - tests-py-dynamo-cudagraphs: - name: Test dynamo cudagraphs [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-dynamo-cudagraphs - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py - cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py - popd + # tests-py-dynamo-cudagraphs: + # name: Test dynamo cudagraphs [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-dynamo-cudagraphs + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py + # cd dynamo + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py + # python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py + # popd - tests-py-core: - name: Test core [Python] - needs: [substitute-runner, build] - strategy: - fail-fast: false - matrix: - include: - - repository: pytorch/tensorrt - package-name: torch_tensorrt - uses: ./.github/workflows/windows-test.yml - with: - job-name: tests-py-core - repository: ${{ matrix.repository }} - ref: "" - test-infra-repository: pytorch/test-infra - test-infra-ref: main - build-matrix: ${{ needs.substitute-runner.outputs.matrix }} - pre-script: packaging/driver_upgrade.bat - use-rtx: true - script: | - set -euo pipefail - export USE_HOST_DEPS=1 - export CI_BUILD=1 - pushd . - cd tests/py/core - python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . 
- popd + # tests-py-core: + # name: Test core [Python] + # needs: [substitute-runner, build] + # strategy: + # fail-fast: false + # matrix: + # include: + # - repository: pytorch/tensorrt + # package-name: torch_tensorrt + # uses: ./.github/workflows/windows-test.yml + # with: + # job-name: tests-py-core + # repository: ${{ matrix.repository }} + # ref: "" + # test-infra-repository: pytorch/test-infra + # test-infra-ref: main + # build-matrix: ${{ needs.substitute-runner.outputs.matrix }} + # pre-script: packaging/driver_upgrade.bat + # use-rtx: true + # script: | + # set -euo pipefail + # export USE_HOST_DEPS=1 + # export CI_BUILD=1 + # pushd . + # cd tests/py/core + # python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . + # popd concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-tensorrt-rtx-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }} From 49d53cf55cca0a920270fb365f1f11ad87d2c88d Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Mon, 29 Sep 2025 16:58:23 -0700 Subject: [PATCH 2/5] test --- .github/workflows/build-test-windows.yml | 16 +- tests/py/dynamo/models/test_models.py | 820 +++++++++++------------ 2 files changed, 423 insertions(+), 413 deletions(-) diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index 3d637a1826..70d66eab52 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -104,18 +104,28 @@ jobs: build-matrix: ${{ needs.substitute-runner.outputs.matrix }} pre-script: packaging/driver_upgrade.bat script: | - set -euo pipefail + + #set -euo pipefail + set -x export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . 
cd tests/py cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models.xml --ir dynamo models/test_models.py + + gdb --version + nvidia-smi + nvcc --version + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py + echo "test_dtype_support.xml passed" + + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models.xml --ir dynamo models/test_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dyn_models.xml --ir dynamo models/test_dyn_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models_export.xml --ir dynamo models/test_models_export.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ popd diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 3d5e2190d2..33f8d8d458 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -54,413 +54,413 @@ def test_resnet18(ir): torch._dynamo.reset() -@pytest.mark.unit -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_resnet18_cpu_offload(ir): - model = models.resnet18(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - "offload_module_to_cpu": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - if ir == "dynamo": - assertions.assertTrue( - get_model_device(model).type == "cpu", - msg="Model should be offloaded to CPU", - ) - model.cuda() - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), "torchvision not installed" -) -def test_resnet18_torch_exec_ops(ir): - model = models.resnet18(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - min_shape=(1, 3, 224, 224), - opt_shape=(8, 3, 224, 224), - max_shape=(16, 3, 224, 224), - dtype=torch.float32, - ) - ], - "ir": ir, - "enabled_precisions": {torch.float32, torch.float16}, - "min_block_size": 1, - "output_format": "exported_program", - "cache_built_engines": True, - "reuse_cached_engines": True, - "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_mobilenet_v2(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16") - - model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - - compile_spec = { - "inputs": [ - torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) - ], - "device": torchtrt.Device("cuda:0"), - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - pyt_output = model(input) - trt_output = trt_mod(input) - assert pyt_output.dtype == trt_output.dtype - assert pyt_output.dtype == dtype - cos_sim = cosine_similarity(pyt_output, trt_output) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), - "timm or torchvision not installed", -) -def test_efficientnet_b0(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") - - model = ( - timm.create_model("efficientnet_b0", pretrained=True) - .eval() - .to("cuda") - .to(dtype) - ) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - - compile_spec = { - "inputs": [ - torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) - ], - "device": torchtrt.Device("cuda:0"), - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - pyt_output = model(input) - trt_output = trt_mod(input) - assert pyt_output.dtype == trt_output.dtype - assert pyt_output.dtype == dtype - cos_sim = cosine_similarity(pyt_output, trt_output) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"EfficientNet-B0 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("transformers"), - "transformers is required to run this test", -) -def test_bert_base_uncased(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16") - - from transformers import BertModel - - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "truncate_double": True, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 15, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - trt_mod = torchtrt.compile(model, **compile_spec) - - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - for key in model_outputs.keys(): - out, trt_out = model_outputs[key], trt_model_outputs[key] - assert out.dtype == trt_out.dtype - assert out.dtype == dtype - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_bert_base_uncased_cpu_offload(ir): - from transformers import BertModel - - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "truncate_double": True, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 15, - "cache_built_engines": False, - "reuse_cached_engines": False, - "offload_module_to_cpu": True, - } - trt_mod = torchtrt.compile(model, **compile_spec) - if ir == "dynamo": - assertions.assertTrue( - get_model_device(model).type == "cpu", - msg="Model should be offloaded to CPU", - ) - model.cuda() - - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - for key in model_outputs.keys(): - out, trt_out = model_outputs[key], trt_model_outputs[key] - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_resnet18_half(ir): - model = models.resnet18(pretrained=True).eval().to("cuda").half() - input = torch.randn((1, 3, 224, 224)).to("cuda").half() - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.half, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.half}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - torchtrt.ENABLED_FEATURES.tensorrt_rtx, - "bf16 is not supported for tensorrt_rtx", -) -def test_bf16_model(ir): - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) - self.relu = torch.nn.ReLU() - - def forward(self, x): - out = self.conv(x) - out = self.relu(out) - return out - - model = MyModule().eval().cuda().to(torch.bfloat16) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.bfloat16, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float32}, - "ir": ir, - "pass_through_build_failures": True, - "min_block_size": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"BF16 model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - torchtrt.ENABLED_FEATURES.tensorrt_rtx, - "bf16 is not supported for tensorrt_rtx", -) -def test_bf16_fallback_model(ir): - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) - self.relu = torch.nn.ReLU() - self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) - - def forward(self, x): - out = self.conv(x) - out = self.relu(out) - out = self.conv2(out) - return out - - model = MyModule().eval().cuda().to(torch.bfloat16) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.bfloat16, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float32}, - "ir": ir, - "pass_through_build_failures": True, - "min_block_size": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - "torch_executed_ops": {"torch.ops.aten.relu.default"}, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"BF16 fallback model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() +# @pytest.mark.unit +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_resnet18_cpu_offload(ir): +# model = models.resnet18(pretrained=True).eval().to("cuda") +# input = torch.randn((1, 3, 224, 224)).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.float, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float}, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "offload_module_to_cpu": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# if ir == "dynamo": +# assertions.assertTrue( +# get_model_device(model).type == "cpu", +# msg="Model should be offloaded to CPU", +# ) +# model.cuda() +# cos_sim = cosine_similarity(model(input), trt_mod(input)) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), "torchvision not installed" +# ) +# def test_resnet18_torch_exec_ops(ir): +# model = models.resnet18(pretrained=True).eval().to("cuda") +# input = torch.randn((1, 3, 224, 224)).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# min_shape=(1, 3, 224, 224), +# opt_shape=(8, 3, 224, 224), +# max_shape=(16, 3, 224, 224), +# dtype=torch.float32, +# ) +# ], +# "ir": ir, +# "enabled_precisions": {torch.float32, torch.float16}, +# "min_block_size": 1, +# "output_format": "exported_program", +# "cache_built_engines": True, +# "reuse_cached_engines": True, +# "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_mobilenet_v2(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16") + +# model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) +# ], +# "device": torchtrt.Device("cuda:0"), +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 10, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# pyt_output = model(input) +# trt_output = trt_mod(input) +# assert pyt_output.dtype == trt_output.dtype +# assert pyt_output.dtype == dtype +# cos_sim = cosine_similarity(pyt_output, trt_output) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Mobilenet v2 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), +# "timm or torchvision not installed", +# ) +# def test_efficientnet_b0(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") + +# model = ( +# timm.create_model("efficientnet_b0", pretrained=True) +# .eval() +# .to("cuda") +# .to(dtype) +# ) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) +# ], +# "device": torchtrt.Device("cuda:0"), +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 10, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# pyt_output = model(input) +# trt_output = trt_mod(input) +# assert pyt_output.dtype == trt_output.dtype +# assert pyt_output.dtype == dtype +# cos_sim = cosine_similarity(pyt_output, trt_output) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("transformers"), +# "transformers is required to run this test", +# ) +# def test_bert_base_uncased(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16") + +# from transformers import BertModel + +# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) +# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") +# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# ], +# "device": torchtrt.Device("cuda:0"), +# "truncate_double": True, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 15, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } +# trt_mod = torchtrt.compile(model, **compile_spec) + +# model_outputs = model(input, input2) +# trt_model_outputs = trt_mod(input, input2) +# for key in model_outputs.keys(): +# out, trt_out = model_outputs[key], trt_model_outputs[key] +# assert out.dtype == trt_out.dtype +# assert out.dtype == dtype +# cos_sim = cosine_similarity(out, trt_out) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# def test_bert_base_uncased_cpu_offload(ir): +# from transformers import BertModel + +# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() +# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") +# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float}, +# "truncate_double": True, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 15, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "offload_module_to_cpu": True, +# } +# trt_mod = torchtrt.compile(model, **compile_spec) +# if ir == "dynamo": +# assertions.assertTrue( +# get_model_device(model).type == "cpu", +# msg="Model should be offloaded to CPU", +# ) +# model.cuda() + +# model_outputs = model(input, input2) +# trt_model_outputs = trt_mod(input, input2) +# for key in model_outputs.keys(): +# out, trt_out = model_outputs[key], trt_model_outputs[key] +# cos_sim = cosine_similarity(out, trt_out) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_resnet18_half(ir): +# model = models.resnet18(pretrained=True).eval().to("cuda").half() +# input = torch.randn((1, 3, 224, 224)).to("cuda").half() + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.half, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.half}, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Resnet18 Half TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# torchtrt.ENABLED_FEATURES.tensorrt_rtx, +# "bf16 is not supported for tensorrt_rtx", +# ) +# def test_bf16_model(ir): +# class MyModule(torch.nn.Module): +# def __init__(self): +# super().__init__() +# self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) +# self.relu = torch.nn.ReLU() + +# def forward(self, x): +# out = self.conv(x) +# out = self.relu(out) +# return out + +# model = MyModule().eval().cuda().to(torch.bfloat16) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float32}, +# "ir": ir, +# "pass_through_build_failures": True, +# "min_block_size": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) + +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# torchtrt.ENABLED_FEATURES.tensorrt_rtx, +# "bf16 is not supported for tensorrt_rtx", +# ) +# def test_bf16_fallback_model(ir): +# class MyModule(torch.nn.Module): +# def __init__(self): +# super().__init__() +# self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) +# self.relu = torch.nn.ReLU() +# self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) + +# def forward(self, x): +# out = self.conv(x) +# out = self.relu(out) +# out = self.conv2(out) +# return out + +# model = MyModule().eval().cuda().to(torch.bfloat16) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float32}, +# "ir": ir, +# "pass_through_build_failures": True, +# "min_block_size": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# "torch_executed_ops": {"torch.ops.aten.relu.default"}, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) + +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"BF16 fallback model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() From 19f428e4a6926de04f3bb20f336c83fc7624e49b Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Tue, 30 Sep 2025 08:19:22 -0700 Subject: [PATCH 3/5] debug --- .github/workflows/build-test-windows.yml | 10 +- tests/py/dynamo/models/test_models.py | 787 +++++++++++------------ 2 files changed, 385 insertions(+), 412 deletions(-) diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index 70d66eab52..ace1a595be 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -119,7 +119,15 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py echo "test_dtype_support.xml passed" - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models.xml --ir dynamo models/test_models.py + python -m pytest -k "lan_1" -rP models/test_models.py + python -m pytest -k "lan_2" -rP models/test_models.py + python -m pytest -k "lan_3" -rP models/test_models.py + python -m pytest -k "lan_4" -rP models/test_models.py + python -m pytest -k "lan_5" -rP models/test_models.py + python -m pytest -k "lan_6" -rP models/test_models.py + python -m pytest -k "lan_7" -rP models/test_models.py + python -m pytest -k "lan_8" -rP models/test_models.py + python -m pytest -k "lan_9" -rP models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dyn_models.xml --ir dynamo models/test_dyn_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models_export.xml --ir dynamo models/test_models_export.py diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 33f8d8d458..1ff465dc27 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -24,7 +24,7 @@ not importlib.util.find_spec("torchvision"), "torchvision is not installed", ) -def test_resnet18(ir): +def test_lan_1_resnet18_cpu_offload(ir): model = models.resnet18(pretrained=True).eval().to("cuda") input = torch.randn((1, 3, 224, 224)).to("cuda") @@ -41,9 +41,16 @@ def test_resnet18(ir): "optimization_level": 1, "cache_built_engines": False, "reuse_cached_engines": False, + "offload_module_to_cpu": True, } trt_mod = torchtrt.compile(model, **compile_spec) + if ir == "dynamo": + assertions.assertTrue( + get_model_device(model).type == "cpu", + msg="Model should be offloaded to CPU", + ) + model.cuda() cos_sim = cosine_similarity(model(input), trt_mod(input)) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, @@ -54,413 +61,371 @@ def test_resnet18(ir): torch._dynamo.reset() -# @pytest.mark.unit -# @unittest.skipIf( -# not importlib.util.find_spec("torchvision"), -# "torchvision is not installed", -# ) -# def test_resnet18_cpu_offload(ir): -# model = models.resnet18(pretrained=True).eval().to("cuda") -# input = torch.randn((1, 3, 224, 224)).to("cuda") - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, dtype=torch.float, format=torch.contiguous_format -# ) -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.float}, -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "offload_module_to_cpu": True, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# if ir == "dynamo": -# assertions.assertTrue( -# 
get_model_device(model).type == "cpu", -# msg="Model should be offloaded to CPU", -# ) -# model.cuda() -# cos_sim = cosine_similarity(model(input), trt_mod(input)) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @unittest.skipIf( -# not importlib.util.find_spec("torchvision"), "torchvision not installed" -# ) -# def test_resnet18_torch_exec_ops(ir): -# model = models.resnet18(pretrained=True).eval().to("cuda") -# input = torch.randn((1, 3, 224, 224)).to("cuda") - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# min_shape=(1, 3, 224, 224), -# opt_shape=(8, 3, 224, 224), -# max_shape=(16, 3, 224, 224), -# dtype=torch.float32, -# ) -# ], -# "ir": ir, -# "enabled_precisions": {torch.float32, torch.float16}, -# "min_block_size": 1, -# "output_format": "exported_program", -# "cache_built_engines": True, -# "reuse_cached_engines": True, -# "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# cos_sim = cosine_similarity(model(input), trt_mod(input)) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -# @unittest.skipIf( -# not importlib.util.find_spec("torchvision"), -# "torchvision is not installed", -# ) -# def test_mobilenet_v2(ir, dtype): -# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: -# pytest.skip("TensorRT-RTX does not support bfloat16") - -# model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) -# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - -# compile_spec = { -# "inputs": [ -# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) -# ], -# "device": torchtrt.Device("cuda:0"), -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "min_block_size": 10, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# pyt_output = model(input) -# trt_output = trt_mod(input) -# assert pyt_output.dtype == trt_output.dtype -# assert pyt_output.dtype == dtype -# cos_sim = cosine_similarity(pyt_output, trt_output) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"Mobilenet v2 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -# @unittest.skipIf( -# not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), -# "timm or torchvision not installed", -# ) -# def test_efficientnet_b0(ir, dtype): -# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: -# pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") - -# model = ( -# timm.create_model("efficientnet_b0", pretrained=True) -# .eval() -# .to("cuda") -# .to(dtype) -# ) -# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - -# compile_spec = { -# "inputs": [ -# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) -# ], -# "device": torchtrt.Device("cuda:0"), -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "min_block_size": 10, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# pyt_output = model(input) -# trt_output = trt_mod(input) -# assert pyt_output.dtype == trt_output.dtype -# assert pyt_output.dtype == dtype -# cos_sim = cosine_similarity(pyt_output, trt_output) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -# @unittest.skipIf( -# not importlib.util.find_spec("transformers"), -# "transformers is required to run this test", -# ) -# def test_bert_base_uncased(ir, dtype): -# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: -# pytest.skip("TensorRT-RTX does not support bfloat16") - -# from transformers import BertModel - -# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) -# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") -# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, -# dtype=input.dtype, -# format=torch.contiguous_format, -# ), -# torchtrt.Input( -# input.shape, -# dtype=input.dtype, -# format=torch.contiguous_format, -# ), -# ], -# "device": torchtrt.Device("cuda:0"), -# "truncate_double": True, -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "min_block_size": 15, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# } -# trt_mod = torchtrt.compile(model, **compile_spec) - -# model_outputs = model(input, input2) -# trt_model_outputs = trt_mod(input, input2) -# for key in model_outputs.keys(): -# out, trt_out = model_outputs[key], trt_model_outputs[key] -# assert out.dtype == trt_out.dtype -# assert out.dtype == dtype -# cos_sim = cosine_similarity(out, trt_out) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# def test_bert_base_uncased_cpu_offload(ir): -# from transformers import BertModel - -# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() -# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") -# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, -# dtype=input.dtype, -# format=torch.contiguous_format, -# ), -# torchtrt.Input( -# input.shape, -# dtype=input.dtype, -# format=torch.contiguous_format, -# ), -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.float}, -# "truncate_double": True, -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "min_block_size": 15, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "offload_module_to_cpu": True, -# } -# trt_mod = torchtrt.compile(model, **compile_spec) -# if ir == "dynamo": -# assertions.assertTrue( -# get_model_device(model).type == "cpu", -# msg="Model should be offloaded to CPU", -# ) -# model.cuda() - -# model_outputs = model(input, input2) -# trt_model_outputs = trt_mod(input, input2) -# for key in model_outputs.keys(): -# out, trt_out = model_outputs[key], trt_model_outputs[key] -# cos_sim = cosine_similarity(out, trt_out) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @unittest.skipIf( -# not importlib.util.find_spec("torchvision"), -# "torchvision is not installed", -# ) -# def test_resnet18_half(ir): -# model = models.resnet18(pretrained=True).eval().to("cuda").half() -# input = torch.randn((1, 3, 224, 224)).to("cuda").half() - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, dtype=torch.half, format=torch.contiguous_format -# ) -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.half}, -# "ir": ir, -# "pass_through_build_failures": True, -# "optimization_level": 1, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# cos_sim = cosine_similarity(model(input), trt_mod(input)) -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"Resnet18 Half TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @unittest.skipIf( -# torchtrt.ENABLED_FEATURES.tensorrt_rtx, -# "bf16 is not supported for tensorrt_rtx", -# ) -# def test_bf16_model(ir): -# class MyModule(torch.nn.Module): -# def __init__(self): -# super().__init__() -# self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) -# self.relu = torch.nn.ReLU() - -# def forward(self, x): -# out = self.conv(x) -# out = self.relu(out) -# return out - -# model = MyModule().eval().cuda().to(torch.bfloat16) -# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format -# ) -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.float32}, -# "ir": ir, -# "pass_through_build_failures": True, -# "min_block_size": 1, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# cos_sim = cosine_similarity(model(input), trt_mod(input)) - -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() - - -# @pytest.mark.unit -# @unittest.skipIf( -# torchtrt.ENABLED_FEATURES.tensorrt_rtx, -# "bf16 is not supported for tensorrt_rtx", -# ) -# def test_bf16_fallback_model(ir): -# class MyModule(torch.nn.Module): -# def __init__(self): -# super().__init__() -# self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) -# self.relu = torch.nn.ReLU() -# self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) - -# def forward(self, x): -# out = self.conv(x) -# out = self.relu(out) -# out = self.conv2(out) -# return out - -# model = MyModule().eval().cuda().to(torch.bfloat16) -# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - -# compile_spec = { -# "inputs": [ -# torchtrt.Input( -# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format -# ) -# ], -# "device": torchtrt.Device("cuda:0"), -# "enabled_precisions": {torch.float32}, -# "ir": ir, -# "pass_through_build_failures": True, -# "min_block_size": 1, -# "cache_built_engines": False, -# "reuse_cached_engines": False, -# "use_explicit_typing": True, -# "torch_executed_ops": {"torch.ops.aten.relu.default"}, -# } - -# trt_mod = torchtrt.compile(model, **compile_spec) -# cos_sim = cosine_similarity(model(input), trt_mod(input)) - -# assertions.assertTrue( -# cos_sim > COSINE_THRESHOLD, -# msg=f"BF16 fallback model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", -# ) - -# # Clean up model env -# torch._dynamo.reset() +@unittest.skipIf( + not importlib.util.find_spec("torchvision"), "torchvision not installed" +) +def test_lan_2_resnet18_torch_exec_ops(ir): + model = models.resnet18(pretrained=True).eval().to("cuda") + input = torch.randn((1, 3, 224, 224)).to("cuda") + + compile_spec = { + "inputs": [ + torchtrt.Input( + min_shape=(1, 3, 224, 224), + opt_shape=(8, 3, 224, 224), + max_shape=(16, 3, 224, 224), + dtype=torch.float32, + ) + ], + "ir": ir, + "enabled_precisions": {torch.float32, torch.float16}, + "min_block_size": 1, + "output_format": "exported_program", + "cache_built_engines": True, + "reuse_cached_engines": True, + "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@unittest.skipIf( + not importlib.util.find_spec("torchvision"), + "torchvision is not installed", +) +def test_lan_3_mobilenet_v2(ir, dtype): + if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: + pytest.skip("TensorRT-RTX does not support bfloat16") + + model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + + compile_spec = { + "inputs": [ + torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) + ], + "device": torchtrt.Device("cuda:0"), + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 10, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + pyt_output = model(input) + trt_output = trt_mod(input) + assert pyt_output.dtype == trt_output.dtype + assert pyt_output.dtype == dtype + cos_sim = cosine_similarity(pyt_output, trt_output) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"Mobilenet v2 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@unittest.skipIf( + not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), + "timm or torchvision not installed", +) +def test_lan_4_efficientnet_b0(ir, dtype): + if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: + pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") + + model = ( + timm.create_model("efficientnet_b0", pretrained=True) + .eval() + .to("cuda") + .to(dtype) + ) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + + compile_spec = { + "inputs": [ + torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) + ], + "device": torchtrt.Device("cuda:0"), + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 10, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + pyt_output = model(input) + trt_output = trt_mod(input) + assert pyt_output.dtype == trt_output.dtype + assert pyt_output.dtype == dtype + cos_sim = cosine_similarity(pyt_output, trt_output) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@unittest.skipIf( + not importlib.util.find_spec("transformers"), + "transformers is required to run this test", +) +def test_lan_5_bert_base_uncased(ir, dtype): + if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: + pytest.skip("TensorRT-RTX does not support bfloat16") + + from transformers import BertModel + + model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) + input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, + dtype=input.dtype, + format=torch.contiguous_format, + ), + torchtrt.Input( + input.shape, + dtype=input.dtype, + format=torch.contiguous_format, + ), + ], + "device": torchtrt.Device("cuda:0"), + "truncate_double": True, + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 15, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + trt_mod = torchtrt.compile(model, **compile_spec) + + model_outputs = model(input, input2) + trt_model_outputs = trt_mod(input, input2) + for key in model_outputs.keys(): + out, trt_out = model_outputs[key], trt_model_outputs[key] + assert out.dtype == trt_out.dtype + assert out.dtype == dtype + cos_sim = cosine_similarity(out, trt_out) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +def test_lan_6_bert_base_uncased_cpu_offload(ir): + from transformers import BertModel + + model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() + input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, + dtype=input.dtype, + format=torch.contiguous_format, + ), + torchtrt.Input( + input.shape, + dtype=input.dtype, + format=torch.contiguous_format, + ), + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float}, + "truncate_double": True, + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 15, + "cache_built_engines": False, + "reuse_cached_engines": False, + "offload_module_to_cpu": True, + } + trt_mod = torchtrt.compile(model, **compile_spec) + if ir == "dynamo": + assertions.assertTrue( + get_model_device(model).type == "cpu", + msg="Model should be offloaded to CPU", + ) + model.cuda() + + model_outputs = model(input, input2) + trt_model_outputs = trt_mod(input, input2) + for key in model_outputs.keys(): + out, trt_out = model_outputs[key], trt_model_outputs[key] + cos_sim = cosine_similarity(out, trt_out) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@unittest.skipIf( + not importlib.util.find_spec("torchvision"), + "torchvision is not installed", +) +def test_lan_7_resnet18_half(ir): + model = models.resnet18(pretrained=True).eval().to("cuda").half() + input = torch.randn((1, 3, 224, 224)).to("cuda").half() + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.half, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.half}, + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"Resnet18 Half TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@unittest.skipIf( + torchtrt.ENABLED_FEATURES.tensorrt_rtx, + "bf16 is not supported for tensorrt_rtx", +) +def test_lan_8_bf16_model(ir): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) + self.relu = torch.nn.ReLU() + + def forward(self, x): + out = self.conv(x) + out = self.relu(out) + return out + + model = MyModule().eval().cuda().to(torch.bfloat16) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float32}, + "ir": ir, + "pass_through_build_failures": True, + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +@unittest.skipIf( + torchtrt.ENABLED_FEATURES.tensorrt_rtx, + "bf16 is not supported for tensorrt_rtx", +) +def test_lan_9_bf16_fallback_model(ir): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) + self.relu = torch.nn.ReLU() + self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) + + def forward(self, x): + out = self.conv(x) + out = self.relu(out) + out = self.conv2(out) + return out + + model = MyModule().eval().cuda().to(torch.bfloat16) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float32}, + "ir": ir, + "pass_through_build_failures": True, + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + "torch_executed_ops": {"torch.ops.aten.relu.default"}, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"BF16 fallback model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() From 108f8231871836578cd2c853a17f5a4b7be3fe41 Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Tue, 30 Sep 2025 11:34:21 -0700 Subject: [PATCH 4/5] test --- .github/workflows/build-test-windows.yml | 19 +- .github/workflows/build-test-windows_rtx.yml | 2 +- py/torch_tensorrt/_compile.py | 2 + py/torch_tensorrt/dynamo/_compiler.py | 6 + tests/py/dynamo/models/test_models.py | 685 ++++++++++--------- 5 files changed, 369 insertions(+), 345 deletions(-) diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index ace1a595be..bed72e7274 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -116,25 +116,12 @@ jobs: gdb --version nvidia-smi nvcc --version - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dtype_support.xml --ir torch_compile models/test_dtype_support.py - echo "test_dtype_support.xml passed" - python -m pytest -k "lan_1" -rP models/test_models.py + echo "lan added finished lan_1" python -m pytest -k "lan_2" -rP models/test_models.py + echo "lan added finished lan_2" python -m pytest -k "lan_3" -rP models/test_models.py - python -m pytest -k "lan_4" -rP models/test_models.py - python -m pytest -k "lan_5" -rP models/test_models.py - python -m pytest -k "lan_6" -rP models/test_models.py - python -m pytest -k "lan_7" -rP models/test_models.py - python -m pytest -k "lan_8" -rP models/test_models.py - python -m pytest -k "lan_9" -rP models/test_models.py - - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_dyn_models.xml --ir dynamo models/test_dyn_models.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/test_models_export.xml --ir dynamo models/test_models_export.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py - - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/ + echo "lan added finished lan_3" popd diff --git a/.github/workflows/build-test-windows_rtx.yml b/.github/workflows/build-test-windows_rtx.yml index bf6eaef765..4bd08deafd 100644 --- a/.github/workflows/build-test-windows_rtx.yml +++ b/.github/workflows/build-test-windows_rtx.yml @@ -1,7 +1,7 @@ name: RTX - Build and test Windows wheels on: - pull_request: + #pull_request: push: branches: - main diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 8d8c51cbfc..c26fff2cc0 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -227,6 +227,7 @@ def compile( module_type = _parse_module_type(module) target_ir = _get_target_fe(module_type, ir) + print(f"lan added {target_ir=}") if target_ir == _IRType.ts: ts_mod = module if module_type == _ModuleType.nn: @@ -307,6 +308,7 @@ def _fx_input_interface( exp_program = dynamo_trace( module, torchtrt_arg_inputs, kwarg_inputs=torchtrt_kwarg_inputs, **kwargs ) + print(f"lan added {str(exp_program.graph)=}") trt_graph_module = dynamo_compile( exp_program, arg_inputs=torchtrt_arg_inputs, diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 0dc4654db0..a24aeea75f 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -5,6 +5,7 @@ import os import platform import warnings 
+from functools import total_ordering from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union import torch @@ -809,6 +810,7 @@ def preserve_module_specs( # Partition module into components that can be TRT-accelerated fast_partitioner_failed = False + print(f"lan added {str(gm.graph)=}") # If specified, try using the fast partitioner and fall back to the global one on failure if settings.use_fast_partitioner: try: @@ -847,10 +849,14 @@ def preserve_module_specs( dryrun_tracker.to_run_in_torch.extend(parse_non_trt_nodes(partitioned_module)) submodule_node_dict = {} + print(f"lan added {list(partitioned_module.graph.nodes)=}") + print(f"lan added {total_ops=} {num_supported_ops=}") for node in partitioned_module.graph.nodes: if "_run_on_acc" not in node.name: + print(f"lan added skipped node{node.name=}") continue submodule_node_dict[node.name] = node + print(f"lan added added submodule{node.name=}") preserve_module_specs(original_in_spec, original_out_spec, partitioned_module) # Store TRT replicas of Torch subgraphs diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index 1ff465dc27..dfd5b5aba5 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -19,52 +19,10 @@ import timm -@pytest.mark.unit -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_lan_1_resnet18_cpu_offload(ir): - model = models.resnet18(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - "offload_module_to_cpu": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - if ir == "dynamo": - assertions.assertTrue( - get_model_device(model).type == "cpu", - msg="Model should be offloaded to CPU", - ) - model.cuda() - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - @unittest.skipIf( not importlib.util.find_spec("torchvision"), "torchvision not installed" ) -def test_lan_2_resnet18_torch_exec_ops(ir): +def test_lan_1_resnet18_torch_exec_ops(ir): model = models.resnet18(pretrained=True).eval().to("cuda") input = torch.randn((1, 3, 224, 224)).to("cuda") @@ -97,335 +55,406 @@ def test_lan_2_resnet18_torch_exec_ops(ir): torch._dynamo.reset() -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", -) -def test_lan_3_mobilenet_v2(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16") - - model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - - compile_spec = { - "inputs": [ - torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) - ], - "device": torchtrt.Device("cuda:0"), - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - pyt_output = model(input) - trt_output = trt_mod(input) - assert pyt_output.dtype == trt_output.dtype - assert pyt_output.dtype == dtype - cos_sim = cosine_similarity(pyt_output, trt_output) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), - "timm or torchvision not installed", -) -def test_lan_4_efficientnet_b0(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") - - model = ( - timm.create_model("efficientnet_b0", pretrained=True) - .eval() - .to("cuda") - .to(dtype) - ) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) - - compile_spec = { - "inputs": [ - torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) - ], - "device": torchtrt.Device("cuda:0"), - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - pyt_output = model(input) - trt_output = trt_mod(input) - assert pyt_output.dtype == trt_output.dtype - assert pyt_output.dtype == dtype - cos_sim = cosine_similarity(pyt_output, trt_output) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"EfficientNet-B0 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) -@unittest.skipIf( - not importlib.util.find_spec("transformers"), - "transformers is required to run this test", -) -def test_lan_5_bert_base_uncased(ir, dtype): - if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: - pytest.skip("TensorRT-RTX does not support bfloat16") - - from transformers import BertModel - - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "truncate_double": True, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 15, - "cache_built_engines": False, - "reuse_cached_engines": False, - "use_explicit_typing": True, - } - trt_mod = torchtrt.compile(model, **compile_spec) - - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - for key in model_outputs.keys(): - out, trt_out = model_outputs[key], trt_model_outputs[key] - assert out.dtype == trt_out.dtype - assert out.dtype == dtype - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_lan_6_bert_base_uncased_cpu_offload(ir): - from transformers import BertModel - - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "truncate_double": True, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 15, - "cache_built_engines": False, - "reuse_cached_engines": False, - "offload_module_to_cpu": True, - } - trt_mod = torchtrt.compile(model, **compile_spec) - if ir == "dynamo": - assertions.assertTrue( - get_model_device(model).type == "cpu", - msg="Model should be offloaded to CPU", - ) - model.cuda() - - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - for key in model_outputs.keys(): - out, trt_out = model_outputs[key], trt_model_outputs[key] - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit @unittest.skipIf( - not importlib.util.find_spec("torchvision"), - "torchvision is not installed", + not importlib.util.find_spec("torchvision"), "torchvision not installed" ) -def test_lan_7_resnet18_half(ir): - model = models.resnet18(pretrained=True).eval().to("cuda").half() - input = torch.randn((1, 3, 224, 224)).to("cuda").half() +def test_lan_2_resnet18_torch_exec_ops(ir): + model = models.resnet18(pretrained=True).eval().to("cuda") + input = torch.randn((1, 3, 224, 224)).to("cuda") compile_spec = { "inputs": [ torchtrt.Input( - input.shape, dtype=torch.half, format=torch.contiguous_format + min_shape=(1, 3, 224, 224), + opt_shape=(8, 3, 224, 224), + max_shape=(16, 3, 224, 224), + dtype=torch.float32, ) ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.half}, "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - torchtrt.ENABLED_FEATURES.tensorrt_rtx, - "bf16 is not supported for tensorrt_rtx", -) -def test_lan_8_bf16_model(ir): - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) - self.relu = torch.nn.ReLU() - - def forward(self, x): - out = self.conv(x) - out = self.relu(out) - return out - - model = MyModule().eval().cuda().to(torch.bfloat16) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.bfloat16, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), "enabled_precisions": {torch.float32}, - "ir": ir, - "pass_through_build_failures": True, "min_block_size": 1, + "output_format": "exported_program", "cache_built_engines": False, "reuse_cached_engines": False, - "use_explicit_typing": True, + "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, } trt_mod = torchtrt.compile(model, **compile_spec) cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) # Clean up model env torch._dynamo.reset() -@pytest.mark.unit @unittest.skipIf( - torchtrt.ENABLED_FEATURES.tensorrt_rtx, - "bf16 is not supported for tensorrt_rtx", + not importlib.util.find_spec("torchvision"), "torchvision not installed" ) -def test_lan_9_bf16_fallback_model(ir): - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) - self.relu = torch.nn.ReLU() - self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) - - def forward(self, x): - out = self.conv(x) - out = self.relu(out) - out = self.conv2(out) - return out - - model = MyModule().eval().cuda().to(torch.bfloat16) - input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) +def test_lan_3_resnet18_torch_exec_ops(ir): + model = models.resnet18(pretrained=True).eval().to("cuda") + input = torch.randn((1, 3, 224, 224)).to("cuda") compile_spec = { "inputs": [ torchtrt.Input( - input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + min_shape=(1, 3, 224, 224), + opt_shape=(8, 3, 224, 224), + max_shape=(16, 3, 224, 224), + dtype=torch.float32, ) ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float32}, "ir": ir, - "pass_through_build_failures": True, + "enabled_precisions": {torch.float32}, "min_block_size": 1, "cache_built_engines": False, "reuse_cached_engines": False, - "use_explicit_typing": True, - "torch_executed_ops": {"torch.ops.aten.relu.default"}, + "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, } trt_mod = torchtrt.compile(model, **compile_spec) cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( cos_sim > COSINE_THRESHOLD, - msg=f"BF16 fallback model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) # Clean up model env torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_lan_3_mobilenet_v2(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16") + +# model = models.mobilenet_v2(pretrained=True).eval().to("cuda").to(dtype) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) +# ], +# "device": torchtrt.Device("cuda:0"), +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 10, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# pyt_output = model(input) +# trt_output = trt_mod(input) +# assert pyt_output.dtype == trt_output.dtype +# assert pyt_output.dtype == dtype +# cos_sim = cosine_similarity(pyt_output, trt_output) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Mobilenet v2 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("timm") or not importlib.util.find_spec("torchvision"), +# "timm or torchvision not installed", +# ) +# def test_lan_4_efficientnet_b0(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16, skipping test") + +# model = ( +# timm.create_model("efficientnet_b0", pretrained=True) +# .eval() +# .to("cuda") +# .to(dtype) +# ) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(dtype) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input(input.shape, dtype=dtype, format=torch.contiguous_format) +# ], +# "device": torchtrt.Device("cuda:0"), +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 10, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# pyt_output = model(input) +# trt_output = trt_mod(input) +# assert pyt_output.dtype == trt_output.dtype +# assert pyt_output.dtype == dtype +# cos_sim = cosine_similarity(pyt_output, trt_output) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +# @unittest.skipIf( +# not importlib.util.find_spec("transformers"), +# "transformers is required to run this test", +# ) +# def test_lan_5_bert_base_uncased(ir, dtype): +# if torchtrt.ENABLED_FEATURES.tensorrt_rtx and dtype == torch.bfloat16: +# pytest.skip("TensorRT-RTX does not support bfloat16") + +# from transformers import BertModel + +# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval().to(dtype) +# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") +# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# ], +# "device": torchtrt.Device("cuda:0"), +# "truncate_double": True, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 15, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } +# trt_mod = torchtrt.compile(model, **compile_spec) + +# model_outputs = model(input, input2) +# trt_model_outputs = trt_mod(input, input2) +# for key in model_outputs.keys(): +# out, trt_out = model_outputs[key], trt_model_outputs[key] +# assert out.dtype == trt_out.dtype +# assert out.dtype == dtype +# cos_sim = cosine_similarity(out, trt_out) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# def test_lan_6_bert_base_uncased_cpu_offload(ir): +# from transformers import BertModel + +# model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() +# input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") +# input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# torchtrt.Input( +# input.shape, +# dtype=input.dtype, +# format=torch.contiguous_format, +# ), +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float}, +# "truncate_double": True, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "min_block_size": 15, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "offload_module_to_cpu": True, +# } +# trt_mod = torchtrt.compile(model, **compile_spec) +# if ir == "dynamo": +# assertions.assertTrue( +# get_model_device(model).type == "cpu", +# msg="Model should be offloaded to CPU", +# ) +# model.cuda() + +# model_outputs = model(input, input2) +# trt_model_outputs = trt_mod(input, input2) +# for key in model_outputs.keys(): +# out, trt_out = model_outputs[key], trt_model_outputs[key] +# cos_sim = cosine_similarity(out, trt_out) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# not importlib.util.find_spec("torchvision"), +# "torchvision is not installed", +# ) +# def test_lan_7_resnet18_half(ir): +# model = models.resnet18(pretrained=True).eval().to("cuda").half() +# input = torch.randn((1, 3, 224, 224)).to("cuda").half() + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.half, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.half}, +# "ir": ir, +# "pass_through_build_failures": True, +# "optimization_level": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"Resnet18 Half TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# torchtrt.ENABLED_FEATURES.tensorrt_rtx, +# "bf16 is not supported for tensorrt_rtx", +# ) +# def test_lan_8_bf16_model(ir): +# class MyModule(torch.nn.Module): +# def __init__(self): +# super().__init__() +# self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) +# self.relu = torch.nn.ReLU() + +# def forward(self, x): +# out = self.conv(x) +# out = self.relu(out) +# return out + +# model = MyModule().eval().cuda().to(torch.bfloat16) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float32}, +# "ir": ir, +# "pass_through_build_failures": True, +# "min_block_size": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) + +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"BF16 model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() + + +# @pytest.mark.unit +# @unittest.skipIf( +# torchtrt.ENABLED_FEATURES.tensorrt_rtx, +# "bf16 is not supported for tensorrt_rtx", +# ) +# def test_lan_9_bf16_fallback_model(ir): +# class MyModule(torch.nn.Module): +# def __init__(self): +# super().__init__() +# self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) +# self.relu = torch.nn.ReLU() +# self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) + +# def forward(self, x): +# out = self.conv(x) +# out = self.relu(out) +# out = self.conv2(out) +# return out + +# model = MyModule().eval().cuda().to(torch.bfloat16) +# input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + +# compile_spec = { +# "inputs": [ +# torchtrt.Input( +# input.shape, dtype=torch.bfloat16, format=torch.contiguous_format +# ) +# ], +# "device": torchtrt.Device("cuda:0"), +# "enabled_precisions": {torch.float32}, +# "ir": ir, +# "pass_through_build_failures": True, +# "min_block_size": 1, +# "cache_built_engines": False, +# "reuse_cached_engines": False, +# "use_explicit_typing": True, +# "torch_executed_ops": {"torch.ops.aten.relu.default"}, +# } + +# trt_mod = torchtrt.compile(model, **compile_spec) +# cos_sim = cosine_similarity(model(input), trt_mod(input)) + +# assertions.assertTrue( +# cos_sim > COSINE_THRESHOLD, +# msg=f"BF16 fallback model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", +# ) + +# # Clean up model env +# torch._dynamo.reset() From 662afdef21d00a5a960da399baf8efa5144f317b Mon Sep 17 00:00:00 2001 From: lanluo-nvidia Date: Tue, 30 Sep 2025 13:54:28 -0700 Subject: [PATCH 5/5] test --- tests/py/dynamo/models/test_models.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index dfd5b5aba5..96b9ac9002 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -36,7 +36,7 @@ def test_lan_1_resnet18_torch_exec_ops(ir): ) ], "ir": ir, - "enabled_precisions": {torch.float32, torch.float16}, + "enabled_precisions": {torch.float32}, "min_block_size": 1, "output_format": "exported_program", "cache_built_engines": True, @@ -72,11 +72,11 @@ def test_lan_2_resnet18_torch_exec_ops(ir): ) ], "ir": ir, - "enabled_precisions": {torch.float32}, + "enabled_precisions": {torch.float32, torch.float16}, "min_block_size": 1, "output_format": "exported_program", - "cache_built_engines": False, - "reuse_cached_engines": False, + "cache_built_engines": True, + "reuse_cached_engines": True, "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"}, } @@ -108,8 +108,9 @@ def test_lan_3_resnet18_torch_exec_ops(ir): ) ], "ir": ir, - "enabled_precisions": {torch.float32}, + "enabled_precisions": {torch.float32, torch.float16}, "min_block_size": 1, + "output_format": "exported_program", "cache_built_engines": False, "reuse_cached_engines": False, "torch_executed_ops": {torch.ops.aten.matmul, "torch.ops.aten.add"},
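
The per-test `-k` invocations introduced in these patches bisect a hard crash: a Windows access violation aborts the whole pytest process before any JUnit XML can be written, so each renamed `lan_N` test gets its own process and an `echo` marker records the last test that completed. Below is a minimal sketch of the same idea written as a loop rather than repeated commands; it assumes the renamed `test_lan_N_*` tests and the `tests/py/dynamo` layout from the patches above, and is an illustration of the bisection technique, not part of the patch series itself:

    set -euo pipefail
    cd tests/py/dynamo
    for i in 1 2 3 4 5 6 7 8 9; do
        # Each pytest run is a separate process. If test lan_<i> triggers the
        # access violation, this invocation dies, `set -e` stops the loop, and
        # the last "finished" marker in the CI log identifies the crashing test.
        python -m pytest -k "lan_${i}" -rP models/test_models.py
        echo "lan added finished lan_${i}"
    done

Running each test in a fresh interpreter also starts from clean CUDA and TensorRT state, which in-process cleanup such as torch._dynamo.reset() cannot guarantee after a native crash in a previous test.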