From f8065bf6b8619c6e2d264d4e0b4a5dd6975db0d7 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 14:37:52 -0700 Subject: [PATCH 01/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 55 ++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 9a142cad..6aef9d2c 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -4,6 +4,7 @@ {% if branch == "main" %} {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %} {% set docker_image_cu121 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu121" %} +{% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-torch-nightly" %} {% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %} {% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} @@ -44,6 +45,40 @@ steps: - exit_status: -10 # Agent was lost limit: 2 + - block: Build torch nightly image + key: block-build-torch-nightly + depends_on: ~ + + - label: ":docker: build image torch nightly" + key: image-build-torch-nightly + depends_on: block-build-torch-nightly + agents: + {% if branch == "main" %} + queue: cpu_queue_postmerge + {% else %} + queue: cpu_queue_premerge + {% endif %} + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - | + #!/bin/bash + if [[ -z $(docker manifest inspect {{ docker_image_torch_nightly }}) ]]; then + echo "Image not found, proceeding with build..." + else + echo "Image found" + exit 0 + fi + - "docker build --file docker/Dockerfile.nightly_torch --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image_torch_nightly }} --target test --progress plain ." + - "docker push {{ docker_image_torch_nightly }}" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + - block: Build CUDA 12.1 image key: block-build-cu121 depends_on: ~ @@ -77,7 +112,7 @@ steps: limit: 2 - exit_status: -10 # Agent was lost limit: 2 - + - block: Build CUDA 11.8 image key: block-build-cu118 depends_on: ~ @@ -111,7 +146,7 @@ steps: limit: 2 - exit_status: -10 # Agent was lost limit: 2 - + {% for step in steps %} {% if step.fast_check_only != true %} @@ -245,7 +280,7 @@ steps: - group: "AMD Tests" depends_on: ~ - steps: + steps: - label: "AMD: :docker: build image" depends_on: ~ soft_fail: true @@ -272,7 +307,7 @@ steps: - label: "AMD: {{ step.label }}" depends_on: amd-build agents: - {% if step.label and step.label=="Benchmarks" or step.label=="LoRA Test %N" or step.label=="Kernels Test %N" or step.label=="Distributed Tests (4 GPUs)" or step.label=="Distributed Comm Ops Test" or step.label=="2 Node Tests (4 GPUs in total)" or step.label=="Distributed Tests (2 GPUs)" or step.label=="Plugin Tests (2 GPUs)" or step.label=="Multi-step Tests (4 GPUs)" or step.label=="Pipeline Parallelism Test" or step.label=="LoRA TP Test (Distributed)" or step.label=="Weight Loading Multiple GPU Test" %} + {% if step.label and step.label=="Benchmarks" or step.label=="LoRA Test %N" or step.label=="Kernels Test %N" or step.label=="Distributed Tests (4 GPUs)" or step.label=="Distributed Comm Ops Test" or step.label=="2 Node Tests (4 GPUs in total)" or step.label=="Distributed Tests (2 GPUs)" or step.label=="Plugin Tests (2 GPUs)" or step.label=="Multi-step Tests (4 GPUs)" or step.label=="Pipeline Parallelism Test" or step.label=="LoRA TP Test (Distributed)" or step.label=="Weight Loading Multiple GPU Test" %} queue: amd_mi300 {% else %} queue: amd_mi300_1 @@ -284,7 +319,7 @@ steps: soft_fail: true {% endif %} {% endfor %} - + - label: "Neuron Test" depends_on: ~ agents: @@ -295,7 +330,7 @@ steps: - block: "Run Intel CPU test" depends_on: ~ key: block-intel-cpu - + - label: "Intel CPU Test" depends_on: block-intel-cpu soft_fail: true @@ -309,7 +344,7 @@ steps: agents: queue: intel-hpu command: bash .buildkite/scripts/hardware_ci/run-hpu-test.sh - + - label: "Intel GPU Test" soft_fail: true depends_on: ~ @@ -336,7 +371,7 @@ steps: queue: ibm-ppc64le command: bash .buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh {% endif %} - + {% if branch == "main" or "s390x" in branch %} - label: "IBM Z (s390x) CPU Test" depends_on: ~ @@ -345,7 +380,7 @@ steps: queue: ibm_s390x command: bash .buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh {% endif %} - + {% if nightly == "1" %} - label: "GH200 Test" depends_on: ~ @@ -360,7 +395,7 @@ steps: soft_fail: True agents: queue: tpu_v5_queue - commands: + commands: - yes | docker system prune -a - if [[ -f ".buildkite/scripts/hardware_ci/run-tpu-test.sh" ]]; then bash .buildkite/scripts/hardware_ci/run-tpu-test.sh; fi From cfe7ee6e4453dd9169c0a8b01419b3eea1f1d526 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 14:49:39 -0700 Subject: [PATCH 02/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 6aef9d2c..508bcfd6 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -45,7 +45,7 @@ steps: - exit_status: -10 # Agent was lost limit: 2 - - block: Build torch nightly image + - block: Build torch nightly image key: block-build-torch-nightly depends_on: ~ From 50abdc7c77c589f6ba653fbc682f0d61565edad2 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 14:51:22 -0700 Subject: [PATCH 03/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 508bcfd6..8e395dfd 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -58,6 +58,7 @@ steps: {% else %} queue: cpu_queue_premerge {% endif %} + timeout_in_minutes: 300 commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - | From 1b371b22fe6b337033b42b74f6611286d9d5ac14 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 15:17:11 -0700 Subject: [PATCH 04/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 8e395dfd..9e1adfa1 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -1,5 +1,6 @@ {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} {% set docker_image_cu121 = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-cu121" %} +{% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT--torch-nightly" %} {% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-cu118" %} {% if branch == "main" %} {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %} From 81adfb443fcbb5172d9c1999eb83977014224d8a Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 18:53:51 -0700 Subject: [PATCH 05/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 9e1adfa1..9b81042c 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -1,6 +1,6 @@ {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} {% set docker_image_cu121 = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-cu121" %} -{% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT--torch-nightly" %} +{% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-torch-nightly" %} {% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-cu118" %} {% if branch == "main" %} {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %} @@ -59,7 +59,7 @@ steps: {% else %} queue: cpu_queue_premerge {% endif %} - timeout_in_minutes: 300 + timeout_in_minutes: 360 commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - | From 719600d10e47b341b3c7ee7e0db7e1e09078b0c9 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 19:51:00 -0700 Subject: [PATCH 06/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 153 +++++++++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 35 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 9b81042c..7c4264c4 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -46,41 +46,6 @@ steps: - exit_status: -10 # Agent was lost limit: 2 - - block: Build torch nightly image - key: block-build-torch-nightly - depends_on: ~ - - - label: ":docker: build image torch nightly" - key: image-build-torch-nightly - depends_on: block-build-torch-nightly - agents: - {% if branch == "main" %} - queue: cpu_queue_postmerge - {% else %} - queue: cpu_queue_premerge - {% endif %} - timeout_in_minutes: 360 - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - | - #!/bin/bash - if [[ -z $(docker manifest inspect {{ docker_image_torch_nightly }}) ]]; then - echo "Image not found, proceeding with build..." - else - echo "Image found" - exit 0 - fi - - "docker build --file docker/Dockerfile.nightly_torch --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image_torch_nightly }} --target test --progress plain ." - - "docker push {{ docker_image_torch_nightly }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 2 - - exit_status: -10 # Agent was lost - limit: 2 - - block: Build CUDA 12.1 image key: block-build-cu121 depends_on: ~ @@ -280,6 +245,124 @@ steps: {% endif %} {% endfor %} +- group: "vllm against torch nightly" + depends_on: ~ + steps: + - block: Build torch nightly image + key: block-build-torch-nightly + depends_on: ~ + + - label: ":docker: build image torch nightly" + key: image-build-torch-nightly + depends_on: block-build-torch-nightly + soft_fail: true + agents: + {% if branch == "main" %} + queue: cpu_queue_postmerge + {% else %} + queue: cpu_queue_premerge + {% endif %} + timeout_in_minutes: 360 + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - | + #!/bin/bash + if [[ -z $(docker manifest inspect {{ docker_image_torch_nightly }}) ]]; then + echo "Image not found, proceeding with build..." + else + echo "Image found" + exit 0 + fi + - "docker build --file docker/Dockerfile.nightly_torch --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image_torch_nightly }} --target test --progress plain ." + - "docker push {{ docker_image_torch_nightly }}" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + {% for step in steps %} + {% if step.torch_nightly and step.num_gpus<2 %} + {% set ns = namespace(blocked=1) %} + + {% if run_all == "1" %} + {% set ns.blocked = 0 %} + {% endif %} + + {% if nightly == "1" %} + {% set ns.blocked = 0 %} + {% endif %} + + {% if step.source_file_dependencies %} + {% for source_file in step.source_file_dependencies %} + {% for file in list_file_diff %} + {% if source_file in file %} + {% set ns.blocked = 0 %} + {% endif %} + {% endfor %} + {% endfor %} + {% endif %} + + {% if ns.blocked == 1 or (step.optional and nightly != "1") %} + - block: "Run Torch Nightly {{ step.label }}" + depends_on: image-build-torch-nightly + key: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} + {% endif %} + + - label: "{{ step.label }}" + {% if ns.blocked == 1 or (step.optional and nightly != "1") %} + depends_on: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} + {% else %} + depends_on: image-build-torch-nightly + {% endif %} + + agents: + {% if step.no_gpu %} + queue: cpu_queue_premerge + {% elif step.gpu == "a100" %} + queue: a100_queue + {% else %} + queue: gpu_1_queue + {% endif %} + + {% if step.num_nodes < 2 %} + plugins: + {% if step.gpu != "a100" %} + - docker#v5.2.0: + image: {{ docker_image_torch_nightly }} + always-pull: true + propagate-environment: true + {% if not step.no_gpu %} + gpus: all + {% endif %} + {% if step.label == "Benchmarks" %} + mount-buildkite-agent: true + {% endif %} + command: [ + "bash", "-xc", + "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}" + ] + environment: + - VLLM_USAGE_SOURCE=ci-test + - HF_HOME={{ hf_home_fsx }} + - HF_TOKEN + {% if branch == "main" %} + - BUILDKITE_ANALYTICS_TOKEN + {% endif %} + {% if step.label == "Speculative decoding tests" %} + - VLLM_ATTENTION_BACKEND=XFORMERS + {% endif %} + volumes: + - /dev/shm:/dev/shm + - {{ hf_home_fsx }}:{{ hf_home_fsx }} + {% endif %} + {% endif %} + {% endif %} + {% endfor %} + - group: "AMD Tests" depends_on: ~ steps: From 11ee1f7680ebaa8012aa93fcb145829e58eb403a Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 19:52:48 -0700 Subject: [PATCH 07/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 7c4264c4..f8039e66 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -285,7 +285,7 @@ steps: limit: 2 {% for step in steps %} - {% if step.torch_nightly and step.num_gpus<2 %} + {% if step.torch_nightly and step.num_gpus<2 %} {% set ns = namespace(blocked=1) %} {% if run_all == "1" %} @@ -318,7 +318,7 @@ steps: {% else %} depends_on: image-build-torch-nightly {% endif %} - + soft_fail: true agents: {% if step.no_gpu %} queue: cpu_queue_premerge @@ -328,7 +328,7 @@ steps: queue: gpu_1_queue {% endif %} - {% if step.num_nodes < 2 %} + {% if step.num_nodes < 2 %} plugins: {% if step.gpu != "a100" %} - docker#v5.2.0: From c6d8581686636fb4b9da8d660371ec959116518d Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 20:09:28 -0700 Subject: [PATCH 08/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index f8039e66..bdde9182 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -329,8 +329,8 @@ steps: {% endif %} {% if step.num_nodes < 2 %} - plugins: - {% if step.gpu != "a100" %} + plugins: + {% if step.gpu != "a100" %} - docker#v5.2.0: image: {{ docker_image_torch_nightly }} always-pull: true From 2bbd2013c0910faf9b85560b09ab86097a925e37 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 20:16:30 -0700 Subject: [PATCH 09/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 79 ------------------------------------- 1 file changed, 79 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index bdde9182..b7808ec5 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -284,85 +284,6 @@ steps: - exit_status: -10 # Agent was lost limit: 2 - {% for step in steps %} - {% if step.torch_nightly and step.num_gpus<2 %} - {% set ns = namespace(blocked=1) %} - - {% if run_all == "1" %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if nightly == "1" %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if step.source_file_dependencies %} - {% for source_file in step.source_file_dependencies %} - {% for file in list_file_diff %} - {% if source_file in file %} - {% set ns.blocked = 0 %} - {% endif %} - {% endfor %} - {% endfor %} - {% endif %} - - {% if ns.blocked == 1 or (step.optional and nightly != "1") %} - - block: "Run Torch Nightly {{ step.label }}" - depends_on: image-build-torch-nightly - key: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} - {% endif %} - - - label: "{{ step.label }}" - {% if ns.blocked == 1 or (step.optional and nightly != "1") %} - depends_on: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} - {% else %} - depends_on: image-build-torch-nightly - {% endif %} - soft_fail: true - agents: - {% if step.no_gpu %} - queue: cpu_queue_premerge - {% elif step.gpu == "a100" %} - queue: a100_queue - {% else %} - queue: gpu_1_queue - {% endif %} - - {% if step.num_nodes < 2 %} - plugins: - {% if step.gpu != "a100" %} - - docker#v5.2.0: - image: {{ docker_image_torch_nightly }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: [ - "bash", "-xc", - "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}" - ] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_HOME={{ hf_home_fsx }} - - HF_TOKEN - {% if branch == "main" %} - - BUILDKITE_ANALYTICS_TOKEN - {% endif %} - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - - {{ hf_home_fsx }}:{{ hf_home_fsx }} - {% endif %} - {% endif %} - {% endif %} - {% endfor %} - - group: "AMD Tests" depends_on: ~ steps: From cf689506f441824a4ae2441fb8829695b4cf60d5 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 20:22:59 -0700 Subject: [PATCH 10/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 147 +++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 34 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index b7808ec5..9010fe2f 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -245,44 +245,123 @@ steps: {% endif %} {% endfor %} -- group: "vllm against torch nightly" - depends_on: ~ - steps: - - block: Build torch nightly image - key: block-build-torch-nightly - depends_on: ~ - - - label: ":docker: build image torch nightly" - key: image-build-torch-nightly - depends_on: block-build-torch-nightly + - group: "vllm against torch nightly" + depends_on: ~ + steps: + - block: Build torch nightly image + key: block-build-torch-nightly + depends_on: ~ + + - label: ":docker: build image torch nightly" + key: image-build-torch-nightly + depends_on: block-build-torch-nightly + soft_fail: true + agents: + {% if branch == "main" %} + queue: cpu_queue_postmerge + {% else %} + queue: cpu_queue_premerge + {% endif %} + timeout_in_minutes: 360 + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - | + #!/bin/bash + if [[ -z $(docker manifest inspect {{ docker_image_torch_nightly }}) ]]; then + echo "Image not found, proceeding with build..." + else + echo "Image found" + exit 0 + fi + - "docker build --file docker/Dockerfile.nightly_torch --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image_torch_nightly }} --target test --progress plain ." + - "docker push {{ docker_image_torch_nightly }}" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + {% for step in steps %} + {% if step.torch_nightly and step.num_gpus<2 %} + {% set ns = namespace(blocked=1) %} + + {% if run_all == "1" %} + {% set ns.blocked = 0 %} + {% endif %} + + {% if nightly == "1" %} + {% set ns.blocked = 0 %} + {% endif %} + + {% if step.source_file_dependencies %} + {% for source_file in step.source_file_dependencies %} + {% for file in list_file_diff %} + {% if source_file in file %} + {% set ns.blocked = 0 %} + {% endif %} + {% endfor %} + {% endfor %} + {% endif %} + + {% if ns.blocked == 1 or (step.optional and nightly != "1") %} + - block: "Run Torch Nightly {{ step.label }}" + depends_on: image-build-torch-nightly + key: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} + {% endif %} + + - label: "{{ step.label }}" + {% if ns.blocked == 1 or (step.optional and nightly != "1") %} + depends_on: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} + {% else %} + depends_on: image-build-torch-nightly + {% endif %} soft_fail: true agents: - {% if branch == "main" %} - queue: cpu_queue_postmerge - {% else %} + {% if step.no_gpu %} queue: cpu_queue_premerge + {% elif step.gpu == "a100" %} + queue: a100_queue + {% else %} + queue: gpu_1_queue + {% endif %} + + {% if step.num_nodes < 2 %} + plugins: + {% if step.gpu != "a100" %} + - docker#v5.2.0: + image: {{ docker_image_torch_nightly }} + always-pull: true + propagate-environment: true + {% if not step.no_gpu %} + gpus: all + {% endif %} + {% if step.label == "Benchmarks" %} + mount-buildkite-agent: true + {% endif %} + command: [ + "bash", "-xc", + "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}" + ] + environment: + - VLLM_USAGE_SOURCE=ci-test + - HF_HOME={{ hf_home_fsx }} + - HF_TOKEN + {% if branch == "main" %} + - BUILDKITE_ANALYTICS_TOKEN + {% endif %} + {% if step.label == "Speculative decoding tests" %} + - VLLM_ATTENTION_BACKEND=XFORMERS + {% endif %} + volumes: + - /dev/shm:/dev/shm + - {{ hf_home_fsx }}:{{ hf_home_fsx }} {% endif %} - timeout_in_minutes: 360 - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - | - #!/bin/bash - if [[ -z $(docker manifest inspect {{ docker_image_torch_nightly }}) ]]; then - echo "Image not found, proceeding with build..." - else - echo "Image found" - exit 0 - fi - - "docker build --file docker/Dockerfile.nightly_torch --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image_torch_nightly }} --target test --progress plain ." - - "docker push {{ docker_image_torch_nightly }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 2 - - exit_status: -10 # Agent was lost - limit: 2 + {% endif %} + {% endif %} + {% endfor %} - group: "AMD Tests" depends_on: ~ From 4de318fd1668d4ecc34300c6aeb6d287dfc4a6d0 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 20:29:14 -0700 Subject: [PATCH 11/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 92 ++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 9010fe2f..44927d0b 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -306,60 +306,60 @@ steps: {% endfor %} {% endif %} - {% if ns.blocked == 1 or (step.optional and nightly != "1") %} - - block: "Run Torch Nightly {{ step.label }}" - depends_on: image-build-torch-nightly - key: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} - {% endif %} - - - label: "{{ step.label }}" {% if ns.blocked == 1 or (step.optional and nightly != "1") %} - depends_on: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} - {% else %} - depends_on: image-build-torch-nightly + - block: "Run Torch Nightly {{ step.label }}" + depends_on: image-build-torch-nightly + key: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} {% endif %} - soft_fail: true - agents: - {% if step.no_gpu %} - queue: cpu_queue_premerge - {% elif step.gpu == "a100" %} - queue: a100_queue + + - label: "{{ step.label }}" + {% if ns.blocked == 1 or (step.optional and nightly != "1") %} + depends_on: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} {% else %} - queue: gpu_1_queue + depends_on: image-build-torch-nightly {% endif %} + soft_fail: true + agents: + {% if step.no_gpu %} + queue: cpu_queue_premerge + {% elif step.gpu == "a100" %} + queue: a100_queue + {% else %} + queue: gpu_1_queue + {% endif %} - {% if step.num_nodes < 2 %} - plugins: - {% if step.gpu != "a100" %} - - docker#v5.2.0: - image: {{ docker_image_torch_nightly }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: [ - "bash", "-xc", - "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}" - ] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_HOME={{ hf_home_fsx }} - - HF_TOKEN - {% if branch == "main" %} - - BUILDKITE_ANALYTICS_TOKEN + {% if step.num_nodes < 2 %} + plugins: + {% if step.gpu != "a100" %} + - docker#v5.2.0: + image: {{ docker_image_torch_nightly }} + always-pull: true + propagate-environment: true + {% if not step.no_gpu %} + gpus: all {% endif %} - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS + {% if step.label == "Benchmarks" %} + mount-buildkite-agent: true {% endif %} - volumes: - - /dev/shm:/dev/shm - - {{ hf_home_fsx }}:{{ hf_home_fsx }} + command: [ + "bash", "-xc", + "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}" + ] + environment: + - VLLM_USAGE_SOURCE=ci-test + - HF_HOME={{ hf_home_fsx }} + - HF_TOKEN + {% if branch == "main" %} + - BUILDKITE_ANALYTICS_TOKEN + {% endif %} + {% if step.label == "Speculative decoding tests" %} + - VLLM_ATTENTION_BACKEND=XFORMERS + {% endif %} + volumes: + - /dev/shm:/dev/shm + - {{ hf_home_fsx }}:{{ hf_home_fsx }} + {% endif %} {% endif %} - {% endif %} {% endif %} {% endfor %} From 86e092dab9b9d244bd75c0999376fee2eca757a2 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 20:32:05 -0700 Subject: [PATCH 12/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 44927d0b..b4fd9f33 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -312,7 +312,7 @@ steps: key: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} {% endif %} - - label: "{{ step.label }}" + - label: "Torch Nightly {{ step.label }}" {% if ns.blocked == 1 or (step.optional and nightly != "1") %} depends_on: block-torch-nightly-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} {% else %} From 8969b14b8af2f053efbc24717c5bcb73d950b392 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 22:43:02 -0700 Subject: [PATCH 13/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 302 +++++++++++++++--------------------- 1 file changed, 127 insertions(+), 175 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index b4fd9f33..8d1314d3 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -15,6 +15,126 @@ {% set hf_home_fsx = "/fsx/hf_cache" %} {% set list_file_diff = list_file_diff | split("|") %} +{% macro get_block_status(step, list_file_diff, run_all, nightly) %} + {% set ns = namespace(blocked=1) %} + + {% if run_all == "1" or nightly == "1" %} + {% set ns.blocked = 0 %} + {% endif %} + + {% if step.source_file_dependencies %} + {% for source_file in step.source_file_dependencies %} + {% for file in list_file_diff %} + {% if source_file in file %} + {% set ns.blocked = 0 %} + {% endif %} + {% endfor %} + {% endfor %} + {% else %} + {% set ns.blocked = 0 %} + {% endif %} + + {{ ns.blocked }} +{% endmacro %} + +{% macro render_cuda_config(step, image, default_working_dir, hf_home_fsx, hf_home, branch) %} +agents: + {% if step.label == "Documentation Build" %} + queue: small_cpu_queue_premerge + {% elif step.no_gpu %} + queue: cpu_queue_premerge + {% elif step.gpu == "a100" %} + queue: a100_queue + {% elif step.num_gpus == 2 or step.num_gpus == 4 %} + queue: gpu_4_queue + {% else %} + queue: gpu_1_queue + {% endif %} + +{% if step.num_nodes >= 2 %} +commands: + - ./.buildkite/scripts/run-multi-node-test.sh {{ (step.working_dir or default_working_dir) | safe }} {{ step.num_nodes }} {{ step.num_gpus }} {{ image }} {% for command in step.commands %}"{{ (command | join(' && ')) | safe }}" {% endfor %} +{% endif %} + +{% if step.parallelism %} +parallelism: {{ step.parallelism }} +{% endif %} + +retry: + automatic: + - exit_status: -1 + limit: 1 + - exit_status: -10 + limit: 1 + +{% if step.num_nodes < 2 %} +plugins: + {% if step.gpu != "a100" %} + - docker#v5.2.0: + image: {{ image }} + always-pull: true + propagate-environment: true + {% if not step.no_gpu %} + gpus: all + {% endif %} + {% if step.label == "Benchmarks" %} + mount-buildkite-agent: true + {% endif %} + command: ["bash", "-xc", "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] + environment: + - VLLM_USAGE_SOURCE=ci-test + - HF_HOME={{ hf_home_fsx }} + - HF_TOKEN + {% if branch == "main" %} + - BUILDKITE_ANALYTICS_TOKEN + {% endif %} + {% if step.label == "Speculative decoding tests" %} + - VLLM_ATTENTION_BACKEND=XFORMERS + {% endif %} + volumes: + - /dev/shm:/dev/shm + - {{ hf_home_fsx }}:{{ hf_home_fsx }} + {% else %} + - kubernetes: + podSpec: + priorityClassName: ci + containers: + - image: {{ image }} + command: + - bash -c '(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(" && ")) | safe }}' + resources: + limits: + nvidia.com/gpu: {{ step.num_gpus or 1 }} + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: {{ hf_home }} + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: {{ hf_home }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: {{ hf_home }} + type: Directory + {% endif %} +{% endif %} +{% endmacro %} + + steps: - label: ":docker: build image" key: image-build @@ -117,27 +237,7 @@ steps: {% for step in steps %} {% if step.fast_check_only != true %} - {% set ns = namespace(blocked=1) %} - - {% if run_all == "1" %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if nightly == "1" %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if step.source_file_dependencies %} - {% for source_file in step.source_file_dependencies %} - {% for file in list_file_diff %} - {% if source_file in file %} - {% set ns.blocked = 0 %} - {% endif %} - {% endfor %} - {% endfor %} - {% else %} - {% set ns.blocked = 0 %} - {% endif %} + {% set ns = namespace(blocked = get_block_status(step, list_file_diff, run_all, nightly)) %} {% if ns.blocked == 1 or (step.optional and nightly != "1") %} - block: "Run {{ step.label }}" @@ -151,97 +251,8 @@ steps: {% else %} depends_on: image-build {% endif %} - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue_premerge - {% elif step.no_gpu %} - queue: cpu_queue_premerge - {% elif step.gpu == "a100" %} - queue: a100_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - {% if step.num_nodes >= 2%} {# for multi-node test #} - commands: - - ./.buildkite/scripts/run-multi-node-test.sh {{ (step.working_dir or default_working_dir) | safe }} {{ step.num_nodes }} {{ step.num_gpus }} {{ docker_image }} {% for command in step.commands %}"{{ (command | join(" && ")) | safe }}" {% endfor %} - {% endif %} soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 1 - - exit_status: -10 # Agent was lost - limit: 1 - {% if step.num_nodes < 2 %} - plugins: - {% if step.gpu != "a100" %} - - docker#v5.2.0: {# for GPU test #} - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: ["bash", "-xc", "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_HOME={{ hf_home_fsx }} - - HF_TOKEN - {% if branch == "main" %} - - BUILDKITE_ANALYTICS_TOKEN - {% endif %} - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - - {{ hf_home_fsx }}:{{ hf_home_fsx }} - {% else %} - - kubernetes: - podSpec: - priorityClassName: ci - containers: - - image: {{ docker_image }} - command: - - bash -c '(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}' - resources: - limits: - nvidia.com/gpu: {{ step.num_gpus or 1 }} - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: {{ hf_home }} - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: {{ hf_home }} - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: {{ hf_home }} - type: Directory - {% endif %} - {% endif %} + {{ render_cuda_config(step, docker_image, default_working_dir, hf_home_fsx, hf_home, branch) }} {% endif %} {% endfor %} @@ -283,28 +294,9 @@ steps: limit: 2 - exit_status: -10 # Agent was lost limit: 2 - {% for step in steps %} - {% if step.torch_nightly and step.num_gpus<2 %} - {% set ns = namespace(blocked=1) %} - - {% if run_all == "1" %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if nightly == "1" %} - {% set ns.blocked = 0 %} - {% endif %} - - {% if step.source_file_dependencies %} - {% for source_file in step.source_file_dependencies %} - {% for file in list_file_diff %} - {% if source_file in file %} - {% set ns.blocked = 0 %} - {% endif %} - {% endfor %} - {% endfor %} - {% endif %} + {% if step.torch_nightly %} + {% set ns = namespace(blocked = get_block_status(step, list_file_diff, run_all, nightly)) %} {% if ns.blocked == 1 or (step.optional and nightly != "1") %} - block: "Run Torch Nightly {{ step.label }}" @@ -319,49 +311,9 @@ steps: depends_on: image-build-torch-nightly {% endif %} soft_fail: true - agents: - {% if step.no_gpu %} - queue: cpu_queue_premerge - {% elif step.gpu == "a100" %} - queue: a100_queue - {% else %} - queue: gpu_1_queue - {% endif %} - - {% if step.num_nodes < 2 %} - plugins: - {% if step.gpu != "a100" %} - - docker#v5.2.0: - image: {{ docker_image_torch_nightly }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: [ - "bash", "-xc", - "(command nvidia-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}" - ] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_HOME={{ hf_home_fsx }} - - HF_TOKEN - {% if branch == "main" %} - - BUILDKITE_ANALYTICS_TOKEN - {% endif %} - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - - {{ hf_home_fsx }}:{{ hf_home_fsx }} - {% endif %} - {% endif %} - {% endif %} - {% endfor %} + {{ render_cuda_config(step, docker_image_torch_nightly, default_working_dir, hf_home_fsx, hf_home, branch) }} + {% endif %} + {% endfor %} - group: "AMD Tests" depends_on: ~ From 1458d186f3d63c5dd3fbff9608063312c3a82488 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 23:04:18 -0700 Subject: [PATCH 14/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 8d1314d3..4b68d90e 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -16,25 +16,25 @@ {% set list_file_diff = list_file_diff | split("|") %} {% macro get_block_status(step, list_file_diff, run_all, nightly) %} - {% set ns = namespace(blocked=1) %} +{% set ns = namespace(blocked=1) %} - {% if run_all == "1" or nightly == "1" %} - {% set ns.blocked = 0 %} - {% endif %} +{% if run_all == "1" or nightly == "1" %} + {% set ns.blocked = 0 %} +{% endif %} - {% if step.source_file_dependencies %} - {% for source_file in step.source_file_dependencies %} - {% for file in list_file_diff %} - {% if source_file in file %} - {% set ns.blocked = 0 %} - {% endif %} - {% endfor %} +{% if step.source_file_dependencies %} + {% for source_file in step.source_file_dependencies %} + {% for file in list_file_diff %} + {% if source_file in file %} + {% set ns.blocked = 0 %} + {% endif %} {% endfor %} - {% else %} - {% set ns.blocked = 0 %} - {% endif %} + {% endfor %} +{% else %} + {% set ns.blocked = 0 %} +{% endif %} - {{ ns.blocked }} +{{ ns.blocked }} {% endmacro %} {% macro render_cuda_config(step, image, default_working_dir, hf_home_fsx, hf_home, branch) %} @@ -252,7 +252,9 @@ steps: depends_on: image-build {% endif %} soft_fail: {{ step.soft_fail or false }} + {% filter indent(2) %} {{ render_cuda_config(step, docker_image, default_working_dir, hf_home_fsx, hf_home, branch) }} + {% endfilter %} {% endif %} {% endfor %} @@ -311,7 +313,9 @@ steps: depends_on: image-build-torch-nightly {% endif %} soft_fail: true + {% filter indent(4) %} {{ render_cuda_config(step, docker_image_torch_nightly, default_working_dir, hf_home_fsx, hf_home, branch) }} + {% endfilter %} {% endif %} {% endfor %} From 0f0c07b1dec63ce786e1a7d54e4b752efb36d7fa Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 23:13:28 -0700 Subject: [PATCH 15/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 4b68d90e..94922ce4 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -253,7 +253,7 @@ steps: {% endif %} soft_fail: {{ step.soft_fail or false }} {% filter indent(2) %} - {{ render_cuda_config(step, docker_image, default_working_dir, hf_home_fsx, hf_home, branch) }} + {{ render_cuda_config(step, docker_image, default_working_dir, hf_home_fsx, hf_home, branch) | indent(2, true) }} {% endfilter %} {% endif %} {% endfor %} @@ -313,9 +313,7 @@ steps: depends_on: image-build-torch-nightly {% endif %} soft_fail: true - {% filter indent(4) %} - {{ render_cuda_config(step, docker_image_torch_nightly, default_working_dir, hf_home_fsx, hf_home, branch) }} - {% endfilter %} + {{ render_cuda_config(step, docker_image_torch_nightly, default_working_dir, hf_home_fsx, hf_home, branch) | indent(8, true) }} {% endif %} {% endfor %} From 960bdec026760206b96bb753bdd4263656809053 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 21 Apr 2025 23:15:55 -0700 Subject: [PATCH 16/16] add in ci Signed-off-by: Yang Wang --- scripts/test-template-ci.j2 | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/test-template-ci.j2 b/scripts/test-template-ci.j2 index 94922ce4..6be9a50e 100644 --- a/scripts/test-template-ci.j2 +++ b/scripts/test-template-ci.j2 @@ -252,9 +252,7 @@ steps: depends_on: image-build {% endif %} soft_fail: {{ step.soft_fail or false }} - {% filter indent(2) %} - {{ render_cuda_config(step, docker_image, default_working_dir, hf_home_fsx, hf_home, branch) | indent(2, true) }} - {% endfilter %} + {{ render_cuda_config(step, docker_image, default_working_dir, hf_home_fsx, hf_home, branch) | indent(4, true) }} {% endif %} {% endfor %}