From f2cfdfb76b9c17b71f2f89a1834e17ead55fcdcf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:20:01 +0100 Subject: [PATCH 01/26] Match TPU `torch` version to `torch_xla` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 1abde714af7c..fae1f1afcc40 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -17,7 +17,7 @@ ray[default] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0.dev20241216+cpu +torch==2.7.0.dev20250124+cpu torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 4104094fd4dcbb033e6fa87d272395c174f5c50c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:29:12 +0100 Subject: [PATCH 02/26] Add torchvision requirement Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index fae1f1afcc40..9bdcf83adeb2 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -18,6 +18,7 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.7.0.dev20250124+cpu +torchvision==0.22.0.dev20250124+cpu torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 6496fadde97202b0e0b8366427389db6d905ff15 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:46:25 +0100 Subject: [PATCH 03/26] Updated moved test file Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/run-tpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 650af0fac4c6..a2a73b29374b 100755 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -19,7 +19,7 @@ docker run --privileged --net host --shm-size=16G -it \ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ && python3 -m pip install pytest \ && python3 -m pip install lm_eval[api]==0.4.4 \ - && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \ + && pytest -v -s /workspace/vllm/tests/entrypoints/openai/correctness/test_lmeval.py \ && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ && python3 /workspace/vllm/tests/tpu/test_compilation.py \ && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ From bf0ff0ce4a97113ef6f7aec0b3817e072dccab51 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:04:49 +0100 Subject: [PATCH 04/26] Try 2.6.0 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 9bdcf83adeb2..35d7ed282541 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -17,8 +17,8 @@ ray[default] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.7.0.dev20250124+cpu -torchvision==0.22.0.dev20250124+cpu -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.6.0+cpu +torchvision==0.21.0+cpu +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 505ddb9a6eba4d9d2edfe4021fc476da2bf3b6fb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:08:23 +0100 Subject: [PATCH 05/26] Try from pypi Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 35d7ed282541..83aceedb2578 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -19,6 +19,4 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.6.0+cpu torchvision==0.21.0+cpu -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[tpu, pallas]==2.6.0 From 3acf682266690813ce0b5a48072a2ea31e8c1f6f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:14:38 +0100 Subject: [PATCH 06/26] try Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 83aceedb2578..231e6c1075d2 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -17,6 +17,6 @@ ray[default] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0+cpu -torchvision==0.21.0+cpu +torch==2.6.0 +torchvision==0.21.0 torch_xla[tpu, pallas]==2.6.0 From 9fbbb58ea8baab83e1e5000fe002b3a957e816a2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:33:57 +0100 Subject: [PATCH 07/26] Don't use nightly container Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index e268b3947666..9a256b5cff9d 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,5 +1,4 @@ -ARG NIGHTLY_DATE="20250124" -ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" +ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0_3.10_tpuvm FROM $BASE_IMAGE WORKDIR /workspace/vllm From 12273b1acf53cd17c1f949aef02098ee49417292 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:36:08 +0100 Subject: [PATCH 08/26] Typo Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 9a256b5cff9d..d53bdbcd2bcb 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0_3.10_tpuvm +ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0_3.10_tpuvm" FROM $BASE_IMAGE WORKDIR /workspace/vllm From b0a033e296c27ef16dee2a70135737e11c78968e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:51:12 +0100 Subject: [PATCH 09/26] Try to cache requirements files separately Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index d53bdbcd2bcb..eb54c91d9d99 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -8,6 +8,12 @@ RUN apt-get update && apt-get install -y \ git \ ffmpeg libsm6 libxext6 libgl1 +# Install Python dependencies +COPY requirements-common.txt requirements-common.txt +COPY requirements-tpu.txt requirements-tpu.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-tpu.txt + # Build vLLM. COPY . . ARG GIT_REPO_CHECK=0 @@ -15,10 +21,6 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi ENV VLLM_TARGET_DEVICE="tpu" -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,source=.git,target=.git \ - python3 -m pip install \ - -r requirements-tpu.txt RUN python3 setup.py develop # install development dependencies (for testing) From 1f5b2edd4574ab4ac56f36c40158599a770eb6be Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 24 Feb 2025 16:14:24 +0100 Subject: [PATCH 10/26] Try something Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 231e6c1075d2..502b98b61312 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -11,12 +11,11 @@ jinja2 ray[default] # Install torch_xla ---pre ---extra-index-url https://download.pytorch.org/whl/nightly/cpu +--extra-index-url https://download.pytorch.org/whl/cpu --find-links https://storage.googleapis.com/libtpu-wheels/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.6.0 -torchvision==0.21.0 -torch_xla[tpu, pallas]==2.6.0 +torchvision # Install whichever version is compatible with torch +torch_xla[tpu, pallas] # Install whichever version is compatible with torch From 012b00d0795a4a8b33e38b297f64781b5d6fb02a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:16:30 +0100 Subject: [PATCH 11/26] Use `uv` to install requirements Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index eb54c91d9d99..7d4fd8662646 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -8,11 +8,15 @@ RUN apt-get update && apt-get install -y \ git \ ffmpeg libsm6 libxext6 libgl1 +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + # Install Python dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-tpu.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements-tpu.txt # Build vLLM. COPY . . @@ -24,6 +28,6 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN python3 setup.py develop # install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils +RUN uv pip install --system -e tests/vllm_test_utils CMD ["/bin/bash"] From cf0180dd5b0ed625736b48d975378b133cf48986 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:25:15 +0100 Subject: [PATCH 12/26] Update to latest container (that available torch nightly) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 7d4fd8662646..90d356057888 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,5 @@ -ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.6.0_3.10_tpuvm" +ARG NIGHTLY_DATE="20250224" +ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE WORKDIR /workspace/vllm From e159851aa21021f68fdc4e5c60de4189477afb44 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:31:48 +0100 Subject: [PATCH 13/26] Remove `uv` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 90d356057888..008f9cc609ad 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -9,15 +9,11 @@ RUN apt-get update && apt-get install -y \ git \ ffmpeg libsm6 libxext6 libgl1 -# Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/uv \ - python3 -m pip install uv - # Install Python dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements-tpu.txt + python3 -m pip install -r requirements-tpu.txt # Build vLLM. COPY . . @@ -29,6 +25,6 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN python3 setup.py develop # install development dependencies (for testing) -RUN uv pip install --system -e tests/vllm_test_utils +RUN python3 -m pip install -e tests/vllm_test_utils CMD ["/bin/bash"] From e4a13c8940207496d9ef115da3f426f3787c0b73 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:32:04 +0100 Subject: [PATCH 14/26] Update torch version Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 502b98b61312..532bd24f08be 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -11,11 +11,12 @@ jinja2 ray[default] # Install torch_xla ---extra-index-url https://download.pytorch.org/whl/cpu +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu --find-links https://storage.googleapis.com/libtpu-wheels/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0 +torch==2.7.0.dev20250224+cpu torchvision # Install whichever version is compatible with torch torch_xla[tpu, pallas] # Install whichever version is compatible with torch From 3c57eca127dfb26452efaaf77ff1a3696ba03ac9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:41:07 +0100 Subject: [PATCH 15/26] Specify matching torchvision version Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 532bd24f08be..938f90d37967 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -18,5 +18,5 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.7.0.dev20250224+cpu -torchvision # Install whichever version is compatible with torch +torchvision==0.22.0.dev20250224+cpu torch_xla[tpu, pallas] # Install whichever version is compatible with torch From af0a7cb4f21c79cc9598f8457753e0c989810780 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 13:07:38 +0100 Subject: [PATCH 16/26] Specify matching torch_xla Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements-tpu.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 938f90d37967..477575d3a7ed 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -19,4 +19,6 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.7.0.dev20250224+cpu torchvision==0.22.0.dev20250224+cpu -torch_xla[tpu, pallas] # Install whichever version is compatible with torch +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 069e07b96834f47cd1ffcd1b2fcc6d98dad4f3d1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 13:27:13 +0100 Subject: [PATCH 17/26] Use torch and torch_xla that come with container Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 008f9cc609ad..fa1db313bbf6 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -12,8 +12,9 @@ RUN apt-get update && apt-get install -y \ # Install Python dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt -RUN --mount=type=cache,target=/root/.cache/uv \ - python3 -m pip install -r requirements-tpu.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + grep -v -E "^torch(_xla)?\b" requirements-tpu.txt | \ + python3 -m pip install -r /dev/stdin # Build vLLM. COPY . . From 9b2fe75b11692acaead5466fcee61d3b494646ea Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 13:33:06 +0100 Subject: [PATCH 18/26] Save filtered requirements to temporary file Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index fa1db313bbf6..2447a03ed7fe 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -12,9 +12,9 @@ RUN apt-get update && apt-get install -y \ # Install Python dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt +RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt RUN --mount=type=cache,target=/root/.cache/pip \ - grep -v -E "^torch(_xla)?\b" requirements-tpu.txt | \ - python3 -m pip install -r /dev/stdin + python3 -m pip install -r requirements-filtered.txt # Build vLLM. COPY . . From f03c34c08fe789341f208f5cc89ca184bb8f0cea Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 13:49:48 +0100 Subject: [PATCH 19/26] Make sure pallas gets installed Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 2 +- requirements-tpu.txt | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 2447a03ed7fe..e9cb5af0b40b 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y \ # Install Python dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt -RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt +RUN grep -v -E "^torch\b" requirements-tpu.txt > requirements-filtered.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-filtered.txt diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 477575d3a7ed..938f90d37967 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -19,6 +19,4 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.7.0.dev20250224+cpu torchvision==0.22.0.dev20250224+cpu -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[tpu, pallas] # Install whichever version is compatible with torch From 7fd4ba1881703cf75b40f5f5fa003d74d7987020 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:01:56 +0100 Subject: [PATCH 20/26] Revert "Make sure pallas gets installed" This reverts commit f03c34c08fe789341f208f5cc89ca184bb8f0cea. Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 2 +- requirements-tpu.txt | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index e9cb5af0b40b..2447a03ed7fe 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y \ # Install Python dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt -RUN grep -v -E "^torch\b" requirements-tpu.txt > requirements-filtered.txt +RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-filtered.txt diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 938f90d37967..477575d3a7ed 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -19,4 +19,6 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.7.0.dev20250224+cpu torchvision==0.22.0.dev20250224+cpu -torch_xla[tpu, pallas] # Install whichever version is compatible with torch +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250224-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 4d716c25f1f67fa426058a9ca9e09f3e4ddc228c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:03:29 +0100 Subject: [PATCH 21/26] Install pallas deps separately in Dockerfile Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 2447a03ed7fe..c23c437203f0 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -14,7 +14,7 @@ COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-filtered.txt + python3 -m pip install -r requirements-filtered.txt jax jaxlib # Build vLLM. COPY . . From e09358ed362c65298900866fdc5b0b66d9f9d94c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:14:25 +0100 Subject: [PATCH 22/26] Add pallas dependency to install command Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index c23c437203f0..f32acb6ea987 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -12,9 +12,11 @@ RUN apt-get update && apt-get install -y \ # Install Python dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt +# Filter out torch and torch_xla to use versions that came with image RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt +# Image does not come with pallas dependencies so we must add them manually RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-filtered.txt jax jaxlib + python3 -m pip install -r requirements-filtered.txt jax==0.5.1.dev20250210 jaxlib # Build vLLM. COPY . . From ba8151268f438c95da23b7364765e067f53ea825 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:25:36 +0100 Subject: [PATCH 23/26] Pin jaxlib in Dockerfile Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index f32acb6ea987..69f14ba358ba 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -16,7 +16,7 @@ COPY requirements-tpu.txt requirements-tpu.txt RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt # Image does not come with pallas dependencies so we must add them manually RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-filtered.txt jax==0.5.1.dev20250210 jaxlib + python3 -m pip install -r requirements-filtered.txt jax==0.5.1.dev20250210 jaxlib==0.5.1.dev20250210 # Build vLLM. COPY . . From 6aa25ad912a2b41b17e52f28fe0b4b5ca83dd124 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:32:05 +0100 Subject: [PATCH 24/26] Try cxx11 image Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 69f14ba358ba..e6ec12478fd0 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,5 +1,5 @@ ARG NIGHTLY_DATE="20250224" -ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" +ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_cxx11_$NIGHTLY_DATE" FROM $BASE_IMAGE WORKDIR /workspace/vllm From 58ac5696a3d705107f8298466fb60ae202b1cd94 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:44:26 +0100 Subject: [PATCH 25/26] Try installing requirements from txt in Dockerfile Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index e6ec12478fd0..2475515b07d6 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -13,10 +13,11 @@ RUN apt-get update && apt-get install -y \ COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt # Filter out torch and torch_xla to use versions that came with image -RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt +# RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt # Image does not come with pallas dependencies so we must add them manually RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-filtered.txt jax==0.5.1.dev20250210 jaxlib==0.5.1.dev20250210 + python3 -m pip install -r requirements-tpu.txt + # jax==0.5.1.dev20250210 jaxlib==0.5.1.dev20250210 # Build vLLM. COPY . . From d2d0e3d091c13e199ff9143863501d2ee7f57d96 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:50:15 +0100 Subject: [PATCH 26/26] Remove no longer needed code Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- Dockerfile.tpu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 2475515b07d6..f7abe18f45f7 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -12,12 +12,8 @@ RUN apt-get update && apt-get install -y \ # Install Python dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-tpu.txt requirements-tpu.txt -# Filter out torch and torch_xla to use versions that came with image -# RUN grep -v -E "^torch(_xla)?\b" requirements-tpu.txt > requirements-filtered.txt -# Image does not come with pallas dependencies so we must add them manually RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-tpu.txt - # jax==0.5.1.dev20250210 jaxlib==0.5.1.dev20250210 # Build vLLM. COPY . .