From 0ded21a1aee51cd88728285a5a02894c6145a77a Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 20 Jun 2024 22:19:00 -0400 Subject: [PATCH] chore(ci): workaround to retry `error decoding response body` from uv (#3889) This PR uses a shell wrapper to check if the `error decoding response body` error message is in the uv stderr and retry if so. It is just a workaround for https://github.com/astral-sh/uv/issues/2586 and https://github.com/astral-sh/uv/issues/3514 and hope the upstream can fix it. Note that this PR does nothing with cibuildwheel. It's unclear how to retry with certain errors under its complex logic (feature requested in https://github.com/pypa/cibuildwheel/issues/1846). ## Summary by CodeRabbit - **Chores** - Standardized installation process for TensorFlow, Torch, and other dependencies across workflows by using `uv_with_retry.sh` script to ensure reliable installations. --------- Signed-off-by: Jinzhe Zeng Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .github/workflows/build_cc.yml | 2 +- .github/workflows/test_cc.yml | 4 ++-- .github/workflows/test_cuda.yml | 4 ++-- .github/workflows/test_python.yml | 6 +++--- source/install/uv_with_retry.sh | 32 +++++++++++++++++++++++++++++++ 5 files changed, 40 insertions(+), 8 deletions(-) create mode 100755 source/install/uv_with_retry.sh diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml index bf16b67656..775b88cfd3 100644 --- a/.github/workflows/build_cc.yml +++ b/.github/workflows/build_cc.yml @@ -32,7 +32,7 @@ jobs: python-version: '3.11' - uses: lukka/get-cmake@latest - run: python -m pip install uv - - run: python -m uv pip install --system tensorflow + - run: source/install/uv_with_retry.sh pip install --system tensorflow - name: Download libtorch run: | wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip diff --git a/.github/workflows/test_cc.yml 
b/.github/workflows/test_cc.yml index 799a55e9ff..ebbfc4d960 100644 --- a/.github/workflows/test_cc.yml +++ b/.github/workflows/test_cc.yml @@ -27,7 +27,7 @@ jobs: mpi: mpich - uses: lukka/get-cmake@latest - run: python -m pip install uv - - run: python -m uv pip install --system tensorflow + - run: source/install/uv_with_retry.sh pip install --system tensorflow - name: Download libtorch run: | wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip @@ -49,7 +49,7 @@ jobs: # test lammps - run: | export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') - python -m uv pip install --system -e .[cpu,test,lmp] mpi4py + source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp] mpi4py env: DP_BUILD_TESTING: 1 if: ${{ !matrix.check_memleak }} diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index d97b1f9431..703d0ea2fe 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -47,10 +47,10 @@ jobs: && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3 if: false # skip as we use nvidia image - run: python -m pip install -U uv - - run: python -m uv pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0" + - run: source/install/uv_with_retry.sh pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0" - run: | export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') - python -m uv pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py + source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py env: DP_VARIANT: cuda DP_ENABLE_NATIVE_OPTIMIZATION: 1 diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml index 0f9fc61acd..3cf56ecbd3 100644 --- a/.github/workflows/test_python.yml +++ 
b/.github/workflows/test_python.yml @@ -25,10 +25,10 @@ jobs: python-version: ${{ matrix.python }} - run: python -m pip install -U uv - run: | - uv pip install --system mpich - uv pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/ + source/install/uv_with_retry.sh pip install --system mpich + source/install/uv_with_retry.sh pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/ export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') - uv pip install --system --only-binary=horovod -e .[cpu,test] horovod[tensorflow-cpu] mpi4py + source/install/uv_with_retry.sh pip install --system --only-binary=horovod -e .[cpu,test] horovod[tensorflow-cpu] mpi4py env: # Please note that uv has some issues with finding # existing TensorFlow package. Currently, it uses diff --git a/source/install/uv_with_retry.sh b/source/install/uv_with_retry.sh new file mode 100755 index 0000000000..2d9a524f6b --- /dev/null +++ b/source/install/uv_with_retry.sh @@ -0,0 +1,32 @@
+#!/bin/bash
+# Run `uv "$@"`, retrying when it fails with "error decoding response body".
+# Workaround for these upstream issues:
+#   https://github.com/astral-sh/uv/issues/2586
+#   https://github.com/astral-sh/uv/issues/3456
+#   https://github.com/astral-sh/uv/issues/3514
+#   https://github.com/astral-sh/uv/issues/4402
+# Exits with the status of the last uv run (0 on success).
+max_attempts=3 # total uv runs, including the first one
+tmpstderr=$(mktemp) || exit 1
+trap 'rm -f -- "${tmpstderr}"' EXIT
+for ((attempt = 1; ; attempt++)); do
+  # Send stderr through a real pipe (stdout bypasses it via fd 3): the pipeline
+  # returns only once tee is done, and tee truncates the capture every attempt.
+  {
+    uv "$@" 2>&1 1>&3 3>&- | tee "${tmpstderr}" >&2 3>&-
+    exit_code=${PIPESTATUS[0]}
+  } 3>&1
+  if ((exit_code == 0)); then
+    exit 0
+  fi
+  # Any other failure is not the known transient one: pass it straight through.
+  if ! grep -qF "error decoding response body" "${tmpstderr}"; then
+    exit "${exit_code}"
+  fi
+  if ((attempt >= max_attempts)); then
+    echo "Max retry reached, exiting..." >&2
+    exit "${exit_code}"
+  fi
+  echo "Retrying uv in 1 s..." >&2
+  sleep 1
+done