From 0ded21a1aee51cd88728285a5a02894c6145a77a Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 20 Jun 2024 22:19:00 -0400
Subject: [PATCH] chore(ci): workaround to retry `error decoding response body`
 from uv (#3889)

This PR adds a shell wrapper that checks whether the `error decoding
response body` message appears in uv's stderr and, if so, retries the
command. It is only a workaround for
https://github.com/astral-sh/uv/issues/2586 and
https://github.com/astral-sh/uv/issues/3514 until upstream fixes the
underlying problem.

Note that this PR does not cover cibuildwheel. It is unclear how to
retry on specific errors within its complex logic (a feature request is
open at https://github.com/pypa/cibuildwheel/issues/1846).

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Chores**
  - Standardized the installation process for TensorFlow, Torch, and
    other dependencies across workflows by using the
    `uv_with_retry.sh` script to ensure reliable installations.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 .github/workflows/build_cc.yml    |  2 +-
 .github/workflows/test_cc.yml     |  4 ++--
 .github/workflows/test_cuda.yml   |  4 ++--
 .github/workflows/test_python.yml |  6 +++---
 source/install/uv_with_retry.sh   | 32 +++++++++++++++++++++++++++++++
 5 files changed, 40 insertions(+), 8 deletions(-)
 create mode 100755 source/install/uv_with_retry.sh

diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
index bf16b67656..775b88cfd3 100644
--- a/.github/workflows/build_cc.yml
+++ b/.github/workflows/build_cc.yml
@@ -32,7 +32,7 @@ jobs:
         python-version: '3.11'
     - uses: lukka/get-cmake@latest
     - run: python -m pip install uv
-    - run: python -m uv pip install --system tensorflow
+    - run: source/install/uv_with_retry.sh pip install --system tensorflow
     - name: Download libtorch
       run: |
          wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip
diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
index 799a55e9ff..ebbfc4d960 100644
--- a/.github/workflows/test_cc.yml
+++ b/.github/workflows/test_cc.yml
@@ -27,7 +27,7 @@ jobs:
         mpi: mpich
     - uses: lukka/get-cmake@latest
     - run: python -m pip install uv
-    - run: python -m uv pip install --system tensorflow
+    - run: source/install/uv_with_retry.sh pip install --system tensorflow
     - name: Download libtorch
       run: |
          wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip
@@ -49,7 +49,7 @@ jobs:
     # test lammps
     - run: |
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        python -m uv pip install --system -e .[cpu,test,lmp] mpi4py
+        source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp] mpi4py
       env:
         DP_BUILD_TESTING: 1
       if: ${{ !matrix.check_memleak }}
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index d97b1f9431..703d0ea2fe 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -47,10 +47,10 @@ jobs:
          && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3
       if: false  # skip as we use nvidia image
     - run: python -m pip install -U uv
-    - run: python -m uv pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0"
+    - run: source/install/uv_with_retry.sh pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0"
     - run: |
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        python -m uv pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py
+        source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py
       env:
         DP_VARIANT: cuda
         DP_ENABLE_NATIVE_OPTIMIZATION: 1
diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index 0f9fc61acd..3cf56ecbd3 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -25,10 +25,10 @@ jobs:
         python-version: ${{ matrix.python }}
     - run: python -m pip install -U uv
     - run: |
-        uv pip install --system mpich
-        uv pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/
+        source/install/uv_with_retry.sh pip install --system mpich
+        source/install/uv_with_retry.sh pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
-        uv pip install --system --only-binary=horovod -e .[cpu,test] horovod[tensorflow-cpu] mpi4py
+        source/install/uv_with_retry.sh pip install --system --only-binary=horovod -e .[cpu,test] horovod[tensorflow-cpu] mpi4py
       env:
         # Please note that uv has some issues with finding
         # existing TensorFlow package. Currently, it uses
diff --git a/source/install/uv_with_retry.sh b/source/install/uv_with_retry.sh
new file mode 100755
index 0000000000..2d9a524f6b
--- /dev/null
+++ b/source/install/uv_with_retry.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Run `uv "$@"`; if it fails and its stderr contains "error decoding response body",
+# retry after 1 s (at most 3 attempts in total). Any other failure exits at once
+# with uv's own exit code. See also:
+# https://github.com/astral-sh/uv/issues/2586
+# https://github.com/astral-sh/uv/issues/3456
+# https://github.com/astral-sh/uv/issues/3514
+# https://github.com/astral-sh/uv/issues/4402
+tmpstderr=$(mktemp) || exit 1
+trap 'rm -f "${tmpstderr}"' EXIT # remove the capture file on every exit path
+max_retry=3
+while true; do
+	# Copy uv's stderr to both our stderr and the capture file; stdout goes through fd 3
+	# untouched. A pipeline (unlike `2> >(tee ...)`) is waited for, so tee has finished
+	# writing before grep runs, and plain `tee` truncates the file so that an error from
+	# an earlier attempt cannot trigger a retry for an unrelated failure.
+	{
+		uv "$@" 2>&1 >&3 3>&- | tee "${tmpstderr}" >&2 3>&-
+		exit_code=${PIPESTATUS[0]}
+	} 3>&1
+	# exit if ok, or if the failure is not the known transient error
+	if [ "$exit_code" -eq 0 ] || ! grep -q "error decoding response body" "${tmpstderr}"; then
+		exit "$exit_code"
+	fi
+	max_retry=$((max_retry - 1))
+	if [ "$max_retry" -eq 0 ]; then
+		echo "Max retry reached, exiting..."
+		exit 1
+	fi
+	echo "Retrying uv in 1 s..."
+	sleep 1
+done