
Commit de9fde5

andy-neuma authored
additions for bump to v0.3.2 (vllm-project#50)
SUMMARY:
* "remote push" job for multi-gpu runner.
* "remote push" job for single-gpu runner.
* patches for re-initialization of "ray": other places in `vllm` already pass `ignore_reinit_error=True`; a couple of call sites appear to have been missed.
* patch the "find" command so that only *.py files whose names start with "test" are collected.

TEST PLAN:
runs on remote push

---------

Co-authored-by: andy-neuma <andy@neuralmagic.com>
1 parent 757e48a commit de9fde5
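
The "ray" patch described in the summary targets the case where ray.init() is called in a process that already has a running Ray runtime; without the flag the second call raises a RuntimeError. A minimal illustrative sketch (only the ignore_reinit_error flag itself comes from this commit, the rest is for context):

import ray

ray.init()  # first initialization starts (or connects to) a local Ray runtime

# A second bare ray.init() in the same process would raise a RuntimeError.
# With ignore_reinit_error=True the call logs a warning and becomes a no-op,
# returning the already-running context instead of failing.
ray.init(ignore_reinit_error=True)

ray.shutdown()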

File tree

7 files changed: +37 additions, -8 deletions


.github/actions/nm-set-env/action.yml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ runs:
 NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }})
 echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV
 echo "VLLM_INSTALL_PUNICA_KERNELS=1" >> $GITHUB_ENV
+echo "NCCL_IGNORE_DISABLED_P2P=1" >> $GITHUB_ENV
 echo "PYENV_ROOT=/usr/local/apps/pyenv" >> $GITHUB_ENV
 echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV
 WHOAMI=$(whoami)

.github/scripts/run-tests

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ if [ ! -d "${TEST_DIR}" ]; then
 fi
 
 # run tests serially
-TESTS_DOT_PY=$(find ${TEST_DIR} -not -name "__init__.py" -name "*.py")
+TESTS_DOT_PY=$(find ${TEST_DIR} -name "test*.py")
 TESTS_TO_RUN=($TESTS_DOT_PY)
 SUCCESS=0
 for TEST in "${TESTS_TO_RUN[@]}"
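
As a rough illustration of the new selection rule (a Python sketch over a hypothetical test directory; the actual runner uses the shell find command above): only files whose names start with "test" are collected, so helper modules such as conftest.py are no longer picked up as tests.

from pathlib import Path

# Hypothetical directory, used only for illustration.
test_dir = Path("tests/distributed")

# Old behaviour: every *.py file except __init__.py (helper modules included).
old_selection = [p for p in test_dir.rglob("*.py") if p.name != "__init__.py"]

# New behaviour: only files matching test*.py are treated as tests.
new_selection = list(test_dir.rglob("test*.py"))

print(f"old: {len(old_selection)} files, new: {len(new_selection)} files")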

.github/workflows/remote-push.yml

Lines changed: 17 additions & 4 deletions
@@ -11,15 +11,28 @@ concurrency:
 
 jobs:
 
-    # TODO: expand python matrix later, once CI system has
-    # matured.
+    # TODO: expand python matrix later, once CI system has matured.
 
-    # TODO: enable this later
-    AWS-AVX2-32G-A10G-24G:
+    # multi-gpu
+    AWS-AVX2-192G-4-A10G-96G:
         strategy:
             matrix:
                 python: [3.10.12]
         uses: ./.github/workflows/build-test.yml
+        with:
+            label: aws-avx2-192G-4-a10g-96G
+            timeout: 180
+            gitref: '${{ github.ref }}'
+            Gi_per_thread: 4
+            python: ${{ matrix.python }}
+        secrets: inherit
+
+    # single gpu
+    AWS-AVX2-32G-A10G-24G:
+        strategy:
+            matrix:
+                python: [3.11.4]
+        uses: ./.github/workflows/build-test.yml
         with:
             label: aws-avx2-32G-a10g-24G
             timeout: 180

tests/distributed/test_basic_distributed_correctness.py

Lines changed: 3 additions & 1 deletion
@@ -5,9 +5,11 @@
 import pytest
 import torch
 
+
+# TODO: just picking one, need to update test runner to selectively use "--forked"
 MODELS = [
     "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    # "meta-llama/Llama-2-7b-hf",
 ]
 
 
tests/distributed/test_custom_all_reduce.py

Lines changed: 13 additions & 0 deletions
@@ -73,6 +73,19 @@ def eager_allreduce(world_size, rank, distributed_init_port):
     assert torch.allclose(out, inp * world_size)
 
 
+def P2P_disabled():
+    num_gpus = torch.cuda.device_count()
+    for kk in range(num_gpus):
+        for jj in range(kk, num_gpus):
+            if torch.cuda.can_device_access_peer(
+                    device=torch.device(f"cuda:{kk}"),
+                    peer_device=torch.device(f"cuda:{jj}")):
+                return False
+    return True
+
+
+@pytest.mark.skipif(P2P_disabled(),
+                    reason="Cuda failure 'peer access is not supported between these two devices'")
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("tensor_parallel_size", [2])
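
For reference, the peer-access query that the new P2P_disabled() guard is built on can be exercised on its own; a minimal sketch (assuming at least one CUDA device is visible) that prints the same information the skip condition relies on:

import torch

if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    for src in range(num_gpus):
        for dst in range(num_gpus):
            if src == dst:
                continue
            # True if device `src` can directly access memory on device `dst`.
            ok = torch.cuda.can_device_access_peer(
                torch.device(f"cuda:{src}"), torch.device(f"cuda:{dst}"))
            print(f"cuda:{src} -> cuda:{dst} peer access: {ok}")
else:
    print("no CUDA devices visible")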

tests/entrypoints/test_openai_server.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def zephyr_lora_files():
 
 @pytest.fixture(scope="session")
 def server(zephyr_lora_files):
-    ray.init()
+    ray.init(ignore_reinit_error=True)
     server_runner = ServerRunner.remote([
         "--model",
         MODEL_NAME,

vllm/test_utils.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def multi_process_tensor_parallel(
 ) -> None:
     # Using ray helps debugging the error when it failed
     # as compared to multiprocessing.
-    ray.init()
+    ray.init(ignore_reinit_error=True)
 
     distributed_init_port = get_open_port()
     refs = []
