Merge branch 'branch-24.10' into fea-remove-adaptor-factories
harrism committed Jul 31, 2024
2 parents 507777c + 2a79a83 commit 899590e
Showing 4 changed files with 32 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,7 +5,7 @@
"args": {
"CUDA": "12.5",
"PYTHON_PACKAGE_MANAGER": "pip",
"BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.15.0-openmpi-ubuntu22.04"
"BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
13 changes: 7 additions & 6 deletions ci/release/update-version.sh
@@ -69,18 +69,19 @@ DEPENDENCIES=(
pyraft
raft-dask
rmm
ucx-py
rapids-dask-dependency
)
for DEP in "${DEPENDENCIES[@]}"; do
for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do
for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do
for DEP in "${DEPENDENCIES[@]}"; do
sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0/g" "${FILE}"
done
for FILE in python/**/pyproject.toml python/**/**/pyproject.toml; do
sed_runner "/-.* ucx-py\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0/g" "${FILE}"
done
for FILE in python/**/pyproject.toml python/**/**/pyproject.toml; do
for DEP in "${DEPENDENCIES[@]}"; do
sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0\"/g" "${FILE}"
done
sed_runner "/\"ucx-py\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0\"/g" "${FILE}"
done

# ucx-py version
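For reference, the sed expressions above rewrite dependency pins in place: on any line that pins a dependency (optionally with a -cuXX suffix), everything after "==" is replaced with the next release tag plus the ">=0.0.0a0" alpha floor. A minimal Python sketch of that substitution, where the NEXT_SHORT_TAG_PEP440 and NEXT_UCX_PY_VERSION values are assumed for illustration:

import re

NEXT_SHORT_TAG_PEP440 = "24.10"  # assumed value, for illustration only
NEXT_UCX_PY_VERSION = "0.40"     # assumed value, for illustration only

def rewrite_pin(line: str, dep: str, new_version: str) -> str:
    """Python mirror of: /-.* DEP(-cuNN)?==/ s/==.*/==NEW.*,>=0.0.0a0/g"""
    if re.search(rf"-.* {re.escape(dep)}(-cu\d{{2}})?==", line):
        return re.sub(r"==.*", f"=={new_version}.*,>=0.0.0a0", line)
    return line

print(rewrite_pin("  - rmm-cu12==24.08.*,>=0.0.0a0", "rmm", NEXT_SHORT_TAG_PEP440))
# "  - rmm-cu12==24.10.*,>=0.0.0a0"
print(rewrite_pin("  - ucx-py-cu12==0.39.*", "ucx-py", NEXT_UCX_PY_VERSION))
# "  - ucx-py-cu12==0.40.*,>=0.0.0a0"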
4 changes: 2 additions & 2 deletions dependencies.yaml
@@ -506,12 +506,12 @@ dependencies:
cuda: "11.*"
cuda_suffixed: "true"
packages:
- &ucx_py_cu11 ucx-py-cu11==24.10.*,>=0.0.0a0
- &ucx_py_cu11 ucx-py-cu11==0.40.*,>=0.0.0a0
- matrix:
cuda: "12.*"
cuda_suffixed: "true"
packages:
- &ucx_py_cu12 ucx-py-cu12==24.10.*,>=0.0.0a0
- &ucx_py_cu12 ucx-py-cu12==0.40.*,>=0.0.0a0
- matrix:
packages:
- *ucx_py_unsuffixed
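The pins above switch ucx-py from the RAPIDS calendar version (24.10.*) to its own 0.40.* series. The trailing ">=0.0.0a0" clause is what lets nightly (alpha) builds satisfy the pin; a small check with the packaging library (assumed available) illustrates the effect:

from packaging.specifiers import SpecifierSet

with_alpha = SpecifierSet("==0.40.*,>=0.0.0a0")   # pin style used above
without_alpha = SpecifierSet("==0.40.*")

print(with_alpha.contains("0.40.0a20"))     # True  - nightly/alpha builds are allowed
print(without_alpha.contains("0.40.0a20"))  # False - pre-releases excluded by default
print(with_alpha.contains("0.40.1"))        # True  - final releases still match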
@@ -13,6 +13,7 @@

import pytest
import numpy as np
import os

from cugraph.gnn import FeatureStore

@@ -21,18 +22,23 @@
pylibwholegraph = import_optional("pylibwholegraph")
wmb = import_optional("pylibwholegraph.binding.wholememory_binding")
torch = import_optional("torch")
wgth = import_optional("pylibwholegraph.torch")


def runtest(world_rank: int, world_size: int):
from pylibwholegraph.torch.initialize import init_torch_env_and_create_wm_comm
def runtest(rank: int, world_size: int):
torch.cuda.set_device(rank)

wm_comm, _ = init_torch_env_and_create_wm_comm(
world_rank,
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)

pylibwholegraph.torch.initialize.init(
rank,
world_size,
world_rank,
rank,
world_size,
)
wm_comm = wm_comm.wmb_comm
wm_comm = wgth.get_global_communicator()

generator = np.random.default_rng(62)
arr = (
Expand All @@ -52,36 +58,32 @@ def runtest(world_rank: int, world_size: int):
expected = arr[indices_to_fetch]
np.testing.assert_array_equal(output_fs.cpu().numpy(), expected)

wmb.finalize()
pylibwholegraph.torch.initialize.finalize()


@pytest.mark.sg
@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
@pytest.mark.skipif(
isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available"
)
@pytest.mark.skip(reason="broken")
def test_feature_storage_wholegraph_backend():
from pylibwholegraph.utils.multiprocess import multiprocess_run
world_size = torch.cuda.device_count()
print("gpu count:", world_size)
assert world_size > 0

gpu_count = wmb.fork_get_gpu_count()
print("gpu count:", gpu_count)
assert gpu_count > 0
print("ignoring gpu count and running on 1 GPU only")

multiprocess_run(1, runtest)
torch.multiprocessing.spawn(runtest, args=(1,), nprocs=1)


@pytest.mark.mg
@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
@pytest.mark.skipif(
isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available"
)
@pytest.mark.skip(reason="broken")
def test_feature_storage_wholegraph_backend_mg():
from pylibwholegraph.utils.multiprocess import multiprocess_run

gpu_count = wmb.fork_get_gpu_count()
print("gpu count:", gpu_count)
assert gpu_count > 0
world_size = torch.cuda.device_count()
print("gpu count:", world_size)
assert world_size > 0

multiprocess_run(gpu_count, runtest)
torch.multiprocessing.spawn(runtest, args=(world_size,), nprocs=world_size)
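Pulled together, the updated test replaces pylibwholegraph's multiprocess_run / fork_get_gpu_count harness with torch.multiprocessing.spawn plus an explicit torch.distributed NCCL group per rank. A condensed, self-contained sketch of that launch pattern; the MASTER_ADDR/MASTER_PORT values and the FeatureStore body are placeholders, and the init argument order follows the test above:

import os

import torch
import torch.distributed
import torch.multiprocessing

import pylibwholegraph.torch as wgth
from pylibwholegraph.torch.initialize import init as wg_init, finalize as wg_finalize


def worker(rank: int, world_size: int):
    # one process per GPU: bind this rank to its device
    torch.cuda.set_device(rank)

    # placeholder rendezvous settings, as in the test above
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)

    # (world_rank, world_size, local_rank, local_size) -- argument order as used above
    wg_init(rank, world_size, rank, world_size)
    comm = wgth.get_global_communicator()  # WholeGraph communicator shared by all ranks

    # ... build a cugraph.gnn.FeatureStore(backend="wholegraph") and fetch features here ...

    wg_finalize()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    torch.multiprocessing.spawn(worker, args=(world_size,), nprocs=world_size)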
