From 4dd8fcaf6ec84ee3b70cda6af8285c491052588c Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Tue, 30 Jul 2024 20:01:55 -0500
Subject: [PATCH 1/2] Fix ucx-py version, use UCX 1.17.0 in pip devcontainers
 (#4562)

Fixes the ucx-py dependency in `dependencies.yaml` and `update-version.sh`,
and updates the pip devcontainers to UCX 1.17.0 (context:
https://github.com/rapidsai/cugraph/pull/4562#issuecomment-2258608174).

Authors:
  - Bradley Dice (https://github.com/bdice)
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cugraph/pull/4562
---
 .devcontainer/cuda12.5-pip/devcontainer.json |  2 +-
 ci/release/update-version.sh                 | 13 +++++++------
 dependencies.yaml                            |  4 ++--
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index 1c53bd9925d..6e2bf45700a 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,7 +5,7 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.15.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index ce2488ad0bf..08c22fca02e 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -69,18 +69,19 @@ DEPENDENCIES=(
   pyraft
   raft-dask
   rmm
-  ucx-py
   rapids-dask-dependency
 )
-for DEP in "${DEPENDENCIES[@]}"; do
-  for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do
+for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do
+  for DEP in "${DEPENDENCIES[@]}"; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
-    sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0/g" "${FILE}"
   done
-  for FILE in python/**/pyproject.toml python/**/**/pyproject.toml; do
+  sed_runner "/-.* ucx-py\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0/g" "${FILE}"
+done
+for FILE in python/**/pyproject.toml python/**/**/pyproject.toml; do
+  for DEP in "${DEPENDENCIES[@]}"; do
     sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
-    sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0\"/g" "${FILE}"
   done
+  sed_runner "/\"ucx-py\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0\"/g" "${FILE}"
 done
 
 # ucx-py version
diff --git a/dependencies.yaml b/dependencies.yaml
index b0e4834a50c..9e9cfcc63a3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -506,12 +506,12 @@ dependencies:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - &ucx_py_cu11 ucx-py-cu11==24.10.*,>=0.0.0a0
+              - &ucx_py_cu11 ucx-py-cu11==0.40.*,>=0.0.0a0
           - matrix:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - &ucx_py_cu12 ucx-py-cu12==24.10.*,>=0.0.0a0
+              - &ucx_py_cu12 ucx-py-cu12==0.40.*,>=0.0.0a0
           - matrix:
             packages:
               - *ucx_py_unsuffixed

From 5458e76c332dd80e025704344307e1b96cc911b6 Mon Sep 17 00:00:00 2001
From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com>
Date: Tue, 30 Jul 2024 22:04:45 -0400
Subject: [PATCH 2/2] [BUG] Fix Failing WholeGraph Tests (#4560)

This PR properly uses PyTorch DDP to initialize a process group and test the
WholeGraph feature store. Previously, the tests relied on a WholeGraph API
that no longer appears to work.

Authors:
  - Alex Barghi (https://github.com/alexbarghi-nv)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4560
---
 .../test_gnn_feat_storage_wholegraph.py      | 42 ++++++++++---------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py
index 0a272e445fa..30336490312 100644
--- a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py
+++ b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py
@@ -13,6 +13,7 @@
 
 import pytest
 import numpy as np
+import os
 
 from cugraph.gnn import FeatureStore
 
@@ -21,18 +22,23 @@
 pylibwholegraph = import_optional("pylibwholegraph")
 wmb = import_optional("pylibwholegraph.binding.wholememory_binding")
 torch = import_optional("torch")
+wgth = import_optional("pylibwholegraph.torch")
 
 
-def runtest(world_rank: int, world_size: int):
-    from pylibwholegraph.torch.initialize import init_torch_env_and_create_wm_comm
+def runtest(rank: int, world_size: int):
+    torch.cuda.set_device(rank)
 
-    wm_comm, _ = init_torch_env_and_create_wm_comm(
-        world_rank,
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    pylibwholegraph.torch.initialize.init(
+        rank,
         world_size,
-        world_rank,
+        rank,
         world_size,
     )
-    wm_comm = wm_comm.wmb_comm
+    wm_comm = wgth.get_global_communicator()
 
     generator = np.random.default_rng(62)
     arr = (
@@ -52,7 +58,7 @@ def runtest(world_rank: int, world_size: int):
     expected = arr[indices_to_fetch]
     np.testing.assert_array_equal(output_fs.cpu().numpy(), expected)
 
-    wmb.finalize()
+    pylibwholegraph.torch.initialize.finalize()
 
 
 @pytest.mark.sg
@@ -60,15 +66,14 @@ def runtest(world_rank: int, world_size: int):
 @pytest.mark.skipif(
     isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available"
 )
-@pytest.mark.skip(reason="broken")
 def test_feature_storage_wholegraph_backend():
-    from pylibwholegraph.utils.multiprocess import multiprocess_run
+    world_size = torch.cuda.device_count()
+    print("gpu count:", world_size)
+    assert world_size > 0
 
-    gpu_count = wmb.fork_get_gpu_count()
-    print("gpu count:", gpu_count)
-    assert gpu_count > 0
+    print("ignoring gpu count and running on 1 GPU only")
 
-    multiprocess_run(1, runtest)
+    torch.multiprocessing.spawn(runtest, args=(1,), nprocs=1)
 
 
 @pytest.mark.mg
@@ -76,12 +81,9 @@ def test_feature_storage_wholegraph_backend():
 @pytest.mark.skipif(
     isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available"
 )
-@pytest.mark.skip(reason="broken")
 def test_feature_storage_wholegraph_backend_mg():
-    from pylibwholegraph.utils.multiprocess import multiprocess_run
-
-    gpu_count = wmb.fork_get_gpu_count()
-    print("gpu count:", gpu_count)
-    assert gpu_count > 0
+    world_size = torch.cuda.device_count()
+    print("gpu count:", world_size)
+    assert world_size > 0
 
-    multiprocess_run(gpu_count, runtest)
+    torch.multiprocessing.spawn(runtest, args=(world_size,), nprocs=world_size)
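For reference, the process bootstrap introduced by the second patch can be read out of the diff as a standalone sketch. This note is not part of the patches above: the worker name and the localhost:12355 rendezvous address mirror the updated test, a single node with one process per GPU is assumed, and the FeatureStore assertions from the test are elided.

    import os

    import torch
    import torch.distributed
    import torch.multiprocessing
    import pylibwholegraph.torch as wgth
    import pylibwholegraph.torch.initialize


    def worker(rank: int, world_size: int):
        # One process per GPU: bind this process to its device first.
        torch.cuda.set_device(rank)

        # Single-node rendezvous, hard-coded as in the test.
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)

        # WholeGraph reuses the torch.distributed bootstrap; on a single node the
        # global and local rank/size are identical, hence the repeated arguments.
        pylibwholegraph.torch.initialize.init(rank, world_size, rank, world_size)
        wm_comm = wgth.get_global_communicator()

        # ... build the cugraph.gnn.FeatureStore under test and run assertions here ...

        pylibwholegraph.torch.initialize.finalize()


    if __name__ == "__main__":
        n_gpus = torch.cuda.device_count()
        torch.multiprocessing.spawn(worker, args=(n_gpus,), nprocs=n_gpus)

Note that the single-GPU test intentionally spawns with nprocs=1 and ignores the detected GPU count, while the multi-GPU test spawns one process per detected GPU.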