Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix a bug in tensornet backend scratch pad allocation in multi-GPU mode #2516

Merged
merged 4 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions runtime/nvqir/cutensornet/simulator_cutensornet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ SimulatorTensorNetBase::SimulatorTensorNetBase()
cudaq::mpi::is_initialized() ? cudaq::mpi::rank() % numDevices : 0;
HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
HANDLE_CUTN_ERROR(cutensornetCreate(&m_cutnHandle));
// The scratch pad must be allocated after we have selected the device.
scratchPad.allocate();
}

static std::vector<std::complex<double>>
Expand Down
13 changes: 11 additions & 2 deletions runtime/nvqir/cutensornet/tensornet_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ struct ScratchDeviceMem {
2; // use half of available memory with alignment
}

ScratchDeviceMem() {
// Allocate scratch device memory based on available memory
void allocate() {
if (d_scratch)
throw std::runtime_error(
"Multiple scratch device memory allocations is not allowed.");

computeScratchSize();
// Try allocate device memory
auto errCode = cudaMalloc(&d_scratch, scratchSize);
Expand All @@ -86,7 +91,11 @@ struct ScratchDeviceMem {
HANDLE_CUDA_ERROR(errCode);
}
}
~ScratchDeviceMem() { HANDLE_CUDA_ERROR(cudaFree(d_scratch)); }

// Releases the scratch buffer if one was allocated.
// NOTE(review): assumes scratchSize is zero until allocate() succeeds, so a
// never-allocated instance skips cudaFree entirely — confirm scratchSize is
// zero-initialized in the member declarations (not visible in this hunk).
// Guarding on scratchSize (rather than d_scratch) also avoids calling
// cudaFree after a CUDA context teardown when no memory was ever reserved.
~ScratchDeviceMem() {
if (scratchSize > 0)
HANDLE_CUDA_ERROR(cudaFree(d_scratch));
}
};

/// Initialize `cutensornet` MPI Comm
Expand Down
23 changes: 23 additions & 0 deletions unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,29 @@ if(TARGET nvqir-tensornet)
message(STATUS "Building cutensornet backend tests.")
create_tests_with_backend(tensornet "")
create_tests_with_backend(tensornet-mps "")
if (MPI_CXX_FOUND)
  # Count the number of GPUs via nvidia-smi (if present on this machine).
  find_program(NVIDIA_SMI "nvidia-smi")
  if(NVIDIA_SMI)
    # OUTPUT_STRIP_TRAILING_WHITESPACE is required: `wc -l` emits a trailing
    # newline, and an unstripped value would not compare reliably as a number.
    execute_process(COMMAND bash -c "nvidia-smi --list-gpus | wc -l"
                    OUTPUT_VARIABLE NGPUS
                    OUTPUT_STRIP_TRAILING_WHITESPACE)
    # Only build this test when at least 2 GPUs are available.
    if (${NGPUS} GREATER_EQUAL 2)
      message(STATUS "Building cutensornet MPI tests.")
      add_executable(test_tensornet_mpi mpi/tensornet_mpi_tester.cpp)
      if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
        target_link_options(test_tensornet_mpi PRIVATE -Wl,--no-as-needed)
      endif()
      target_link_libraries(test_tensornet_mpi
        PRIVATE
        cudaq
        cudaq-platform-default
        nvqir-tensornet
        gtest)
      # Launch with 2 ranks; --allow-run-as-root supports containerized CI.
      add_test(NAME TensornetMPITest COMMAND ${MPIEXEC} --allow-run-as-root -np 2 ${CMAKE_BINARY_DIR}/unittests/test_tensornet_mpi)
      set_tests_properties(TensornetMPITest PROPERTIES LABELS "gpu_required;mgpus_required")
    endif() # NGPUS
  endif() # NVIDIA_SMI
endif() # MPI_CXX_FOUND
endif()

# Create an executable for SpinOp UnitTests
Expand Down
44 changes: 44 additions & 0 deletions unittests/mpi/tensornet_mpi_tester.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*******************************************************************************
* Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/
#include <cudaq.h>
#include <gtest/gtest.h>

// Sanity check: main() must have initialized MPI before any test runs.
TEST(TensornetMPITester, checkInit) {
  EXPECT_TRUE(cudaq::mpi::is_initialized());
  const auto rank = cudaq::mpi::rank();
  std::cout << "Rank = " << rank << "\n";
}

// Samples a 50-qubit GHZ state across MPI ranks and verifies, on rank 0 only,
// that exactly the two expected bitstrings (all-zeros / all-ones) appear.
TEST(TensornetMPITester, checkSimple) {
  constexpr std::size_t numQubits = 50;
  auto kernel = []() __qpu__ {
    cudaq::qvector q(numQubits);
    h(q[0]);
    // CNOT chain entangles all qubits into a GHZ state.
    // Use std::size_t for the index to avoid a signed/unsigned comparison
    // against the std::size_t qubit count.
    for (std::size_t i = 0; i + 1 < numQubits; i++)
      x<cudaq::ctrl>(q[i], q[i + 1]);
    mz(q);
  };

  auto counts = cudaq::sample(100, kernel);

  // Sampling results are gathered on rank 0; other ranks skip validation.
  if (cudaq::mpi::rank() == 0) {
    EXPECT_EQ(2, counts.size());

    for (auto &[bits, count] : counts) {
      // Cast explicitly so the %lu specifier is correct regardless of the
      // underlying (project-defined) unsigned type of `count`.
      printf("Observed: %s, %lu\n", bits.data(),
             static_cast<unsigned long>(count));
      EXPECT_EQ(numQubits, bits.size());
    }
  }
}

// Test entry point: gtest is configured first, then MPI is brought up for the
// whole binary, torn down again after all tests have run, and the gtest
// result code is propagated to the shell.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  cudaq::mpi::initialize();
  const int rc = RUN_ALL_TESTS();
  cudaq::mpi::finalize();
  return rc;
}
Loading