Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Print gpus used on simulator startup #5611

Merged
merged 2 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists_files.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ endif()

# add these files if we should compile the hip code
if (HAVE_CUDA)
list(APPEND MAIN_SOURCE_FILES opm/simulators/linalg/gpuistl/device_management.hpp) # should not be hipified to make main independant of library
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg device_management.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuBlasHandle.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/gpusparse_matrix_operations.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
Expand Down
6 changes: 1 addition & 5 deletions opm/simulators/flow/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,6 @@
#include <opm/simulators/utils/DamarisOutputModule.hpp>
#endif

#if HAVE_CUDA
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
#endif

namespace Opm {

Main::Main(int argc, char** argv, bool ownMPI)
Expand Down Expand Up @@ -163,7 +159,7 @@ void Main::initMPI()
}

#if HAVE_CUDA
Opm::gpuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
Opm::gpuistl::setDevice();
#endif

#endif // HAVE_MPI
Expand Down
8 changes: 8 additions & 0 deletions opm/simulators/flow/Main.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@
#include <opm/simulators/utils/ParallelEclipseState.hpp>
#endif

#if HAVE_CUDA
#include <opm/simulators/linalg/gpuistl/device_management.hpp>
#endif

#if HAVE_DAMARIS
#include <opm/simulators/utils/DamarisKeywords.hpp>
#endif
Expand Down Expand Up @@ -426,6 +430,10 @@ class Main
return false;
}

#if HAVE_CUDA
Opm::gpuistl::printDevice();
#endif

exitCode = EXIT_SUCCESS;
return true;
}
Expand Down
84 changes: 84 additions & 0 deletions opm/simulators/linalg/gpuistl/device_management.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
Copyright 2024 SINTEF AS

This file is part of the Open Porous Media project (OPM).

OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/

#include <config.h>

#include <opm/simulators/flow/FlowGenericVanguard.hpp>
#include <opm/simulators/utils/DeferredLogger.hpp>
#if HAVE_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
#endif

namespace Opm::gpuistl {

/*
* Print the device name and compute capability on every rank

If you have an AMD GPU and you have an AMD CPU you might run
into problems with this code when using multiple MPI ranks.
The simulation might hang because the integrated GPU in the CPU
is detected has Radeon compute units, but it does not support ROCM.
This is fixable my making only the GPUS on your system visible with
ROCR_VISIBLE_DEVICES environment variable.
*/
void printDevice()
{
int mpiRank = 0;
#if HAVE_CUDA
#if HAVE_MPI
mpiRank = FlowGenericVanguard::comm().rank();
#endif

int deviceCount = -1;
OPM_GPU_WARN_IF_ERROR(cudaGetDeviceCount(&deviceCount));

const auto deviceId = mpiRank % deviceCount;

struct cudaDeviceProp props;
OPM_GPU_WARN_IF_ERROR(cudaGetDeviceProperties(&props, deviceId));

std::string out;
out = fmt::format("rank: {}, GPU: {}, Compute Capability: {}.{} (device {} out of {})\n",
mpiRank, props.name, props.major, props.minor, deviceId, deviceCount);
auto deferred_logger = ::Opm::DeferredLogger();
deferred_logger.info(out);

DeferredLogger global = gatherDeferredLogger(deferred_logger, FlowGenericVanguard::comm());
if (mpiRank == 0) {
global.logMessages();
}

#endif
}

void setDevice()
{
#if HAVE_CUDA
#if HAVE_MPI
Opm::gpuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
#else
Opm::gpuistl::setDevice(0, 1);
#endif
#endif
}

} // namespace Opm::gpuistl
34 changes: 34 additions & 0 deletions opm/simulators/linalg/gpuistl/device_management.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
Copyright 2024 SINTEF AS

This file is part of the Open Porous Media project (OPM).

OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef OPM_GPUISTL_DEVICE_MANAGEMENT
#define OPM_GPUISTL_DEVICE_MANAGEMENT

/*
This file should not be hipified, and serves as a layer between main and gpuistl/set_device
that does not depend on the library such that the simulatorobjects to not depend
on the library and can be built in parallel.
*/

namespace Opm::gpuistl {
void printDevice();
void setDevice();
}

#endif // namespace Opm::gpuistl
15 changes: 8 additions & 7 deletions opm/simulators/linalg/gpuistl/set_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,34 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <sstream>
#include <cuda_runtime.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/simulators/flow/FlowGenericVanguard.hpp>
#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
#include <opm/simulators/linalg/gpuistl/set_device.hpp>

namespace Opm::gpuistl
{
void
setDevice(int mpiRank, [[maybe_unused]] int numberOfMpiRanks)
{

int deviceCount = -1;
[[maybe_unused]] auto cuError = cudaGetDeviceCount(&deviceCount);

if (deviceCount <= 0) {
// If they have CUDA enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
// If they have CUDA/HIP enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
// later down the line. At this point in the simulator, we can not determine if CUDA is enabled, so we can only
// issue a warning.
OpmLog::warning("Could not find any CUDA devices.");
OpmLog::warning("Could not find any CUDA/HIP devices.");
return;
}

// Now do a round robin kind of assignment
// TODO: We need to be more sophistacted here. We have no guarantee this will pick the correct device.
const auto deviceId = mpiRank % deviceCount;
OPM_GPU_SAFE_CALL(cudaDeviceReset());
OPM_GPU_SAFE_CALL(cudaSetDevice(deviceId));
OpmLog::info("Set CUDA device to " + std::to_string(deviceId) + " (out of " + std::to_string(deviceCount)
+ " devices).");
OPM_GPU_WARN_IF_ERROR(cudaDeviceReset());
OPM_GPU_WARN_IF_ERROR(cudaSetDevice(deviceId));
}

} // namespace Opm::gpuistl
6 changes: 4 additions & 2 deletions opm/simulators/linalg/gpuistl/set_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef OPM_CUISTL_SET_DEVICE_HEADER
#define OPM_CUISTL_SET_DEVICE_HEADER
#ifndef OPM_GPUISTL_SET_DEVICE_HEADER
#define OPM_GPUISTL_SET_DEVICE_HEADER

namespace Opm::gpuistl
{
Expand All @@ -32,5 +32,7 @@ namespace Opm::gpuistl
//!
//! @note If no CUDA device is present, this does nothing.
void setDevice(int mpiRank, int numberOfMpiRanks);

void printDevice(int mpiRank, int numberOfMpiRanks);
} // namespace Opm::gpuistl
#endif