Print device peak BW using NVML
gonzalobg committed May 30, 2024
Parent: 46b6d41, commit: 51231ac
Showing 2 changed files with 48 additions and 19 deletions.
src/cuda/CUDAStream.cu: 64 changes (45 additions, 19 deletions)
@@ -5,18 +5,25 @@
 // source code
 
 #include "CUDAStream.h"
+#include <nvml.h>
 
 #if !defined(UNROLL_FACTOR)
 #define UNROLL_FACTOR 4
 #endif
 
-[[noreturn]] inline void error(char const* file, int line, char const* expr, cudaError_t e) {
-  std::fprintf(stderr, "Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr);
+[[noreturn]] inline void cuda_error(char const* file, int line, char const* expr, cudaError_t e) {
+  std::fprintf(stderr, "CUDA Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr);
   exit(e);
 }
 
+[[noreturn]] inline void nvml_error(char const* file, int line, char const* expr, nvmlReturn_t e) {
+  std::fprintf(stderr, "NVML Error at %s:%d: %s (%d)\n %s\n", file, line, nvmlErrorString(e), e, expr);
+  exit(e);
+}
+
 // The do while is there to make sure you remember to put a semi-colon after calling CU
-#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) error(__FILE__, __LINE__, #EXPR, __e); } while(false)
+#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) cuda_error(__FILE__, __LINE__, #EXPR, __e); } while(false)
+#define NVML(EXPR) do { auto __e = (EXPR); if (__e != NVML_SUCCESS) nvml_error(__FILE__, __LINE__, #EXPR, __e); } while(false)
 
 // It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__ const
 __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }
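Note on the do/while comment above: wrapping the check in do { ... } while(false) makes CU(...) and NVML(...) expand to a single statement that requires a trailing semicolon, so the macros can be placed anywhere an ordinary function call could. A minimal, self-contained sketch of the failure mode the wrapper avoids (illustrative only; the allocation sizes and the simplified macro body here are not from the commit):

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Same shape as the CU macro above, minus the separate error function.
#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) { \
    std::fprintf(stderr, "%s: %s\n", #EXPR, cudaGetErrorString(__e)); std::exit(1); } } while(false)

int main() {
  void* p = nullptr;
  bool big = true;
  // Without the do/while, the macro would expand to a bare if-statement, and the
  // semicolon after it would leave the "else" below with nothing to pair with.
  if (big)
    CU(cudaMalloc(&p, 1 << 20));
  else
    CU(cudaMalloc(&p, 1 << 10));
  CU(cudaFree(p));
  return 0;
}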
@@ -83,20 +90,39 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   CU(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
   // Print out device information
-  std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
-  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
-#if defined(MANAGED)
-  std::cout << "Memory: MANAGED" << std::endl;
-#elif defined(PAGEFAULT)
-  std::cout << "Memory: PAGEFAULT" << std::endl;
-#else
-  std::cout << "Memory: DEFAULT" << std::endl;
-#endif
-
-  // Query device for sensible dot kernel block count
-  cudaDeviceProp props;
-  CU(cudaGetDeviceProperties(&props, device_index));
-  dot_num_blocks = props.multiProcessorCount * 4;
+  std::cout << "CUDA Driver: " << getDeviceDriver(device_index) << std::endl;
+  NVML(nvmlInit());
+  cudaDeviceProp dprop;
+  CU(cudaGetDeviceProperties(&dprop, device_index));
+  unsigned int memclock;
+  char mybus[16];
+  sprintf(&mybus[0], "%04x:%02x:%02x.0", dprop.pciDomainID, dprop.pciBusID, dprop.pciDeviceID);
+  nvmlDevice_t nvmldev;
+  NVML(nvmlDeviceGetHandleByPciBusId(mybus, &nvmldev));
+  NVML(nvmlDeviceGetClockInfo(nvmldev, NVML_CLOCK_MEM, &memclock));
+  std::cout << "CUDA Device " << device_index << ": \""
+            << getDeviceName(device_index)
+            << "\" " << dprop.multiProcessorCount << " SMs(" << dprop.major << "," << dprop.minor << ") "
+            << "Memory: " << memclock << " MHz x " << dprop.memoryBusWidth << "-bit = "
+            << 2.0*memclock*(dprop.memoryBusWidth/8)/1000.0 << " GB/s PEAK, ECC is "
+            << (dprop.ECCEnabled ? "ON" : "OFF")
+            << std::endl;
+
+  // Print Memory allocation API used for buffers
+  std::cout << "Memory Allocation: ";
+#if defined(MANAGED)
+  std::cout << "MANAGED";
+#elif defined(PAGEFAULT)
+  std::cout << "PAGEFAULT";
+#else
+  std::cout << "DEFAULT";
+#endif
+  std::cout << std::endl;
+
+  std::cout << "Parallel for kernel config: thread blocks of size " << TBSIZE << std::endl;
+
+  // Set sensible dot kernel block count
+  dot_num_blocks = dprop.multiProcessorCount * 4;
 
   // Size of partial sums for dot kernels
   size_t sums_bytes = sizeof(T) * dot_num_blocks;
@@ -105,8 +131,8 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;
 
   // Check buffers fit on the device
-  if (props.totalGlobalMem < total_bytes)
-    throw std::runtime_error("Device does not have enough memory for all 3 buffers");
+  if (dprop.totalGlobalMem < total_bytes)
+    throw std::runtime_error("Device does not have enough memory for all buffers");
 
   // Allocate buffers:
   d_a = alloc_device<T>(array_size);
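For reference, the peak bandwidth the new printout reports is the usual double-data-rate estimate: 2 × memory clock (MHz) × bus width in bytes, divided by 1000 to give GB/s. A standalone sketch of the same arithmetic with illustrative numbers (roughly an A100-40GB; in the commit both values are queried at runtime from NVML and cudaGetDeviceProperties):

#include <cstdio>

int main() {
  unsigned int memclock_mhz = 1215; // NVML memory clock (NVML_CLOCK_MEM), assumed value
  int bus_width_bits = 5120;        // cudaDeviceProp::memoryBusWidth, assumed value

  // DDR-style memories transfer twice per clock, hence the factor of 2.
  double peak_gbs = 2.0 * memclock_mhz * (bus_width_bits / 8) / 1000.0;
  std::printf("Peak memory bandwidth: %.1f GB/s\n", peak_gbs); // ~1555.2 GB/s
  return 0;
}

Looking up the NVML handle via the PCI bus id assembled from cudaDeviceProp fields, rather than by index, keeps the NVML device matched to the selected CUDA device even when the two libraries enumerate GPUs in different orders.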
src/cuda/model.cmake: 3 changes (3 additions, 0 deletions)
@@ -43,6 +43,9 @@ macro(setup)
         "--extended-lambda" "-DUNROLL_FACTOR=${UNROLL_FACTOR}" ${CUDA_EXTRA_FLAGS})
     string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
 
+    # Link against the NVIDIA Management Library for device information
+    register_link_library("nvidia-ml")
+
     # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
     # appended later
     wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
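The register_link_library("nvidia-ml") call adds NVML to the link line: nvml.h ships with the CUDA toolkit, while libnvidia-ml itself comes with the display driver (the toolkit typically also provides a link-time stub under lib64/stubs). A minimal probe of what the linked library exposes, assuming a hypothetical standalone file built with something like nvcc nvml_probe.cpp -lnvidia-ml:

#include <nvml.h>
#include <cstdio>

int main() {
  if (nvmlInit() != NVML_SUCCESS) { std::fprintf(stderr, "nvmlInit failed\n"); return 1; }

  nvmlDevice_t dev;
  unsigned int mem_mhz = 0;
  // Query device 0 in NVML's own enumeration order (the commit matches by PCI bus id instead).
  if (nvmlDeviceGetHandleByIndex(0, &dev) == NVML_SUCCESS &&
      nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &mem_mhz) == NVML_SUCCESS)
    std::printf("Memory clock: %u MHz\n", mem_mhz);

  nvmlShutdown();
  return 0;
}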
