From fd4530d308bda19fe85b95d9788bfe32966668fb Mon Sep 17 00:00:00 2001 From: Matthias Knorr Date: Mon, 2 Sep 2024 18:24:54 +0200 Subject: [PATCH] Docs: Refactor hip driver API example code --- docs/how-to/hip_porting_driver_api.rst | 86 ++++++++++++-------------- 1 file changed, 39 insertions(+), 47 deletions(-) diff --git a/docs/how-to/hip_porting_driver_api.rst b/docs/how-to/hip_porting_driver_api.rst index d4d770df57..607c5f6d8a 100644 --- a/docs/how-to/hip_porting_driver_api.rst +++ b/docs/how-to/hip_porting_driver_api.rst @@ -235,77 +235,69 @@ The sample below shows how to use ``hipModuleGetFunction``. .. code-block:: cpp - #include - #include + #include + #include - #include - #include #include - #define LEN 64 - #define SIZE LEN << 2 - - #ifdef __HIP_PLATFORM_AMD__ - #define fileName "vcpy_isa.co" - #endif - - #ifdef __HIP_PLATFORM_NVIDIA__ - #define fileName "vcpy_isa.ptx" - #endif - - #define kernel_name "hello_world" - int main() { - float *A, *B; - hipDeviceptr_t Ad, Bd; - A = new float[LEN]; - B = new float[LEN]; - for (uint32_t i = 0; i < LEN; i++) { - A[i] = i * 1.0f; - B[i] = 0.0f; - std::cout << A[i] << " " << B[i] << std::endl; - } + size_t elements = 64*1024; + size_t size_bytes = elements * sizeof(float); + std::vector A(elements), B(elements); + + // On NVIDIA platforms the driver runtime needs to be initiated #ifdef __HIP_PLATFORM_NVIDIA__ hipInit(0); hipDevice_t device; hipCtx_t context; - hipDeviceGet(&device, 0); - hipCtxCreate(&context, 0, device); + HIPCHECK(hipDeviceGet(&device, 0)); + HIPCHECK(hipCtxCreate(&context, 0, device)); #endif - hipMalloc((void **)&Ad, SIZE); - hipMalloc((void **)&Bd, SIZE); + // Allocate device memory + hipDeviceptr_t d_A, d_B; + HIPCHECK(hipMalloc(&d_A, size_bytes)); + HIPCHECK(hipMalloc(&d_B, size_bytes)); + + // Copy data to device + HIPCHECK(hipMemcpyHtoD(d_A, A.data(), size_bytes)); + HIPCHECK(hipMemcpyHtoD(d_B, B.data(), size_bytes)); - hipMemcpyHtoD(Ad, A, SIZE); - hipMemcpyHtoD(Bd, B, SIZE); + // Load module hipModule_t Module; + // For AMD the module file has to contain architecture specific object codee + // For NVIDIA the module file has to contain PTX, found in e.g. "vcpy_isa.ptx" + HIPCHECK(hipModuleLoad(&Module, "vcpy_isa.co")); + // Get kernel function from the module via its name hipFunction_t Function; - hipModuleLoad(&Module, fileName); - hipModuleGetFunction(&Function, Module, kernel_name); + HIPCHECK(hipModuleGetFunction(&Function, Module, "hello_world")); - std::vector argBuffer(2); - memcpy(&argBuffer[0], &Ad, sizeof(void *)); - memcpy(&argBuffer[1], &Bd, sizeof(void *)); + // Create buffer for kernel arguments + std::vector argBuffer{&d_A, &d_B}; + size_t arg_size_bytes = argBuffer.size() * sizeof(void*); - size_t size = argBuffer.size() * sizeof(void *); + // Create configuration passed to the kernel as arguments + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, argBuffer.data(), + HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_size_bytes, HIP_LAUNCH_PARAM_END}; - void *config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], - HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END}; + int threads_per_block = 128; + int blocks = (elements + threads_per_block - 1) / threads_per_block; - hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, - (void **)&config); + // Actually launch kernel + HIPCHECK(hipModuleLaunchKernel(Function, blocks, 1, 1, threads_per_block, 1, 1, 0, 0, NULL, config)); - hipMemcpyDtoH(B, Bd, SIZE); - for (uint32_t i = 0; i < LEN; i++) { - std::cout << A[i] << " - " << B[i] << std::endl; - } + HIPCHECK(hipMemcpyDtoH(A.data(), d_A, elements)); + HIPCHECK(hipMemcpyDtoH(B.data(), d_B, elements)); #ifdef __HIP_PLATFORM_NVIDIA__ - hipCtxDetach(context); + HIPCHECK(hipCtxDetach(context)); #endif + HIPCHECK(hipFree(d_A)); + HIPCHECK(hipFree(d_B)); + return 0; }