From fd4530d308bda19fe85b95d9788bfe32966668fb Mon Sep 17 00:00:00 2001
From: Matthias Knorr <MKKnorr@web.de>
Date: Mon, 2 Sep 2024 18:24:54 +0200
Subject: [PATCH] Docs: Refactor hip driver API example code

---
 docs/how-to/hip_porting_driver_api.rst | 86 ++++++++++++--------------
 1 file changed, 39 insertions(+), 47 deletions(-)
diff --git a/docs/how-to/hip_porting_driver_api.rst b/docs/how-to/hip_porting_driver_api.rst
index d4d770df57..607c5f6d8a 100644
--- a/docs/how-to/hip_porting_driver_api.rst
+++ b/docs/how-to/hip_porting_driver_api.rst
@@ -235,77 +235,69 @@ The sample below shows how to use ``hipModuleGetFunction``.
 
 .. code-block:: cpp
 
-    #include <hip_runtime.h>
-    #include <hip_runtime_api.h>
+    #include <hip/hip_runtime.h>
+    #include <hip/hip_runtime_api.h>
 
-    #include <fstream>
-    #include <iostream>
     #include <vector>
 
-    #define LEN 64
-    #define SIZE LEN << 2
-
-    #ifdef __HIP_PLATFORM_AMD__
-    #define fileName "vcpy_isa.co"
-    #endif
-
-    #ifdef __HIP_PLATFORM_NVIDIA__
-    #define fileName "vcpy_isa.ptx"
-    #endif
-
-    #define kernel_name "hello_world"
-
     int main() {
-        float *A, *B;
-        hipDeviceptr_t Ad, Bd;
-        A = new float[LEN];
-        B = new float[LEN];
 
-        for (uint32_t i = 0; i < LEN; i++) {
-            A[i] = i * 1.0f;
-            B[i] = 0.0f;
-            std::cout << A[i] << " " << B[i] << std::endl;
-        }
+        size_t elements = 64*1024;
+        size_t size_bytes = elements * sizeof(float);
 
+        std::vector<float> A(elements), B(elements);
+
+        // On NVIDIA platforms the driver runtime needs to be initiated
         #ifdef __HIP_PLATFORM_NVIDIA__
         hipInit(0);
         hipDevice_t device;
         hipCtx_t context;
-        hipDeviceGet(&device, 0);
-        hipCtxCreate(&context, 0, device);
+        HIPCHECK(hipDeviceGet(&device, 0));
+        HIPCHECK(hipCtxCreate(&context, 0, device));
         #endif
 
-        hipMalloc((void **)&Ad, SIZE);
-        hipMalloc((void **)&Bd, SIZE);
+        // Allocate device memory
+        hipDeviceptr_t d_A, d_B;
+        HIPCHECK(hipMalloc(&d_A, size_bytes));
+        HIPCHECK(hipMalloc(&d_B, size_bytes));
+
+        // Copy data to device
+        HIPCHECK(hipMemcpyHtoD(d_A, A.data(), size_bytes));
+        HIPCHECK(hipMemcpyHtoD(d_B, B.data(), size_bytes));
 
-        hipMemcpyHtoD(Ad, A, SIZE);
-        hipMemcpyHtoD(Bd, B, SIZE);
+        // Load module
         hipModule_t Module;
+        // For AMD the module file has to contain architecture specific object codee
+        // For NVIDIA the module file has to contain PTX, found in e.g. "vcpy_isa.ptx"
+        HIPCHECK(hipModuleLoad(&Module, "vcpy_isa.co"));
+        // Get kernel function from the module via its name
         hipFunction_t Function;
-        hipModuleLoad(&Module, fileName);
-        hipModuleGetFunction(&Function, Module, kernel_name);
+        HIPCHECK(hipModuleGetFunction(&Function, Module, "hello_world"));
 
-        std::vector<void *> argBuffer(2);
-        memcpy(&argBuffer[0], &Ad, sizeof(void *));
-        memcpy(&argBuffer[1], &Bd, sizeof(void *));
+        // Create buffer for kernel arguments
+        std::vector<void*> argBuffer{&d_A, &d_B};
+        size_t arg_size_bytes = argBuffer.size() * sizeof(void*);
 
-        size_t size = argBuffer.size() * sizeof(void *);
+        // Create configuration passed to the kernel as arguments
+        void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, argBuffer.data(),
+                          HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_size_bytes, HIP_LAUNCH_PARAM_END};
 
-        void *config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
-                            HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
+        int threads_per_block = 128;
+        int blocks = (elements + threads_per_block - 1) / threads_per_block;
 
-        hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL,
-                                (void **)&config);
+        // Actually launch kernel
+        HIPCHECK(hipModuleLaunchKernel(Function, blocks, 1, 1, threads_per_block, 1, 1, 0, 0, NULL, config));
 
-        hipMemcpyDtoH(B, Bd, SIZE);
-        for (uint32_t i = 0; i < LEN; i++) {
-            std::cout << A[i] << " - " << B[i] << std::endl;
-        }
+        HIPCHECK(hipMemcpyDtoH(A.data(), d_A, elements));
+        HIPCHECK(hipMemcpyDtoH(B.data(), d_B, elements));
 
         #ifdef __HIP_PLATFORM_NVIDIA__
-        hipCtxDetach(context);
+        HIPCHECK(hipCtxDetach(context));
         #endif
 
+        HIPCHECK(hipFree(d_A));
+        HIPCHECK(hipFree(d_B));
+
         return 0;
     }