Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated hip kernels #139

Merged
merged 7 commits into from
May 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 33 additions & 32 deletions src/hip/HIPStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include "hip/hip_runtime.h"

#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256


void check_error(void)
{
Expand Down Expand Up @@ -47,9 +47,15 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;

array_size = ARRAY_SIZE;

// Allocate the host array for partial sums for dot kernels
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
// Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane)
dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane);

// Allocate the host array for partial sums for dot kernels using hipHostMalloc.
// This creates an array on the host which is visible to the device. However, it requires
// synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host
// after it has been passed through to a kernel.
hipHostMalloc(&sums, sizeof(T) * dot_num_blocks, hipHostMallocNonCoherent);
check_error();

// Check buffers fit on the device
hipDeviceProp_t props;
Expand All @@ -64,31 +70,28 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
check_error();
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
check_error();
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
check_error();
}


template <class T>
HIPStream<T>::~HIPStream()
{
free(sums);
hipHostFree(sums);
check_error();

hipFree(d_a);
check_error();
hipFree(d_b);
check_error();
hipFree(d_c);
check_error();
hipFree(d_sum);
check_error();
}


template <typename T>
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] = initA;
b[i] = initB;
c[i] = initC;
Expand All @@ -97,7 +100,7 @@ __global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
template <class T>
void HIPStream<T>::init_arrays(T initA, T initB, T initC)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
init_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c, initA, initB, initC);
check_error();
hipDeviceSynchronize();
check_error();
Expand All @@ -115,18 +118,17 @@ void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
check_error();
}


template <typename T>
__global__ void copy_kernel(const T * a, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i];
}

template <class T>
void HIPStream<T>::copy()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c);
copy_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_c);
check_error();
hipDeviceSynchronize();
check_error();
Expand All @@ -136,14 +138,14 @@ template <typename T>
__global__ void mul_kernel(T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
b[i] = scalar * c[i];
}

template <class T>
void HIPStream<T>::mul()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c);
mul_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
Expand All @@ -152,14 +154,14 @@ void HIPStream<T>::mul()
template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i];
}

template <class T>
void HIPStream<T>::add()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
add_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
Expand All @@ -169,14 +171,14 @@ template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] = b[i] + scalar * c[i];
}

template <class T>
void HIPStream<T>::triad()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(triad_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
triad_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
Expand All @@ -186,32 +188,32 @@ template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c)
{
const T scalar = startScalar;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
a[i] += b[i] + scalar * c[i];
}

template <class T>
void HIPStream<T>::nstream()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(nstream_kernel<T>), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
nstream_kernel<T><<<dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0>>>(d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}

template <class T>
template <typename T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
{
__shared__ T tb_sum[TBSIZE];

int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t local_i = hipThreadIdx_x;
const size_t local_i = threadIdx.x;
size_t i = blockDim.x * blockIdx.x + local_i;

tb_sum[local_i] = 0.0;
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
for (; i < array_size; i += blockDim.x*gridDim.x)
tomdeakin marked this conversation as resolved.
Show resolved Hide resolved
tb_sum[local_i] += a[i] * b[i];

for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
for (size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
{
__syncthreads();
if (local_i < offset)
Expand All @@ -221,20 +223,19 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
}

if (local_i == 0)
sum[hipBlockIdx_x] = tb_sum[local_i];
sum[blockIdx.x] = tb_sum[local_i];
}

template <class T>
T HIPStream<T>::dot()
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(dot_kernel<T>), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
dot_kernel<T><<<dim3(dot_num_blocks), dim3(TBSIZE), 0, 0>>>(d_a, d_b, sums, array_size);
check_error();

hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
hipDeviceSynchronize();
check_error();

T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
for (int i = 0; i < dot_num_blocks; i++)
sum += sums[i];

return sum;
Expand Down
19 changes: 18 additions & 1 deletion src/hip/HIPStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,31 @@
#include "Stream.h"

#define IMPLEMENTATION_STRING "HIP"
#define DOT_READ_DWORDS_PER_LANE 4


template <class T>
class HIPStream : public Stream<T>
{
// Make sure that either:
// DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element
// or
// DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T)
static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) ||
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0),
"DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)");

// Take into account the datatype size
// That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements
// and 4 FP32 elements
static constexpr unsigned int dot_elements_per_lane{
(DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : (
DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))};

protected:
// Size of arrays
int array_size;
int dot_num_blocks;

// Host array for partial sums for dot kernel
T *sums;
Expand All @@ -29,7 +47,6 @@ class HIPStream : public Stream<T>
T *d_a;
T *d_b;
T *d_c;
T *d_sum;


public:
Expand Down