Neural Network Extension, HIP GPU backend - add support for layers that use MIOpen #619

Merged (7 commits) on Sep 23, 2021
Changes from 1 commit
4 changes: 4 additions & 0 deletions amd_openvx_extensions/amd_nn/nn_hip/nn_hip_host_decls.h
@@ -23,6 +23,7 @@ THE SOFTWARE.
#ifndef NN_HIP_HOST_DECLS_H
#define NN_HIP_HOST_DECLS_H
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include <VX/vx.h>

int HipExec_Gather_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, unsigned char* in, uint in_offset,
@@ -42,4 +43,7 @@ int HipExec_image_to_tensor_layer(hipStream_t stream, vx_df_image format, vx_enu
int HipExec_tensor_to_image_layer(hipStream_t stream, vx_df_image format, vx_enum type, uint width, uint height, uint N, unsigned char* in,
uint in_offset, uint4 in_stride, unsigned char* out, uint out_offset, uint out_stride, float sc1, float sc2, uint reverse_channel_order);

int HipExec_copy(hipStream_t stream, vx_enum type, unsigned char* inp, unsigned char* out, uint width, uint height, uint ldi, uint i_offset,
uint ldc, uint c_offset, bool tI);

#endif //NN_HIP_HOST_DECLS_H
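
HipExec_copy is the new host entry point for the device-side copy/transpose used by the MIOpen-backed layers. A minimal sketch of how it might be driven from host code follows; the stream, buffers, and dimensions are hypothetical and only illustrate the argument order (tI selects the transposing path):

#include <hip/hip_runtime.h>
#include <VX/vx.h>
#include "nn_hip_host_decls.h"

// Sketch: transpose a width x height FP32 matrix on the GPU via HipExec_copy.
int transpose_fp32_example() {
    const unsigned int width = 128, height = 64;
    float *d_in = nullptr, *d_out = nullptr;
    hipStream_t stream;
    if (hipStreamCreate(&stream) != hipSuccess) return VX_FAILURE;
    if (hipMalloc(&d_in, width * height * sizeof(float)) != hipSuccess) return VX_FAILURE;
    if (hipMalloc(&d_out, width * height * sizeof(float)) != hipSuccess) return VX_FAILURE;
    // ... fill d_in ...
    // tI = true selects the tiled transpose kernel; ldi/ldc are the row strides
    // (in elements) of the input and of the transposed output.
    int status = HipExec_copy(stream, VX_TYPE_FLOAT32,
                              (unsigned char *)d_in, (unsigned char *)d_out,
                              width, height, /*ldi=*/width, /*i_offset=*/0,
                              /*ldc=*/height, /*c_offset=*/0, /*tI=*/true);
    hipStreamSynchronize(stream);
    hipFree(d_in); hipFree(d_out); hipStreamDestroy(stream);
    return status;
}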
60 changes: 59 additions & 1 deletion amd_openvx_extensions/amd_nn/nn_hip/nn_hip_kernels.cpp
@@ -22,7 +22,6 @@ THE SOFTWARE.

#include "../../../amd_openvx/openvx/hipvx/hip_common_funcs.h"
#include "nn_hip_host_decls.h"
#include "hip/hip_fp16.h"

// ----------------------------------------------------------------------------
// Neural Network kernels for hip backend
@@ -499,3 +498,62 @@ int HipExec_tensor_to_image_layer(hipStream_t stream, vx_df_image format, vx_enu

return VX_SUCCESS;
}

template <typename T>
__global__ void __attribute__((visibility("default")))
copy_v1(const T* inp, T* out, uint width, uint height, uint BLKW, uint ldi, uint i_offset, uint ldc, uint c_offset) {
__shared__ float lbuf[272]; // BLKW x (BLKW + 1) tile for BLKW = 16; the padded row stride avoids bank conflicts but needs more than 256 entries
uint gx = blockIdx.x;
uint gy = blockIdx.y;
uint lx = threadIdx.x;
uint ly = threadIdx.y;
uint ix = hip_mad24(gx, BLKW, lx);
uint iy = hip_mad24(gy, BLKW, ly);
if (ix < width && iy < height) {
uint iloc = iy * ldi + ix + i_offset;
lbuf[hip_mad24(ly, BLKW + 1, lx)] = inp[iloc];
}
__syncthreads();
uint ox = hip_mad24(gy, BLKW, lx);
uint oy = hip_mad24(gx, BLKW, ly);
if(oy < width && ox < height) {
uint oloc = oy * ldc + ox + c_offset;
out[oloc] = lbuf[hip_mad24(lx, BLKW + 1, ly)];
}
}

template <typename T>
__global__ void __attribute__((visibility("default")))
copy_v2(const T* inp, T* out, uint width, uint height, uint ldi, uint i_offset, uint ldc, uint c_offset) {
uint x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
if(x < width && y < height) {
uint i = y * ldi + x + i_offset;
uint o = y * ldc + x + c_offset;
out[o] = inp[i];
}
}

int HipExec_copy(hipStream_t stream, vx_enum type, uchar* inp, uchar* out, uint width, uint height, uint ldi, uint i_offset,
uint ldc, uint c_offset, bool tI) {
if(tI) {
dim3 blockDim(16, 16, 1);
dim3 gridDim = dim3(ceil((float)width / blockDim.x), ceil((float)height / blockDim.y), 1);
if (type == VX_TYPE_FLOAT32) {
hipLaunchKernelGGL(copy_v1<float>, gridDim, blockDim, 0, stream, (float*)inp, (float*)out, width, height, blockDim.x, ldi,
i_offset, ldc, c_offset);
} else {
hipLaunchKernelGGL(copy_v1<__half>, gridDim, blockDim, 0, stream, (__half*)inp, (__half*)out, width, height, blockDim.x, ldi,
i_offset, ldc, c_offset);
}
} else {
dim3 blockDim(64, 1, 1);
dim3 gridDim = dim3(ceil((float)width / blockDim.x), height, 1);
if (type == VX_TYPE_FLOAT32) {
hipLaunchKernelGGL(copy_v2<float>, gridDim, blockDim, 0, stream, (float*)inp, (float*)out, width, height, ldi, i_offset, ldc, c_offset);
} else {
hipLaunchKernelGGL(copy_v2<__half>, gridDim, blockDim, 0, stream, (__half*)inp, (__half*)out, width, height, ldi, i_offset, ldc, c_offset);
}
}
return VX_SUCCESS;
}
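
copy_v1 stages a BLKW x BLKW tile in shared memory and pads the row stride to BLKW + 1 so the transposed read-back avoids shared-memory bank conflicts; copy_v2 is a plain strided copy used when no transpose is requested. A small CPU reference of what both kernels compute, useful for validating the HIP path, might look like this (the helper name is illustrative and not part of this change):

// Reference: out = transpose(inp) when transpose == true, strided copy otherwise.
// Strides (ldi, ldc) and offsets are in elements, matching the kernels above.
void copy_reference(const float *inp, float *out, unsigned int width, unsigned int height,
                    unsigned int ldi, unsigned int i_offset,
                    unsigned int ldc, unsigned int c_offset, bool transpose) {
    for (unsigned int y = 0; y < height; y++) {
        for (unsigned int x = 0; x < width; x++) {
            float v = inp[y * ldi + x + i_offset];
            if (transpose) {
                out[x * ldc + y + c_offset] = v;   // element (y, x) lands at (x, y)
            } else {
                out[y * ldc + x + c_offset] = v;
            }
        }
    }
}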
61 changes: 61 additions & 0 deletions amd_openvx_extensions/amd_nn/src/batch_normalization_layer.cpp
@@ -25,15 +25,28 @@ THE SOFTWARE.
struct BatchNormLayerLocalData {
NeuralNetworkCommonHandle * handle;
miopenTensorDescriptor_t input_desc;
#if ENABLE_OPENCL
cl_mem input_mem;
#elif ENABLE_HIP
vx_uint8 *input_mem;
#endif
miopenTensorDescriptor_t output_desc;
miopenDataType_t data_type; // data_type for the kernel
#if ENABLE_OPENCL
cl_mem output_mem;
cl_mem workspace;
#elif ENABLE_HIP
vx_uint8 *output_mem;
vx_uint8 *workspace;
#endif
size_t workspace_size;
float alpha, beta, eps;
miopenTensorDescriptor_t bnScaleBiasMeanVarDesc;
#if ENABLE_OPENCL
cl_mem bnScale, bnBias, bnMean, bnVariance;
#elif ENABLE_HIP
vx_uint8 *bnScale, *bnBias, *bnMean, *bnVariance;
#endif
};
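
Both here and in convolution_layer.cpp below, every cl_mem member gains a vx_uint8 * twin under ENABLE_HIP. As a hedged aside, a single backend-neutral typedef (hypothetical, not used by this change) would let these members be declared once:

#if ENABLE_OPENCL
typedef cl_mem nn_device_buffer_t;      // hypothetical name, for illustration only
#elif ENABLE_HIP
typedef vx_uint8 *nn_device_buffer_t;
#endif
// input_mem, output_mem, workspace, bnScale, bnBias, bnMean and bnVariance
// could then each be declared as a plain nn_device_buffer_t.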

static vx_status VX_CALLBACK validateBatchNormalizationLayer(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[])
@@ -109,8 +122,13 @@ PROFILER_START(VX_NN, Batch_Normalization_Layer)
ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data)));
miopenHandle_t miopenHandle = data->handle->miopen_handle;

#if ENABLE_OPENCL
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input_mem, sizeof(data->input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[6], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem)));
#elif ENABLE_HIP
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &data->input_mem, sizeof(data->input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[6], VX_TENSOR_BUFFER_HIP, &data->output_mem, sizeof(data->output_mem)));
#endif

// miopen batch norm inference.
ERROR_CHECK_MIOPEN_STATUS(miopenBatchNormalizationForwardInference(miopenHandle, miopenBNSpatial, &data->alpha, &data->beta, data->input_desc, data->input_mem,
@@ -149,6 +167,7 @@ static vx_status VX_CALLBACK initializeBatchNormalizationLayer(vx_node node, con

data->alpha = 1; data->beta = 0;

#if ENABLE_OPENCL
// input and output memory.
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input_mem, sizeof(data->input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->bnMean, sizeof(data->bnMean)));
@@ -163,6 +182,7 @@ static vx_status VX_CALLBACK initializeBatchNormalizationLayer(vx_node node, con
ERROR_CHECK_STATUS(vxQueryContext(vxContext, VX_CONTEXT_ATTRIBUTE_AMD_OPENCL_CONTEXT, &context, sizeof(context)));
cl_float pattern = 0; cl_int err = 0;
data->bnBias = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*input_dims[2], NULL, &err);

if (err) return VX_FAILURE;
if (data->data_type == miopenFloat)
err = clEnqueueFillBuffer(data->handle->cmdq, data->bnBias, &pattern, sizeof(cl_float), 0, input_dims[2], 0, NULL, NULL);
@@ -172,6 +192,40 @@ static vx_status VX_CALLBACK initializeBatchNormalizationLayer(vx_node node, con
if (err) return VX_FAILURE;
}
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[6], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem)));
#elif ENABLE_HIP
// input and output memory.
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &data->input_mem, sizeof(data->input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_HIP, &data->bnMean, sizeof(data->bnMean)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_BUFFER_HIP, &data->bnVariance, sizeof(data->bnVariance)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[3], VX_TENSOR_BUFFER_HIP, &data->bnScale, sizeof(data->bnScale)));
if(parameters[4]) {
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[4], VX_TENSOR_BUFFER_HIP, &data->bnBias, sizeof(data->bnBias)));
} else {
vx_context vxContext = vxGetContext((vx_reference)node);
int hip_device = -1;
ERROR_CHECK_STATUS(vxQueryContext(vxContext, VX_CONTEXT_ATTRIBUTE_AMD_HIP_DEVICE, &hip_device, sizeof(hip_device)));
if (hip_device < 0) {
return VX_FAILURE;
}
hipError_t errcode_ret = hipSuccess;
errcode_ret = hipSetDevice(hip_device);
if (errcode_ret != hipSuccess) {
return VX_FAILURE;
}

errcode_ret = hipMalloc(&data->bnBias, sizeof(float)*input_dims[2]);
if (errcode_ret != hipSuccess) {
return VX_FAILURE;
}

errcode_ret = hipMemset(data->bnBias, 0, sizeof(float)*input_dims[2]);
if (errcode_ret != hipSuccess) {
return VX_FAILURE;
}
}
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[6], VX_TENSOR_BUFFER_HIP, &data->output_mem, sizeof(data->output_mem)));

#endif

data->eps = 0.00001;
ERROR_CHECK_STATUS(vxCopyScalar((vx_scalar)parameters[5], &data->eps, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
@@ -196,8 +250,15 @@ static vx_status VX_CALLBACK uninitializeBatchNormalizationLayer(vx_node node, c
ERROR_CHECK_MIOPEN_STATUS(miopenDestroyTensorDescriptor(data->bnScaleBiasMeanVarDesc));
if(!parameters[4]){
if(data->bnBias) {
#if ENABLE_OPENCL
cl_int err = clReleaseMemObject(data->bnBias);
if (err) return VX_FAILURE;
#elif ENABLE_HIP
hipError_t errcode_ret = hipFree((void *)data->bnBias);
if (errcode_ret != hipSuccess) {
return VX_FAILURE;
}
#endif
}
}
ERROR_CHECK_STATUS(releaseGraphHandle(node, data->handle));
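
When the optional bias tensor (parameters[4]) is absent, the HIP path selects the device, allocates bnBias with hipMalloc, and zero-fills it with hipMemset, mirroring the clCreateBuffer/clEnqueueFillBuffer fallback on the OpenCL side. Factored out, that fallback amounts to roughly the following helper (the name is hypothetical; this change keeps the logic inline):

static vx_status allocZeroedHipBuffer(vx_context ctx, vx_uint8 **buf, size_t num_bytes) {
    int hip_device = -1;
    ERROR_CHECK_STATUS(vxQueryContext(ctx, VX_CONTEXT_ATTRIBUTE_AMD_HIP_DEVICE, &hip_device, sizeof(hip_device)));
    if (hip_device < 0) return VX_FAILURE;
    if (hipSetDevice(hip_device) != hipSuccess) return VX_FAILURE;
    if (hipMalloc((void **)buf, num_bytes) != hipSuccess) return VX_FAILURE;
    if (hipMemset(*buf, 0, num_bytes) != hipSuccess) return VX_FAILURE;
    return VX_SUCCESS;
}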
70 changes: 70 additions & 0 deletions amd_openvx_extensions/amd_nn/src/convolution_layer.cpp
@@ -39,17 +39,34 @@ struct ConvolutionLayerLocalData {
float bias_alpha, bias_beta;
miopenDataType_t data_type; // data_type for the kernel
miopenTensorDescriptor_t input_desc;
#if ENABLE_OPENCL
cl_mem input_mem;
#elif ENABLE_HIP
vx_uint8* input_mem;
#endif
miopenTensorDescriptor_t weight_desc;
#if ENABLE_OPENCL
cl_mem weight_mem;
#elif ENABLE_HIP
vx_uint8* weight_mem;
#endif
miopenConvolutionDescriptor_t conv_desc;
miopenConvFwdAlgorithm_t algo;
miopenTensorDescriptor_t output_desc;
#if ENABLE_OPENCL
cl_mem output_mem;
cl_mem workspace;
#elif ENABLE_HIP
vx_uint8* output_mem;
vx_uint8* workspace;
#endif
size_t workspace_size;
miopenTensorDescriptor_t bias_desc;
#if ENABLE_OPENCL
cl_mem bias_mem;
#elif ENABLE_HIP
vx_uint8* bias_mem;
#endif
miopenActivationMode_t activation_mode;
float activation_alpha;
float activation_beta;
@@ -135,12 +152,22 @@ static vx_status VX_CALLBACK processConvolutionLayer(vx_node node, const vx_refe
PROFILER_START(VX_NN, Convolution_Layer)
ConvolutionLayerLocalData * data= NULL;
ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data)));
#if ENABLE_OPENCL
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input_mem, sizeof(data->input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[4], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->weight_mem, sizeof(data->weight_mem)));
if(parameters[2]) {
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_BUFFER_OPENCL, &data->bias_mem, sizeof(data->bias_mem)));
}
#elif ENABLE_HIP
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &data->input_mem, sizeof(data->input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[4], VX_TENSOR_BUFFER_HIP, &data->output_mem, sizeof(data->output_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_HIP, &data->weight_mem, sizeof(data->weight_mem)));
if(parameters[2]) {
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_BUFFER_HIP, &data->bias_mem, sizeof(data->bias_mem)));
}
#endif

if (data->fusion_possible == true)
{
// Set the Args
@@ -294,13 +321,23 @@ static vx_status VX_CALLBACK initializeConvolutionLayer(vx_node node, const vx_r
//Grouped Convolution
ERROR_CHECK_MIOPEN_STATUS(miopenSetConvolutionGroupCount(data->conv_desc, data->groupCount));

#if ENABLE_OPENCL
//Memory Declaration.
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input_mem, sizeof(data->input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[4], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->weight_mem, sizeof(data->weight_mem)));
if(parameters[2]) {
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_BUFFER_OPENCL, &data->bias_mem, sizeof(data->bias_mem)));
}
#elif ENABLE_HIP
//Memory Declaration.
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &data->input_mem, sizeof(data->input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[4], VX_TENSOR_BUFFER_HIP, &data->output_mem, sizeof(data->output_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_HIP, &data->weight_mem, sizeof(data->weight_mem)));
if(parameters[2]) {
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_BUFFER_HIP, &data->bias_mem, sizeof(data->bias_mem)));
}
#endif

if (/*(data->bias_activ_mode == BIAS_ONLY_FUSED) || (data->bias_activ_mode == ACTIVATION_ONLY_FUSED) ||*/ (data->bias_activ_mode == BIAS_ACTIVATION_FUSED)) {
ERROR_CHECK_MIOPEN_STATUS(miopenCreateFusionPlan(&data->fusePlanDesc, miopenVerticalFusion, data->input_desc));
@@ -356,6 +393,7 @@ static vx_status VX_CALLBACK initializeConvolutionLayer(vx_node node, const vx_r
ERROR_CHECK_MIOPEN_STATUS(miopenConvolutionForwardGetWorkSpaceSize(data->handle->miopen_handle, data->weight_desc, data->input_desc, data->conv_desc, data->output_desc, &data->workspace_size ));
if (data->workspace_size > 0) {
vx_context vxContext = vxGetContext((vx_reference)node);
#if ENABLE_OPENCL
cl_context context;
ERROR_CHECK_STATUS(vxQueryContext(vxContext, VX_CONTEXT_ATTRIBUTE_AMD_OPENCL_CONTEXT, &context, sizeof(context)));
data->workspace_size = (data->workspace_size + 3) & ~3;
@@ -370,6 +408,29 @@ static vx_status VX_CALLBACK initializeConvolutionLayer(vx_node node, const vx_r
else
err = clEnqueueFillBuffer(data->handle->cmdq, data->workspace, &pattern, sizeof(cl_half), 0, data->workspace_size, 0, NULL, NULL);
if(err) return VX_FAILURE;
#elif ENABLE_HIP
int hip_device = -1;
ERROR_CHECK_STATUS(vxQueryContext(vxContext, VX_CONTEXT_ATTRIBUTE_AMD_HIP_DEVICE, &hip_device, sizeof(hip_device)));
if (hip_device < 0) {
return VX_FAILURE;
}
hipError_t errcode_ret = hipSuccess;
errcode_ret = hipSetDevice(hip_device);
if (errcode_ret != hipSuccess) {
return VX_FAILURE;
}

data->workspace_size = (data->workspace_size + 3) & ~3;
errcode_ret = hipMalloc(&data->workspace, data->workspace_size);
if (errcode_ret != hipSuccess) {
return VX_FAILURE;
}

errcode_ret = hipMemset(data->workspace, 0, data->workspace_size);
if (errcode_ret != hipSuccess) {
return VX_FAILURE;
}
#endif
}
//Finding best Convolution Algorithm.
miopenConvAlgoPerf_t perf;
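
The algorithm search itself is elided from this hunk; with the HIP workspace allocated above it typically proceeds along the lines of the sketch below, which follows the public MIOpen API rather than the exact elided code:

// Sketch only: ask MIOpen to benchmark forward algorithms using the HIP
// workspace allocated above, then keep the fastest one for processConvolutionLayer.
int returned_algo_count = 0;
ERROR_CHECK_MIOPEN_STATUS(miopenFindConvolutionForwardAlgorithm(data->handle->miopen_handle,
    data->input_desc, data->input_mem, data->weight_desc, data->weight_mem,
    data->conv_desc, data->output_desc, data->output_mem,
    1, &returned_algo_count, &perf,
    data->workspace, data->workspace_size, /*exhaustiveSearch=*/false));
data->algo = perf.fwd_algo;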
@@ -404,7 +465,16 @@ static vx_status VX_CALLBACK uninitializeConvolutionLayer(vx_node node, const vx
{
ConvolutionLayerLocalData * data = NULL;
ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data)));
#if ENABLE_OPENCL
if(data->workspace && clReleaseMemObject(data->workspace) != 0) return VX_FAILURE;
#elif ENABLE_HIP
if (data->workspace) {
hipError_t errcode_ret = hipFree((void *)data->workspace);
if (errcode_ret != hipSuccess) {
return VX_FAILURE;
}
}
#endif
if (data->fusePlanDesc) miopenDestroyFusionPlan(data->fusePlanDesc);
if (data->fusionArgs) miopenDestroyOperatorArgs(data->fusionArgs);
ERROR_CHECK_MIOPEN_STATUS(miopenDestroyConvolutionDescriptor(data->conv_desc));