Neural Network Extension HIP GPU backend - add support for the permute/Tensor_log/Tensor_exp layers #664
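Summary: adds HIP implementations of the permute, tensor_log, and tensor_exp layers to the amd_nn extension, extends vxQueryArray with GPU buffer/offset queries (including a new VX_ARRAY_OFFSET_GPU attribute), and registers the OpenCL codegen callbacks only on ENABLE_OPENCL builds.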

Merged: 1 commit, Oct 29, 2021
38 changes: 38 additions & 0 deletions amd_openvx/openvx/api/vx_api.cpp
@@ -8114,6 +8114,44 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryArray(vx_array arr, vx_enum attribute,
status = VX_SUCCESS;
}
break;
#if (ENABLE_OPENCL || ENABLE_HIP)
case VX_ARRAY_OFFSET_GPU:
if (size == sizeof(vx_size)) {
*(vx_size *)ptr = data->gpu_buffer_offset;
status = VX_SUCCESS;
}
break;
#if ENABLE_OPENCL
case VX_ARRAY_BUFFER_OPENCL:
if (size == sizeof(cl_mem)) {
if (data->opencl_buffer) {
*(cl_mem *)ptr = data->opencl_buffer;
}
else {
#if defined(CL_VERSION_2_0)
*(vx_uint8 **)ptr = data->opencl_svm_buffer;
#else
*(vx_uint8 **)ptr = NULL;
#endif
}
status = VX_SUCCESS;
}
break;
#else
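// !ENABLE_OPENCL here implies ENABLE_HIP, per the outer (ENABLE_OPENCL || ENABLE_HIP) guard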
case VX_ARRAY_BUFFER_HIP:
if (size == sizeof(vx_uint8 *)) {
if (data->hip_memory) {
*(vx_uint8 **)ptr = data->hip_memory;
}
else {
*(vx_uint8 **)ptr = NULL;
}
status = VX_SUCCESS;
}
break;

#endif
#endif
default:
status = VX_ERROR_NOT_SUPPORTED;
break;
3 changes: 2 additions & 1 deletion amd_openvx/openvx/include/vx_ext_amd.h
@@ -201,7 +201,8 @@ enum vx_array_attribute_amd_e {
/*! \brief OpenCL buffer. <tt>cl_mem</tt>. */
VX_ARRAY_BUFFER_OPENCL = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY) + 0x9,
VX_ARRAY_BUFFER_HIP = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY) + 0x10,
VX_ARRAY_BUFFER = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY ) + 0x11,
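/*! \brief GPU buffer offset. <tt>vx_size</tt>. */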
VX_ARRAY_OFFSET_GPU = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY ) + 0x12
};

/*! \brief These enumerations are given to the \c vxDirective API to enable/disable
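For context, a minimal sketch of how HIP-side host code can read the new attributes through vxQueryArray (mirroring what permute_layer.cpp does below; `arr` is an assumed, already-created vx_array):

vx_uint8 *hip_buf = NULL;
vx_size gpu_offset = 0;
// VX_ARRAY_BUFFER_HIP expects sizeof(vx_uint8 *); VX_ARRAY_OFFSET_GPU expects sizeof(vx_size)
ERROR_CHECK_STATUS(vxQueryArray(arr, VX_ARRAY_BUFFER_HIP, &hip_buf, sizeof(hip_buf)));
ERROR_CHECK_STATUS(vxQueryArray(arr, VX_ARRAY_OFFSET_GPU, &gpu_offset, sizeof(gpu_offset)));
// hip_buf + gpu_offset is the device address the HIP kernels dereference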
9 changes: 9 additions & 0 deletions amd_openvx_extensions/amd_nn/nn_hip/nn_hip_host_decls.h
@@ -46,4 +46,13 @@ int HipExec_tensor_to_image_layer(hipStream_t stream, vx_df_image format, vx_enu
int HipExec_copy(hipStream_t stream, vx_enum type, unsigned char* inp, unsigned char* out, uint width, uint height, uint ldi, uint i_offset,
uint ldc, uint c_offset, bool tI);

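// host-side launch wrappers for the new permute / tensor_log / tensor_exp kernels (implemented in nn_hip_kernels.cpp)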
int HipExec_permute_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, unsigned char* in, uint in_offset, uint4 in_stride,
unsigned char* order_buf, uint order_offset, uint order_cap, unsigned char* out, uint out_offset, uint4 out_stride);

int HipExec_tensor_log_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, unsigned char *in, uint in_offset,
uint4 in_stride, unsigned char *out, uint out_offset, uint4 out_stride);

int HipExec_tensor_exp_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, unsigned char *in, uint in_offset,
uint4 in_stride, unsigned char *out, uint out_offset, uint4 out_stride);

#endif //NN_HIP_HOST_DECLS_H
123 changes: 122 additions & 1 deletion amd_openvx_extensions/amd_nn/nn_hip/nn_hip_kernels.cpp
@@ -556,4 +556,125 @@ int HipExec_copy(hipStream_t stream, vx_enum type, uchar* inp, uchar* out, uint
}
}
return VX_SUCCESS;
}

__global__ void __attribute__((visibility("default")))
Hip_permute_layer(uchar* in, uint in_offset, uint4 in_stride, uchar* order_buf, uint order_offset, uint order_cap, uchar* out, uint out_offset,
uint4 out_stride) {
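    // each thread handles four consecutive output elements (x is pre-scaled by 4
    // and the copy below moves a full float4)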
uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;
int num_axis = order_cap;
int i = x * out_stride.x + y * out_stride.y + z * out_stride.z;
int old_idx = 0;
int idx = i;
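    // peel the output byte offset one axis at a time, outermost (w) first, and
    // accumulate the stride of the input axis it maps to; 3 - order[j] converts
    // order_buf's outermost-first axis numbering to the x-fastest numbering used by the strides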
for(int k = num_axis - 1, j = 0; k >= 0; k--, j++) {
int order = 3 - ((int *)(order_buf + order_offset))[j];
old_idx += (idx / out_stride.data[k]) * (in_stride.data[order]);
idx %= (out_stride.data[k]);
}
out += out_offset + i;
in += in_offset + old_idx;
*(float4 *)&out[0] = *(float4 *)&in[0];
}

int HipExec_permute_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, uchar* in, uint in_offset, uint4 in_stride, uchar* order_buf,
uint order_offset, uint order_cap, uchar* out, uint out_offset, uint4 out_stride) {

hipLaunchKernelGGL(Hip_permute_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
order_buf, order_offset, order_cap, out, out_offset, out_stride);

return VX_SUCCESS;
}

__global__ void __attribute__((visibility("default")))
Hip_tensor_log_layer(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) {

uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;

float4 value = *(float4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z];
out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z;
*(float4 *)&out[0] = make_float4(log(value.x), log(value.y), log(value.z), log(value.w));
}

__global__ void __attribute__((visibility("default")))
Hip_tensor_log_layer_half(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) {

uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;

d_half4 value = *(d_half4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z];
out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z;
d_half4 p;
p.data[0] = hlog(value.data[0]);
p.data[1] = hlog(value.data[1]);
p.data[2] = hlog(value.data[2]);
p.data[3] = hlog(value.data[3]);
*(d_half4 *)&out[0] = p;
}

int HipExec_tensor_log_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, uchar *in, uint in_offset, uint4 in_stride, uchar *out,
uint out_offset, uint4 out_stride) {

if (type == VX_TYPE_FLOAT32) {
hipLaunchKernelGGL(Hip_tensor_log_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
out, out_offset, out_stride);
} else {
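        // any non-FP32 type takes the half-precision path; callers are expected
        // to pass only VX_TYPE_FLOAT32 or VX_TYPE_FLOAT16 here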
hipLaunchKernelGGL(Hip_tensor_log_layer_half, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
out, out_offset, out_stride);
}

return VX_SUCCESS;
}

__global__ void __attribute__((visibility("default")))
Hip_tensor_exp_layer(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) {

uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;

float4 value = *(float4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z];
out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z;
*(float4 *)&out[0] = make_float4(exp(value.x), exp(value.y), exp(value.z), exp(value.w));
}

__global__ void __attribute__((visibility("default")))
Hip_tensor_exp_layer_half(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) {

uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;

d_half4 value = *(d_half4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z];
out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z;

d_half4 p;
p.data[0] = hexp(value.data[0]);
p.data[1] = hexp(value.data[1]);
p.data[2] = hexp(value.data[2]);
p.data[3] = hexp(value.data[3]);
*(d_half4 *)&out[0] = p;
}

int HipExec_tensor_exp_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, uchar *in, uint in_offset, uint4 in_stride, uchar *out,
uint out_offset, uint4 out_stride) {

if (type == VX_TYPE_FLOAT32) {
hipLaunchKernelGGL(Hip_tensor_exp_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
out, out_offset, out_stride);
} else {
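        // as with tensor_log, any non-FP32 type is assumed to be VX_TYPE_FLOAT16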
hipLaunchKernelGGL(Hip_tensor_exp_layer_half, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
out, out_offset, out_stride);
}

return VX_SUCCESS;
}
60 changes: 58 additions & 2 deletions amd_openvx_extensions/amd_nn/src/permute_layer.cpp
@@ -48,6 +48,7 @@ static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node,
return VX_SUCCESS;
}

#if ENABLE_OPENCL
static vx_status VX_CALLBACK opencl_codegen(
vx_node node, // [input] node
const vx_reference parameters[], // [input] parameters
@@ -122,11 +123,65 @@
}
return VX_SUCCESS;
}
#endif

//! \brief The kernel execution.
static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num)
{
#if ENABLE_HIP
vx_size num_of_dims;
vx_size temp[4] = {0};
vx_size input_offset, output_offset, order_offset;
uint4 input_stride, output_dims, output_stride;
unsigned char *input_mem = NULL;
unsigned char *output_mem = NULL;
unsigned char *order_mem = NULL;
vx_size order_cap;
hipStream_t hip_stream;

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &input_mem, sizeof(input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_OFFSET_GPU, &input_offset, sizeof(input_offset)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp)));
input_stride.x = temp[0];
input_stride.y = temp[1];
input_stride.z = temp[2];
input_stride.w = temp[3];

ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_BUFFER_HIP, &order_mem, sizeof(order_mem)));
ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_OFFSET_GPU, &order_offset, sizeof(order_offset)));
ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_CAPACITY, &order_cap, sizeof(order_cap)));

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, temp, sizeof(temp)));
output_dims.x = temp[0];
output_dims.y = temp[1];
output_dims.z = temp[2];
output_dims.w = temp[3];
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_BUFFER_HIP, &output_mem, sizeof(output_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_OFFSET_GPU, &output_offset, sizeof(output_offset)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp)));
output_stride.x = temp[0];
output_stride.y = temp[1];
output_stride.z = temp[2];
output_stride.w = temp[3];

ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_ATTRIBUTE_AMD_HIP_STREAM, &hip_stream, sizeof(hip_stream)));

dim3 globalThreads;
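    // four output elements per thread along x, so round up and divide by 4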
globalThreads.x = (output_dims.x + 3) >> 2;
globalThreads.y = output_dims.y;
globalThreads.z = output_dims.z * output_dims.w;

if (HipExec_permute_layer(hip_stream, globalThreads, dim3(1), input_mem, input_offset, input_stride, order_mem, order_offset, order_cap, output_mem, output_offset, output_stride)) {
return VX_FAILURE;
}

return VX_SUCCESS;

#elif ENABLE_OPENCL
return VX_ERROR_NOT_IMPLEMENTED;
#endif
}

//! \brief The kernel publisher.
@@ -136,10 +191,11 @@ vx_status publishPermuteLayer(vx_context context)
ERROR_CHECK_OBJECT(kernel);

amd_kernel_query_target_support_f query_target_support_f = query_target_support;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f)));
#if ENABLE_OPENCL
amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f)));

#endif
//set kernel parameters.
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED));
55 changes: 53 additions & 2 deletions amd_openvx_extensions/amd_nn/src/tensor_exp.cpp
@@ -46,6 +46,7 @@ static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node,
return VX_SUCCESS;
}

#if ENABLE_OPENCL
//! \brief The OpenCL code generator callback.
static vx_status VX_CALLBACK opencl_codegen(
vx_node node, // [input] node
@@ -119,21 +120,71 @@
}
return VX_SUCCESS;
}
#endif

static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num) {
#if ENABLE_HIP
vx_size input_dims[4];
vx_size num_of_dims;
vx_enum type;
vx_size temp[4] = {0};
vx_size input_offset, output_offset;
uint4 input_stride, output_stride;
unsigned char *input_mem = NULL;
unsigned char *output_mem = NULL;
hipStream_t hip_stream;

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &input_mem, sizeof(input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_OFFSET_GPU, &input_offset, sizeof(input_offset)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp)));
input_stride.x = temp[0];
input_stride.y = temp[1];
input_stride.z = temp[2];
input_stride.w = temp[3];

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_HIP, &output_mem, sizeof(output_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_OFFSET_GPU, &output_offset, sizeof(output_offset)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp)));
output_stride.x = temp[0];
output_stride.y = temp[1];
output_stride.z = temp[2];
output_stride.w = temp[3];

ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_ATTRIBUTE_AMD_HIP_STREAM, &hip_stream, sizeof(hip_stream)));

dim3 globalThreads;
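    // x extent: four elements per thread (float4 / d_half4 vectorized kernels)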
globalThreads.x = (input_dims[0] + 3) >> 2;
globalThreads.y = input_dims[1];
globalThreads.z = input_dims[2] * input_dims[3];

if (HipExec_tensor_exp_layer(hip_stream, globalThreads, dim3(1), type, input_mem, input_offset, input_stride, output_mem,
output_offset, output_stride)) {
return VX_FAILURE;
}

return VX_SUCCESS;

#elif ENABLE_OPENCL
return VX_ERROR_NOT_IMPLEMENTED;
#endif
}

vx_status publishTensorExp(vx_context context)
{
// add kernel to the context with callbacks
vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.tensor_exp", VX_KERNEL_TENSOR_EXP_AMD, host_kernel, 2, validateTensorExp, nullptr, nullptr);
ERROR_CHECK_OBJECT(kernel);

amd_kernel_query_target_support_f query_target_support_f = query_target_support;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f)));

#if ENABLE_OPENCL
amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f)));
#endif

//set kernel parameters.
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
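For completeness, a hedged sketch of how an application could instantiate the published tensor_exp kernel through the standard OpenVX generic-node API (`context`, `graph`, `input_tensor`, and `output_tensor` are assumed to already exist):

vx_kernel exp_kernel = vxGetKernelByName(context, "com.amd.nn_extension.tensor_exp");
vx_node exp_node = vxCreateGenericNode(graph, exp_kernel);
// parameter indices match the vxAddParameterToKernel calls above
ERROR_CHECK_STATUS(vxSetParameterByIndex(exp_node, 0, (vx_reference)input_tensor));
ERROR_CHECK_STATUS(vxSetParameterByIndex(exp_node, 1, (vx_reference)output_tensor));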