From 4da20b4c9cf5efc87fd1f843d765d7d21778ef3b Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 28 Oct 2021 17:24:08 -0400 Subject: [PATCH] Neural Network Extension HIP GPU backend - add support for the permute/Tensor_log/Tensor_exp layers --- amd_openvx/openvx/api/vx_api.cpp | 38 ++++++ amd_openvx/openvx/include/vx_ext_amd.h | 3 +- .../amd_nn/nn_hip/nn_hip_host_decls.h | 9 ++ .../amd_nn/nn_hip/nn_hip_kernels.cpp | 123 +++++++++++++++++- .../amd_nn/src/permute_layer.cpp | 60 ++++++++- .../amd_nn/src/tensor_exp.cpp | 55 +++++++- .../amd_nn/src/tensor_log.cpp | 53 +++++++- 7 files changed, 334 insertions(+), 7 deletions(-) diff --git a/amd_openvx/openvx/api/vx_api.cpp b/amd_openvx/openvx/api/vx_api.cpp index 3673d7aab2..2a3d682a11 100644 --- a/amd_openvx/openvx/api/vx_api.cpp +++ b/amd_openvx/openvx/api/vx_api.cpp @@ -8114,6 +8114,44 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryArray(vx_array arr, vx_enum attribute, status = VX_SUCCESS; } break; +#if (ENABLE_OPENCL||ENABLE_HIP) + case VX_ARRAY_OFFSET_GPU: + if (size == sizeof(vx_size)) { + *(vx_size *)ptr = data->gpu_buffer_offset; + status = VX_SUCCESS; + } + break; +#if ENABLE_OPENCL + case VX_ARRAY_BUFFER_OPENCL: + if (size == sizeof(cl_mem)) { + if (data->opencl_buffer) { + *(cl_mem *)ptr = data->opencl_buffer; + } + else { +#if defined(CL_VERSION_2_0) + *(vx_uint8 **)ptr = data->opencl_svm_buffer; +#else + *(vx_uint8 **)ptr = NULL; +#endif + } + status = VX_SUCCESS; + } + break; +#else + case VX_ARRAY_BUFFER_HIP: + if (size == sizeof(vx_uint8 *)) { + if (data->hip_memory) { + *(vx_uint8 **)ptr = data->hip_memory; + } + else { + *(vx_uint8 **)ptr = NULL; + } + status = VX_SUCCESS; + } + break; + +#endif +#endif default: status = VX_ERROR_NOT_SUPPORTED; break; diff --git a/amd_openvx/openvx/include/vx_ext_amd.h b/amd_openvx/openvx/include/vx_ext_amd.h index 36ca7cd8db..4222a000e9 100644 --- a/amd_openvx/openvx/include/vx_ext_amd.h +++ b/amd_openvx/openvx/include/vx_ext_amd.h @@ -201,7 +201,8 @@ enum vx_array_attribute_amd_e { /*! \brief OpenCL buffer. cl_mem. */ VX_ARRAY_BUFFER_OPENCL = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY) + 0x9, VX_ARRAY_BUFFER_HIP = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY) + 0x10, - VX_ARRAY_BUFFER = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY ) + 0x11 + VX_ARRAY_BUFFER = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY ) + 0x11, + VX_ARRAY_OFFSET_GPU = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY ) + 0x12 }; /*! 
\brief These enumerations are given to the \c vxDirective API to enable/disable diff --git a/amd_openvx_extensions/amd_nn/nn_hip/nn_hip_host_decls.h b/amd_openvx_extensions/amd_nn/nn_hip/nn_hip_host_decls.h index 446253869f..609624f43f 100644 --- a/amd_openvx_extensions/amd_nn/nn_hip/nn_hip_host_decls.h +++ b/amd_openvx_extensions/amd_nn/nn_hip/nn_hip_host_decls.h @@ -46,4 +46,13 @@ int HipExec_tensor_to_image_layer(hipStream_t stream, vx_df_image format, vx_enu int HipExec_copy(hipStream_t stream, vx_enum type, unsigned char* inp, unsigned char* out, uint width, uint height, uint ldi, uint i_offset, uint ldc, uint c_offset, bool tI); +int HipExec_permute_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, unsigned char* in, uint in_offset, uint4 in_stride, + unsigned char* order_buf, uint order_offset, uint order_cap, unsigned char* out, uint out_offset, uint4 out_stride); + +int HipExec_tensor_log_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, unsigned char *in, uint in_offset, + uint4 in_stride, unsigned char *out, uint out_offset, uint4 out_stride); + +int HipExec_tensor_exp_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, unsigned char *in, uint in_offset, + uint4 in_stride, unsigned char *out, uint out_offset, uint4 out_stride); + #endif //NN_HIP_HOST_DECLS_H \ No newline at end of file diff --git a/amd_openvx_extensions/amd_nn/nn_hip/nn_hip_kernels.cpp b/amd_openvx_extensions/amd_nn/nn_hip/nn_hip_kernels.cpp index e6cdea957c..2fd2385f2e 100644 --- a/amd_openvx_extensions/amd_nn/nn_hip/nn_hip_kernels.cpp +++ b/amd_openvx_extensions/amd_nn/nn_hip/nn_hip_kernels.cpp @@ -556,4 +556,125 @@ int HipExec_copy(hipStream_t stream, vx_enum type, uchar* inp, uchar* out, uint } } return VX_SUCCESS; -} \ No newline at end of file +} + +__global__ void __attribute__((visibility("default"))) +Hip_permute_layer(uchar* in, uint in_offset, uint4 in_stride, uchar* order_buf, uint order_offset, uint order_cap, uchar* out, uint out_offset, + uint4 out_stride) { + uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4; + uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z; + int num_axis = order_cap; + int i = x * out_stride.x + y * out_stride.y + z * out_stride.z; + int old_idx = 0; + int idx = i; + for(int k = num_axis - 1, j = 0; k >= 0; k--, j++) { + int order = 3 - ((int *)(order_buf + order_offset))[j]; + old_idx += (idx / out_stride.data[k]) * (in_stride.data[order]); + idx %= (out_stride.data[k]); + } + out += out_offset + i; + in += in_offset + old_idx; + *(float4 *)&out[0] = *(float4 *)&in[0]; +} + +int HipExec_permute_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, uchar* in, uint in_offset, uint4 in_stride, uchar* order_buf, + uint order_offset, uint order_cap, uchar* out, uint out_offset, uint4 out_stride) { + + hipLaunchKernelGGL(Hip_permute_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y), + ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride, + order_buf, order_offset, order_cap, out, out_offset, out_stride); + + return VX_SUCCESS; +} + +__global__ void __attribute__((visibility("default"))) +Hip_tensor_log_layer(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) { + + uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4; + uint y = hipBlockDim_y * 
hipBlockIdx_y + hipThreadIdx_y; + uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z; + + float4 value = *(float4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z]; + out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z; + *(float4 *)&out[0] = make_float4(log(value.x), log(value.y), log(value.z), log(value.w)); + } + +__global__ void __attribute__((visibility("default"))) +Hip_tensor_log_layer_half(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) { + + uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4; + uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z; + + d_half4 value = *(d_half4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z]; + out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z; + d_half4 p; + p.data[0] = hlog(value.data[0]); + p.data[1] = hlog(value.data[1]); + p.data[2] = hlog(value.data[2]); + p.data[3] = hlog(value.data[3]); + *(d_half4 *)&out[0] = p; + } + +int HipExec_tensor_log_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, uchar *in, uint in_offset, uint4 in_stride, uchar *out, + uint out_offset, uint4 out_stride) { + + if (type == VX_TYPE_FLOAT32) { + hipLaunchKernelGGL(Hip_tensor_log_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y), + ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride, + out, out_offset, out_stride); + } else { + hipLaunchKernelGGL(Hip_tensor_log_layer_half, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y), + ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride, + out, out_offset, out_stride); + } + + return VX_SUCCESS; +} + +__global__ void __attribute__((visibility("default"))) +Hip_tensor_exp_layer(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) { + + uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4; + uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z; + + float4 value = *(float4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z]; + out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z; + *(float4 *)&out[0] = make_float4(exp(value.x), exp(value.y), exp(value.z), exp(value.w)); + } + +__global__ void __attribute__((visibility("default"))) +Hip_tensor_exp_layer_half(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) { + + uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4; + uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z; + + d_half4 value = *(d_half4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z]; + out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z; + + d_half4 p; + p.data[0] = hexp(value.data[0]); + p.data[1] = hexp(value.data[1]); + p.data[2] = hexp(value.data[2]); + p.data[3] = hexp(value.data[3]); + *(d_half4 *)&out[0] = p; + } + +int HipExec_tensor_exp_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, uchar *in, uint in_offset, uint4 in_stride, uchar *out, + uint out_offset, uint4 out_stride) { + 
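+    // Dispatches to the FP32 or FP16 kernel based on the tensor element
+    // type; each grid dimension is rounded up with ceil() so every packed
+    // group of four elements is covered even when the tensor extents are
+    // not multiples of the work-group size.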
+ if (type == VX_TYPE_FLOAT32) { + hipLaunchKernelGGL(Hip_tensor_exp_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y), + ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride, + out, out_offset, out_stride); + } else { + hipLaunchKernelGGL(Hip_tensor_exp_layer_half, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y), + ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride, + out, out_offset, out_stride); + } + + return VX_SUCCESS; +} diff --git a/amd_openvx_extensions/amd_nn/src/permute_layer.cpp b/amd_openvx_extensions/amd_nn/src/permute_layer.cpp index 9ebc8e57b1..d0d58e28c5 100644 --- a/amd_openvx_extensions/amd_nn/src/permute_layer.cpp +++ b/amd_openvx_extensions/amd_nn/src/permute_layer.cpp @@ -48,6 +48,7 @@ static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node, return VX_SUCCESS; } +#ifdef ENABLE_OPENCL static vx_status VX_CALLBACK opencl_codegen( vx_node node, // [input] node const vx_reference parameters[], // [input] parameters @@ -122,11 +123,65 @@ static vx_status VX_CALLBACK opencl_codegen( } return VX_SUCCESS; } +#endif //! \brief The kernel execution. static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num) { +#if ENABLE_HIP + vx_size num_of_dims; + vx_size temp[4] = {0}; + vx_size input_offset, output_offset, order_offset; + uint4 input_stride, output_dims, output_stride; + unsigned char *input_mem = NULL; + unsigned char *output_mem = NULL; + unsigned char *order_mem = NULL; + vx_size order_cap; + hipStream_t hip_stream; + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &input_mem, sizeof(input_mem))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_OFFSET_GPU, &input_offset, sizeof(input_offset))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp))); + input_stride.x = temp[0]; + input_stride.y = temp[1]; + input_stride.z = temp[2]; + input_stride.w = temp[3]; + + ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_BUFFER_HIP, &order_mem, sizeof(order_mem))); + ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_OFFSET_GPU, &order_offset, sizeof(order_offset))); + ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_CAPACITY, &order_cap, sizeof(order_cap))); + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, temp, sizeof(temp))); + output_dims.x = temp[0]; + output_dims.y = temp[1]; + output_dims.z = temp[2]; + output_dims.w = temp[3]; + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_BUFFER_HIP, &output_mem, sizeof(output_mem))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_OFFSET_GPU, &output_offset, sizeof(output_offset))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp))); + output_stride.x = temp[0]; + output_stride.y = temp[1]; + output_stride.z = temp[2]; + output_stride.w = temp[3]; + + 
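+    // The VX_TENSOR_STRIDE_GPU values captured above are byte strides, so
+    // the kernel can index the raw uchar buffers directly; the node's HIP
+    // stream is queried next so the launch runs on the stream owned by the
+    // graph executor.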
ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_ATTRIBUTE_AMD_HIP_STREAM, &hip_stream, sizeof(hip_stream))); + + dim3 globalThreads; + globalThreads.x = (output_dims.x + 3) >> 2; + globalThreads.y = output_dims.y; + globalThreads.z = output_dims.z * output_dims.w; + + if (HipExec_permute_layer(hip_stream, globalThreads, dim3(1), input_mem, input_offset, input_stride, order_mem, order_offset, order_cap, output_mem, output_offset, output_stride)) { + return VX_FAILURE; + } + + return VX_SUCCESS; + +#elif ENABLE_OPENCL return VX_ERROR_NOT_IMPLEMENTED; +#endif } //! \brief The kernel publisher. @@ -136,10 +191,11 @@ vx_status publishPermuteLayer(vx_context context) ERROR_CHECK_OBJECT(kernel); amd_kernel_query_target_support_f query_target_support_f = query_target_support; - amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f))); +#if ENABLE_OPENCL + amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f))); - +#endif //set kernel parameters. ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED)); diff --git a/amd_openvx_extensions/amd_nn/src/tensor_exp.cpp b/amd_openvx_extensions/amd_nn/src/tensor_exp.cpp index 29b2263376..1d52a8ed89 100644 --- a/amd_openvx_extensions/amd_nn/src/tensor_exp.cpp +++ b/amd_openvx_extensions/amd_nn/src/tensor_exp.cpp @@ -46,6 +46,7 @@ static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node, return VX_SUCCESS; } +#if ENABLE_OPENCL //! \brief The OpenCL code generator callback. 
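+//! Only compiled when ENABLE_OPENCL is set; the HIP backend takes the
+//! host_kernel path below instead.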
static vx_status VX_CALLBACK opencl_codegen( vx_node node, // [input] node @@ -119,9 +120,56 @@ static vx_status VX_CALLBACK opencl_codegen( } return VX_SUCCESS; } +#endif static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num) { +#if ENABLE_HIP + vx_size input_dims[4]; + vx_size num_of_dims; + vx_enum type; + vx_size temp[4] = {0}; + vx_size input_offset, output_offset; + uint4 input_stride, output_stride; + unsigned char *input_mem = NULL; + unsigned char *output_mem = NULL; + hipStream_t hip_stream; + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &input_mem, sizeof(input_mem))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_OFFSET_GPU, &input_offset, sizeof(input_offset))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp))); + input_stride.x = temp[0]; + input_stride.y = temp[1]; + input_stride.z = temp[2]; + input_stride.w = temp[3]; + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_HIP, &output_mem, sizeof(output_mem))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_OFFSET_GPU, &output_offset, sizeof(output_offset))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp))); + output_stride.x = temp[0]; + output_stride.y = temp[1]; + output_stride.z = temp[2]; + output_stride.w = temp[3]; + + ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_ATTRIBUTE_AMD_HIP_STREAM, &hip_stream, sizeof(hip_stream))); + + dim3 globalThreads; + globalThreads.x = (input_dims[0] + 3) >> 2; + globalThreads.y = input_dims[1]; + globalThreads.z = input_dims[2] * input_dims[3]; + + if (HipExec_tensor_exp_layer(hip_stream, globalThreads, dim3(1), type, input_mem, input_offset, input_stride, output_mem, + output_offset, output_stride)) { + return VX_FAILURE; + } + + return VX_SUCCESS; + +#elif ENABLE_OPENCL return VX_ERROR_NOT_IMPLEMENTED; +#endif } vx_status publishTensorExp(vx_context context) @@ -129,11 +177,14 @@ vx_status publishTensorExp(vx_context context) // add kernel to the context with callbacks vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.tensor_exp", VX_KERNEL_TENSOR_EXP_AMD, host_kernel, 2, validateTensorExp, nullptr, nullptr); ERROR_CHECK_OBJECT(kernel); - + amd_kernel_query_target_support_f query_target_support_f = query_target_support; - amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f))); + +#if ENABLE_OPENCL + amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f))); +#endif //set kernel parameters. 
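+// Parameter index 0 is the input tensor and index 1 the output tensor,
+// matching the two parameters declared in vxAddUserKernel above.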
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); diff --git a/amd_openvx_extensions/amd_nn/src/tensor_log.cpp b/amd_openvx_extensions/amd_nn/src/tensor_log.cpp index 6e8c680334..1ec022fb82 100644 --- a/amd_openvx_extensions/amd_nn/src/tensor_log.cpp +++ b/amd_openvx_extensions/amd_nn/src/tensor_log.cpp @@ -46,6 +46,7 @@ static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node, return VX_SUCCESS; } +#if ENABLE_OPENCL //! \brief The OpenCL code generator callback. static vx_status VX_CALLBACK opencl_codegen( vx_node node, // [input] node @@ -119,9 +120,56 @@ static vx_status VX_CALLBACK opencl_codegen( } return VX_SUCCESS; } +#endif static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num) { +#if ENABLE_HIP + vx_size input_dims[4]; + vx_size num_of_dims; + vx_enum type; + vx_size temp[4] = {0}; + vx_size input_offset, output_offset; + uint4 input_stride, output_stride; + unsigned char *input_mem = NULL; + unsigned char *output_mem = NULL; + hipStream_t hip_stream; + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &input_mem, sizeof(input_mem))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_OFFSET_GPU, &input_offset, sizeof(input_offset))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp))); + input_stride.x = temp[0]; + input_stride.y = temp[1]; + input_stride.z = temp[2]; + input_stride.w = temp[3]; + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_HIP, &output_mem, sizeof(output_mem))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_OFFSET_GPU, &output_offset, sizeof(output_offset))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp))); + output_stride.x = temp[0]; + output_stride.y = temp[1]; + output_stride.z = temp[2]; + output_stride.w = temp[3]; + + ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_ATTRIBUTE_AMD_HIP_STREAM, &hip_stream, sizeof(hip_stream))); + + dim3 globalThreads; + globalThreads.x = (input_dims[0] + 3) >> 2; + globalThreads.y = input_dims[1]; + globalThreads.z = input_dims[2] * input_dims[3]; + + if (HipExec_tensor_log_layer(hip_stream, globalThreads, dim3(1), type, input_mem, input_offset, input_stride, output_mem, + output_offset, output_stride)) { + return VX_FAILURE; + } + + return VX_SUCCESS; + +#elif ENABLE_OPENCL return VX_ERROR_NOT_IMPLEMENTED; +#endif } vx_status publishTensorLog(vx_context context) @@ -131,9 +179,12 @@ vx_status publishTensorLog(vx_context context) ERROR_CHECK_OBJECT(kernel); amd_kernel_query_target_support_f query_target_support_f = query_target_support; - amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f))); + +#if ENABLE_OPENCL + amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, 
VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f))); +#endif //set kernel parameters. ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
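
Not part of the patch itself: a minimal host-side harness can drive the new HipExec_tensor_exp_layer entry point directly as a smoke test. The sketch below is hypothetical; the tensor extents, the dense NCHW byte strides (mirroring what VX_TENSOR_STRIDE_GPU reports), the use of the null default stream, and the include paths are all assumptions, and hip* error checking is omitted for brevity.

// Hypothetical smoke test for HipExec_tensor_exp_layer -- a sketch, not part
// of the patch. Extents, strides, and include paths are assumptions.
#include <hip/hip_runtime.h>
#include <VX/vx.h>
#include <cstdio>
#include <vector>
#include "nn_hip_host_decls.h"

int main() {
    const unsigned int W = 8, H = 4, C = 2, N = 1;        // assumed NCHW extents
    const size_t count = (size_t)W * H * C * N;
    std::vector<float> h_in(count, 2.0f), h_out(count, 0.0f);

    unsigned char *d_in = nullptr, *d_out = nullptr;
    hipMalloc(&d_in, count * sizeof(float));
    hipMalloc(&d_out, count * sizeof(float));
    hipMemcpy(d_in, h_in.data(), count * sizeof(float), hipMemcpyHostToDevice);

    // Dense FP32 byte strides, mirroring what VX_TENSOR_STRIDE_GPU reports.
    uint4 stride = make_uint4(sizeof(float), W * sizeof(float),
                              W * H * sizeof(float), W * H * C * sizeof(float));

    // Grid sizing copied from host_kernel: x covers four elements per thread.
    dim3 globalThreads((W + 3) >> 2, H, C * N);

    if (HipExec_tensor_exp_layer(nullptr, globalThreads, dim3(1), VX_TYPE_FLOAT32,
                                 d_in, 0, stride, d_out, 0, stride) != VX_SUCCESS) {
        fprintf(stderr, "tensor_exp launch failed\n");
        return 1;
    }
    hipMemcpy(h_out.data(), d_out, count * sizeof(float), hipMemcpyDeviceToHost);
    printf("exp(2.0) -> %f (expected ~7.389056)\n", h_out[0]);

    hipFree(d_in);
    hipFree(d_out);
    return 0;
}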