Neural Network Extension HIP GPU backend - add support for the permute/Tensor_log/Tensor_exp layers #664
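Summary: adds HIP implementations of the permute, tensor_log, and tensor_exp layers to the amd_nn extension, extends vxQueryArray with GPU buffer/offset queries (including a new VX_ARRAY_OFFSET_GPU attribute), and registers the OpenCL codegen callbacks only on ENABLE_OPENCL builds.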

Merged: 1 commit, Oct 29, 2021
38 changes: 38 additions & 0 deletions amd_openvx/openvx/api/vx_api.cpp
@@ -8114,6 +8114,44 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryArray(vx_array arr, vx_enum attribute,
status = VX_SUCCESS;
}
break;
#if (ENABLE_OPENCL || ENABLE_HIP)
case VX_ARRAY_OFFSET_GPU:
if (size == sizeof(vx_size)) {
*(vx_size *)ptr = data->gpu_buffer_offset;
status = VX_SUCCESS;
}
break;
#if ENABLE_OPENCL
case VX_ARRAY_BUFFER_OPENCL:
if (size == sizeof(cl_mem)) {
if (data->opencl_buffer) {
*(cl_mem *)ptr = data->opencl_buffer;
}
else {
#if defined(CL_VERSION_2_0)
*(vx_uint8 **)ptr = data->opencl_svm_buffer;
#else
*(vx_uint8 **)ptr = NULL;
#endif
}
status = VX_SUCCESS;
}
break;
#else
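// !ENABLE_OPENCL here implies ENABLE_HIP, per the outer (ENABLE_OPENCL || ENABLE_HIP) guard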
case VX_ARRAY_BUFFER_HIP:
if (size == sizeof(vx_uint8 *)) {
if (data->hip_memory) {
*(vx_uint8 **)ptr = data->hip_memory;
}
else {
*(vx_uint8 **)ptr = NULL;
}
status = VX_SUCCESS;
}
break;

#endif
#endif
default:
status = VX_ERROR_NOT_SUPPORTED;
break;
3 changes: 2 additions & 1 deletion amd_openvx/openvx/include/vx_ext_amd.h
@@ -201,7 +201,8 @@ enum vx_array_attribute_amd_e {
/*! \brief OpenCL buffer. <tt>cl_mem</tt>. */
VX_ARRAY_BUFFER_OPENCL = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY) + 0x9,
VX_ARRAY_BUFFER_HIP = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY) + 0x10,
VX_ARRAY_BUFFER = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY ) + 0x11,
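/*! \brief GPU buffer offset. <tt>vx_size</tt>. */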
VX_ARRAY_OFFSET_GPU = VX_ATTRIBUTE_BASE(VX_ID_AMD, VX_TYPE_ARRAY ) + 0x12
};

/*! \brief These enumerations are given to the \c vxDirective API to enable/disable
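For context, a minimal sketch of how HIP-side host code can read the new attributes through vxQueryArray (mirroring what permute_layer.cpp does below; `arr` is an assumed, already-created vx_array):

vx_uint8 *hip_buf = NULL;
vx_size gpu_offset = 0;
// VX_ARRAY_BUFFER_HIP expects sizeof(vx_uint8 *); VX_ARRAY_OFFSET_GPU expects sizeof(vx_size)
ERROR_CHECK_STATUS(vxQueryArray(arr, VX_ARRAY_BUFFER_HIP, &hip_buf, sizeof(hip_buf)));
ERROR_CHECK_STATUS(vxQueryArray(arr, VX_ARRAY_OFFSET_GPU, &gpu_offset, sizeof(gpu_offset)));
// hip_buf + gpu_offset is the device address the HIP kernels dereference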
9 changes: 9 additions & 0 deletions amd_openvx_extensions/amd_nn/nn_hip/nn_hip_host_decls.h
@@ -46,4 +46,13 @@ int HipExec_tensor_to_image_layer(hipStream_t stream, vx_df_image format, vx_enu
int HipExec_copy(hipStream_t stream, vx_enum type, unsigned char* inp, unsigned char* out, uint width, uint height, uint ldi, uint i_offset,
uint ldc, uint c_offset, bool tI);

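// host-side launch wrappers for the new permute / tensor_log / tensor_exp kernels (implemented in nn_hip_kernels.cpp)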
int HipExec_permute_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, unsigned char* in, uint in_offset, uint4 in_stride,
unsigned char* order_buf, uint order_offset, uint order_cap, unsigned char* out, uint out_offset, uint4 out_stride);

int HipExec_tensor_log_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, unsigned char *in, uint in_offset,
uint4 in_stride, unsigned char *out, uint out_offset, uint4 out_stride);

int HipExec_tensor_exp_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, unsigned char *in, uint in_offset,
uint4 in_stride, unsigned char *out, uint out_offset, uint4 out_stride);

#endif //NN_HIP_HOST_DECLS_H
123 changes: 122 additions & 1 deletion amd_openvx_extensions/amd_nn/nn_hip/nn_hip_kernels.cpp
@@ -556,4 +556,125 @@ int HipExec_copy(hipStream_t stream, vx_enum type, uchar* inp, uchar* out, uint
}
}
return VX_SUCCESS;
}

__global__ void __attribute__((visibility("default")))
Hip_permute_layer(uchar* in, uint in_offset, uint4 in_stride, uchar* order_buf, uint order_offset, uint order_cap, uchar* out, uint out_offset,
uint4 out_stride) {
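    // each thread handles four consecutive output elements (x is pre-scaled by 4
    // and the copy below moves a full float4)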
uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;
int num_axis = order_cap;
int i = x * out_stride.x + y * out_stride.y + z * out_stride.z;
int old_idx = 0;
int idx = i;
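    // peel the output byte offset one axis at a time, outermost (w) first, and
    // accumulate the stride of the input axis it maps to; 3 - order[j] converts
    // order_buf's outermost-first axis numbering to the x-fastest numbering used by the strides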
for(int k = num_axis - 1, j = 0; k >= 0; k--, j++) {
int order = 3 - ((int *)(order_buf + order_offset))[j];
old_idx += (idx / out_stride.data[k]) * (in_stride.data[order]);
idx %= (out_stride.data[k]);
}
out += out_offset + i;
in += in_offset + old_idx;
*(float4 *)&out[0] = *(float4 *)&in[0];
}

int HipExec_permute_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, uchar* in, uint in_offset, uint4 in_stride, uchar* order_buf,
uint order_offset, uint order_cap, uchar* out, uint out_offset, uint4 out_stride) {

hipLaunchKernelGGL(Hip_permute_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
order_buf, order_offset, order_cap, out, out_offset, out_stride);

return VX_SUCCESS;
}

__global__ void __attribute__((visibility("default")))
Hip_tensor_log_layer(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) {

uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;

float4 value = *(float4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z];
out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z;
*(float4 *)&out[0] = make_float4(log(value.x), log(value.y), log(value.z), log(value.w));
}

__global__ void __attribute__((visibility("default")))
Hip_tensor_log_layer_half(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) {

uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;

d_half4 value = *(d_half4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z];
out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z;
d_half4 p;
p.data[0] = hlog(value.data[0]);
p.data[1] = hlog(value.data[1]);
p.data[2] = hlog(value.data[2]);
p.data[3] = hlog(value.data[3]);
*(d_half4 *)&out[0] = p;
}

int HipExec_tensor_log_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, uchar *in, uint in_offset, uint4 in_stride, uchar *out,
uint out_offset, uint4 out_stride) {

if (type == VX_TYPE_FLOAT32) {
hipLaunchKernelGGL(Hip_tensor_log_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
out, out_offset, out_stride);
} else {
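        // any non-FP32 type takes the half-precision path; callers are expected
        // to pass only VX_TYPE_FLOAT32 or VX_TYPE_FLOAT16 here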
hipLaunchKernelGGL(Hip_tensor_log_layer_half, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
out, out_offset, out_stride);
}

return VX_SUCCESS;
}

__global__ void __attribute__((visibility("default")))
Hip_tensor_exp_layer(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) {

uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;

float4 value = *(float4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z];
out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z;
*(float4 *)&out[0] = make_float4(exp(value.x), exp(value.y), exp(value.z), exp(value.w));
}

__global__ void __attribute__((visibility("default")))
Hip_tensor_exp_layer_half(uchar *in, uint in_offset, uint4 in_stride, uchar *out, uint out_offset, uint4 out_stride) {

uint x = (hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x) * 4;
uint y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
uint z = hipBlockDim_z * hipBlockIdx_z + hipThreadIdx_z;

d_half4 value = *(d_half4 *)&in[in_offset + x * in_stride.x + y * in_stride.y + z * in_stride.z];
out += out_offset + x * out_stride.x + y * out_stride.y + z * out_stride.z;

d_half4 p;
p.data[0] = hexp(value.data[0]);
p.data[1] = hexp(value.data[1]);
p.data[2] = hexp(value.data[2]);
p.data[3] = hexp(value.data[3]);
*(d_half4 *)&out[0] = p;
}

int HipExec_tensor_exp_layer(hipStream_t stream, dim3 globalThreads, dim3 localThreads, vx_enum type, uchar *in, uint in_offset, uint4 in_stride, uchar *out,
uint out_offset, uint4 out_stride) {

if (type == VX_TYPE_FLOAT32) {
hipLaunchKernelGGL(Hip_tensor_exp_layer, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
out, out_offset, out_stride);
} else {
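        // as with tensor_log, any non-FP32 type is assumed to be VX_TYPE_FLOAT16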
hipLaunchKernelGGL(Hip_tensor_exp_layer_half, dim3(ceil((float)globalThreads.x/localThreads.x), ceil((float)globalThreads.y/localThreads.y),
ceil((float)globalThreads.z/localThreads.z)), dim3(localThreads.x, localThreads.y, localThreads.z), 0, stream, in, in_offset, in_stride,
out, out_offset, out_stride);
}

return VX_SUCCESS;
}
60 changes: 58 additions & 2 deletions amd_openvx_extensions/amd_nn/src/permute_layer.cpp
@@ -48,6 +48,7 @@ static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node,
return VX_SUCCESS;
}

#if ENABLE_OPENCL
static vx_status VX_CALLBACK opencl_codegen(
vx_node node, // [input] node
const vx_reference parameters[], // [input] parameters
@@ -122,11 +123,65 @@
}
return VX_SUCCESS;
}
#endif

//! \brief The kernel execution.
static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num)
{
#if ENABLE_HIP
vx_size num_of_dims;
vx_size temp[4] = {0};
vx_size input_offset, output_offset, order_offset;
uint4 input_stride, output_dims, output_stride;
unsigned char *input_mem = NULL;
unsigned char *output_mem = NULL;
unsigned char *order_mem = NULL;
vx_size order_cap;
hipStream_t hip_stream;

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &input_mem, sizeof(input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_OFFSET_GPU, &input_offset, sizeof(input_offset)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp)));
input_stride.x = temp[0];
input_stride.y = temp[1];
input_stride.z = temp[2];
input_stride.w = temp[3];

ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_BUFFER_HIP, &order_mem, sizeof(order_mem)));
ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_OFFSET_GPU, &order_offset, sizeof(order_offset)));
ERROR_CHECK_STATUS(vxQueryArray((vx_array)parameters[1], VX_ARRAY_CAPACITY, &order_cap, sizeof(order_cap)));

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, temp, sizeof(temp)));
output_dims.x = temp[0];
output_dims.y = temp[1];
output_dims.z = temp[2];
output_dims.w = temp[3];
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_BUFFER_HIP, &output_mem, sizeof(output_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_OFFSET_GPU, &output_offset, sizeof(output_offset)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp)));
output_stride.x = temp[0];
output_stride.y = temp[1];
output_stride.z = temp[2];
output_stride.w = temp[3];

ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_ATTRIBUTE_AMD_HIP_STREAM, &hip_stream, sizeof(hip_stream)));

dim3 globalThreads;
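    // four output elements per thread along x, so round up and divide by 4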
globalThreads.x = (output_dims.x + 3) >> 2;
globalThreads.y = output_dims.y;
globalThreads.z = output_dims.z * output_dims.w;

if (HipExec_permute_layer(hip_stream, globalThreads, dim3(1), input_mem, input_offset, input_stride, order_mem, order_offset, order_cap, output_mem, output_offset, output_stride)) {
return VX_FAILURE;
}

return VX_SUCCESS;

#elif ENABLE_OPENCL
return VX_ERROR_NOT_IMPLEMENTED;
#endif
}

//! \brief The kernel publisher.
@@ -136,10 +191,11 @@ vx_status publishPermuteLayer(vx_context context)
ERROR_CHECK_OBJECT(kernel);

amd_kernel_query_target_support_f query_target_support_f = query_target_support;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f)));
#if ENABLE_OPENCL
amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f)));

#endif
//set kernel parameters.
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED));
55 changes: 53 additions & 2 deletions amd_openvx_extensions/amd_nn/src/tensor_exp.cpp
@@ -46,6 +46,7 @@ static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node,
return VX_SUCCESS;
}

#if ENABLE_OPENCL
//! \brief The OpenCL code generator callback.
static vx_status VX_CALLBACK opencl_codegen(
vx_node node, // [input] node
@@ -119,21 +120,71 @@
}
return VX_SUCCESS;
}
#endif

static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num) {
#if ENABLE_HIP
vx_size input_dims[4];
vx_size num_of_dims;
vx_enum type;
vx_size temp[4] = {0};
vx_size input_offset, output_offset;
uint4 input_stride, output_stride;
unsigned char *input_mem = NULL;
unsigned char *output_mem = NULL;
hipStream_t hip_stream;

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_HIP, &input_mem, sizeof(input_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_OFFSET_GPU, &input_offset, sizeof(input_offset)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp)));
input_stride.x = temp[0];
input_stride.y = temp[1];
input_stride.z = temp[2];
input_stride.w = temp[3];

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_HIP, &output_mem, sizeof(output_mem)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_OFFSET_GPU, &output_offset, sizeof(output_offset)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_STRIDE_GPU, &temp, sizeof(temp)));
output_stride.x = temp[0];
output_stride.y = temp[1];
output_stride.z = temp[2];
output_stride.w = temp[3];

ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_ATTRIBUTE_AMD_HIP_STREAM, &hip_stream, sizeof(hip_stream)));

dim3 globalThreads;
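    // x extent: four elements per thread (float4 / d_half4 vectorized kernels)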
globalThreads.x = (input_dims[0] + 3) >> 2;
globalThreads.y = input_dims[1];
globalThreads.z = input_dims[2] * input_dims[3];

if (HipExec_tensor_exp_layer(hip_stream, globalThreads, dim3(1), type, input_mem, input_offset, input_stride, output_mem,
output_offset, output_stride)) {
return VX_FAILURE;
}

return VX_SUCCESS;

#elif ENABLE_OPENCL
return VX_ERROR_NOT_IMPLEMENTED;
#endif
}

vx_status publishTensorExp(vx_context context)
{
// add kernel to the context with callbacks
vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.tensor_exp", VX_KERNEL_TENSOR_EXP_AMD, host_kernel, 2, validateTensorExp, nullptr, nullptr);
ERROR_CHECK_OBJECT(kernel);

amd_kernel_query_target_support_f query_target_support_f = query_target_support;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f)));

#if ENABLE_OPENCL
amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f)));
#endif

//set kernel parameters.
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
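For completeness, a hedged sketch of how an application could instantiate the published tensor_exp kernel through the standard OpenVX generic-node API (`context`, `graph`, `input_tensor`, and `output_tensor` are assumed to already exist):

vx_kernel exp_kernel = vxGetKernelByName(context, "com.amd.nn_extension.tensor_exp");
vx_node exp_node = vxCreateGenericNode(graph, exp_kernel);
// parameter indices match the vxAddParameterToKernel calls above
ERROR_CHECK_STATUS(vxSetParameterByIndex(exp_node, 0, (vx_reference)input_tensor));
ERROR_CHECK_STATUS(vxSetParameterByIndex(exp_node, 1, (vx_reference)output_tensor));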