diff --git a/amd_openvx_extensions/amd_nn/CMakeLists.txt b/amd_openvx_extensions/amd_nn/CMakeLists.txt index 5cdd536593..96627ed09b 100644 --- a/amd_openvx_extensions/amd_nn/CMakeLists.txt +++ b/amd_openvx_extensions/amd_nn/CMakeLists.txt @@ -83,6 +83,7 @@ list(APPEND SOURCES src/gather_layer.cpp src/topk_layer.cpp src/reduce_min.cpp + src/tile_layer.cpp src/profiler.cpp ) diff --git a/amd_openvx_extensions/amd_nn/README.md b/amd_openvx_extensions/amd_nn/README.md index 5fb06c4167..a8e5e71143 100644 --- a/amd_openvx_extensions/amd_nn/README.md +++ b/amd_openvx_extensions/amd_nn/README.md @@ -37,6 +37,7 @@ vx_nn is an OpenVX Neural Network extension module. This implementation supports | Tensor Min|vxTensorMinNode|com.amd.nn_extension.tensor_min | | Tensor Multiply|vxTensorMultiplyNode|org.khronos.openvx.tensor_multiply | | Tensor Subtract|vxTensorSubtractNode|org.khronos.openvx.tensor_subtract | +| Tile|vxTileLayer|com.amd.nn_extension.tile_layer | | TopK|vxTopKLayer|com.amd.nn_extension.topk_layer| | Upsample Nearest Neighborhood|vxUpsampleNearestLayer|com.amd.nn_extension.upsample_nearest_layer | diff --git a/amd_openvx_extensions/amd_nn/include/vx_amd_nn.h b/amd_openvx_extensions/amd_nn/include/vx_amd_nn.h index ebf02cf139..3b5e2ec6fe 100644 --- a/amd_openvx_extensions/amd_nn/include/vx_amd_nn.h +++ b/amd_openvx_extensions/amd_nn/include/vx_amd_nn.h @@ -357,4 +357,15 @@ VX_API_ENTRY vx_node VX_API_CALL vxTopKLayer(vx_graph graph, vx_tensor x_tensor, */ VX_API_ENTRY vx_node VX_API_CALL vxReduceMinLayer(vx_graph graph, vx_tensor data, vx_array axes, vx_int32 keepdims, vx_tensor reduced); +/* \brief [Graph] Creates a Tile Layer Node. + * \details Constructs a tensor by tiling a given tensor. + * \param [in] graph The handle to the graph. + * \param [in] input The input tensor data. + * \param [in] repeats 1D int64 tensor of the same length as input's dimension number, includes numbers of repeated copies along input's dimensions. + * \param [out] output Output tensor of the same dimension and type as tensor input. output_dim[i] = input_dim[i] * repeats[i] + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxTileLayer(vx_graph graph, vx_tensor input, vx_tensor repeats, vx_tensor output); + #endif diff --git a/amd_openvx_extensions/amd_nn/src/kernels.cpp b/amd_openvx_extensions/amd_nn/src/kernels.cpp index 35f81e2648..de31da2766 100644 --- a/amd_openvx_extensions/amd_nn/src/kernels.cpp +++ b/amd_openvx_extensions/amd_nn/src/kernels.cpp @@ -253,6 +253,7 @@ SHARED_PUBLIC vx_status VX_API_CALL vxPublishKernels(vx_context context) ERROR_CHECK_STATUS(publishGatherLayer(context)); ERROR_CHECK_STATUS(publishTopKLayer(context)); ERROR_CHECK_STATUS(publishReduceMinLayer(context)); + ERROR_CHECK_STATUS(publishTileLayer(context)); // register drama rules AgoNodeMergeRule softmax_rule = { diff --git a/amd_openvx_extensions/amd_nn/src/kernels.h b/amd_openvx_extensions/amd_nn/src/kernels.h index 2c5e2531f8..16f8721c16 100644 --- a/amd_openvx_extensions/amd_nn/src/kernels.h +++ b/amd_openvx_extensions/amd_nn/src/kernels.h @@ -115,6 +115,7 @@ enum user_kernel_e VX_KERNEL_GATHER_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x016, VX_KERNEL_TOPK_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x017, VX_KERNEL_REDUCE_MIN_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x018, + VX_KERNEL_TILE_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x019, }; ////////////////////////////////////////////////////////////////////// @@ -173,6 +174,7 @@ vx_status publishNMSLayer(vx_context context); vx_status publishGatherLayer(vx_context context); vx_status publishTopKLayer(vx_context context); vx_status publishReduceMinLayer(vx_context context); +vx_status publishTileLayer(vx_context context); ////////////////////////////////////////////////////////////////////// //! \brief The module entry point for publishing/unpublishing kernels diff --git a/amd_openvx_extensions/amd_nn/src/tile_layer.cpp b/amd_openvx_extensions/amd_nn/src/tile_layer.cpp new file mode 100644 index 0000000000..652c97d53c --- /dev/null +++ b/amd_openvx_extensions/amd_nn/src/tile_layer.cpp @@ -0,0 +1,167 @@ +#include + +static vx_status VX_CALLBACK validateTileLayer(vx_node node, const vx_reference *parameters, vx_uint32 num, vx_meta_format metas[]) { + vx_enum type, type2, out_type; + vx_size num_dims, num_dims2, out_num_dims; + vx_size input_dims[4], input_dims2[4], output_dims[4]; + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); + if ((type != VX_TYPE_FLOAT32) && (type != VX_TYPE_FLOAT16)) return VX_ERROR_INVALID_TYPE; + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims))); + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims2, sizeof(num_dims2))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type2, sizeof(type2))); + if ((type2 != VX_TYPE_INT32) && (type2 != VX_TYPE_INT64)) return VX_ERROR_INVALID_TYPE; + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, input_dims2, sizeof(input_dims2))); + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DATA_TYPE, &out_type, sizeof(out_type))); + if ((out_type != VX_TYPE_FLOAT32) && (out_type != VX_TYPE_FLOAT16)) return VX_ERROR_INVALID_TYPE; + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims))); + + if ((num_dims != out_num_dims) || (num_dims != input_dims2[0])) { + printf("validate: tile: Ranks of input, repeat, and output tensors should be equal\n"); + return VX_ERROR_INVALID_DIMENSION; + } + + ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_DATA_TYPE, &out_type, sizeof(out_type))); + ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(num_dims))); + ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_DIMS, &output_dims, sizeof(output_dims))); + return VX_SUCCESS; + +} + +static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node, + vx_bool use_opencl_1_2, + vx_uint32& supported_target_affinity +) +{ + supported_target_affinity = AGO_TARGET_AFFINITY_GPU; + return VX_SUCCESS; +} + +static vx_status VX_CALLBACK opencl_codegen( + vx_node node, // [input] node + const vx_reference parameters[], // [input] parameters + vx_uint32 num, // [input] number of parameters + bool opencl_load_function, // [input] false: normal OpenCL kernel; true: reserved + char opencl_kernel_function_name[64], // [output] kernel_name for clCreateKernel() + std::string& opencl_kernel_code, // [output] string for clCreateProgramWithSource() + std::string& opencl_build_options, // [output] options for clBuildProgram() + vx_uint32& opencl_work_dim, // [output] work_dim for clEnqueueNDRangeKernel() + vx_size opencl_global_work[], // [output] global_work[] for clEnqueueNDRangeKernel() + vx_size opencl_local_work[], // [output] local_work[] for clEnqueueNDRangeKernel() + vx_uint32& opencl_local_buffer_usage_mask, // [output] reserved: must be ZERO + vx_uint32& opencl_local_buffer_size_in_bytes // [output] reserved: must be ZERO +) +{ + //get tensor dimensions + vx_size input_dims[4], output_dims[4]; + vx_size num_of_dims; + vx_enum type; + + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); + ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims))); + + for (int i=0; i<4; i++) { //convert input tensor to 4-D + if(i >= num_of_dims) { + input_dims[i] = 1; + output_dims[i] = 1; + } + } + + strcpy(opencl_kernel_function_name, "tile_layer"); + + opencl_work_dim = 3; + opencl_global_work[0] = output_dims[0]; + opencl_global_work[1] = output_dims[1]; + opencl_global_work[2] = output_dims[2] * output_dims[3]; + + // Setting variables required by the interface + opencl_local_buffer_usage_mask = 0; + opencl_local_buffer_size_in_bytes = 0; + + if (num_of_dims) { + char item[8192]; + if (type == VX_TYPE_FLOAT32) { + sprintf(item, + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "__kernel void %s(__global uchar * in, uint in_offset, uint4 in_stride, __global uchar * rep, uint rep_offset, uint4 rep_stride, __global uchar * out, uint out_offset, uint4 out_stride) \n" + "{ \n" + " uint x = get_global_id(0);\n" + " uint y = get_global_id(1);\n" + " uint c = get_global_id(2);\n" + " uint nx = x %% %d;\n" + " uint ny = y %% %d;\n" + " uint nc = c %% %d;\n" + " float value = *(__global float *)&in[in_offset + nx*in_stride.s0 + ny*in_stride.s1 + nc*in_stride.s2];\n" + " uint offset = out_offset + x*out_stride.s0 + y*out_stride.s1 + c*out_stride.s2;\n" + " out += offset;\n" + " *(__global float *)&out[0] = value;\n" + "}\n", opencl_kernel_function_name, (int)input_dims[0], (int)input_dims[1], (int)input_dims[2]); + } + else { + sprintf(item, + "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" + "__kernel void %s(__global uchar * in, uint in_offset, uint4 in_stride, __global uchar * rep, uint rep_offset, uint4 rep_stride, __global uchar * out, uint out_offset, uint4 out_stride) \n" + "{ \n" + " uint x = get_global_id(0);\n" + " uint y = get_global_id(1);\n" + " uint c = get_global_id(2);\n" + " uint nx = x %% %d;\n" + " uint ny = y %% %d;\n" + " uint nc = c %% %d;\n" + " half value = *(__global half *)&in[in_offset + nx*in_stride.s0 + ny*in_stride.s1 + nc*in_stride.s2];\n" + " uint offset = out_offset + x*out_stride.s0 + y*out_stride.s1 + c*out_stride.s2;\n" + " out += offset;\n" + " *(__global half *)&out[0] = value;\n" + "}\n", opencl_kernel_function_name, (int)input_dims[0], (int)input_dims[1], (int)input_dims[2]); + } + opencl_kernel_code = item; + } + return VX_SUCCESS; +} + +//! \brief The kernel execution. +static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num) +{ + return VX_ERROR_NOT_IMPLEMENTED; +} + +//! \brief The kernel publisher. +vx_status publishTileLayer(vx_context context) { + vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.tile_layer", VX_KERNEL_TILE_LAYER_AMD, host_kernel, 3, validateTileLayer, nullptr, nullptr); + ERROR_CHECK_OBJECT(kernel); + + amd_kernel_query_target_support_f query_target_support_f = query_target_support; + amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen; + ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f))); + ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f))); + + ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); + ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); + ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); + + ERROR_CHECK_STATUS(vxFinalizeKernel(kernel)); + ERROR_CHECK_STATUS(vxReleaseKernel(&kernel)); + return VX_SUCCESS; +} + +VX_API_ENTRY vx_node VX_API_CALL vxTileLayer(vx_graph graph, vx_tensor input, vx_tensor repeats, vx_tensor output) { + vx_node node = NULL; + vx_context context = vxGetContext((vx_reference)graph); + if (vxGetStatus((vx_reference)context) == VX_SUCCESS) { + vx_reference params[] = { + (vx_reference) input, + (vx_reference) repeats, + (vx_reference) output, + }; + node = createNode(graph, VX_KERNEL_TILE_LAYER_AMD, params, sizeof(params) / sizeof(params[0])); + } + + return node; +} + diff --git a/model_compiler/README.md b/model_compiler/README.md index daf5fc3df4..37593be058 100644 --- a/model_compiler/README.md +++ b/model_compiler/README.md @@ -351,6 +351,7 @@ Test code with preprocessing add / multiply values to normalize the input tensor |Squeeze||☑|☑| |Sub||☑|☑| |Sum||☑|| +|Tile||☑|| |TopK||☑|| |Transpose||☑|☑| |Unsqueeze||☑|☑| diff --git a/model_compiler/python/nnir.py b/model_compiler/python/nnir.py index dbda33988f..8fdc1ce774 100644 --- a/model_compiler/python/nnir.py +++ b/model_compiler/python/nnir.py @@ -206,6 +206,7 @@ def __init__(self): 'gather' : 1, 'topk' : 1, 'reduce_min' : 1, + 'tile' : 1, } def set(self,type,inputs,outputs,attr): @@ -822,6 +823,25 @@ def updateLocals(self): local.setInfo(input.type, output_shape) local.setFormat(input.format) self.addLocal(local) + elif node.type in ['tile']: + input = self.tensor_dict[node.inputs[0]] + if node.inputs[1] not in self.binaries: + raise ValueError("tile: tile by local tensor is unsupported: " + node.inputs[1]) + + repeats = np.frombuffer(self.binaries[node.inputs[1]], dtype=npType) + + repeats = np.flip(repeats) + while len(repeats) < 4: + repeats.add(1) + + for i in range(len(input.shape)): + output_shape = input.shape[i] * repeats[i] + + local = IrTensor() + local.setName(output) + local.setInfo(input.type, output_shape) + local.setFormat(input.format) + self.addLocal(local) else: raise ValueError("Unsupported IR node type: {}".format(node.type)) diff --git a/model_compiler/python/nnir_to_openvx.py b/model_compiler/python/nnir_to_openvx.py index ae1c53af4c..1901448c59 100644 --- a/model_compiler/python/nnir_to_openvx.py +++ b/model_compiler/python/nnir_to_openvx.py @@ -954,6 +954,16 @@ def generateModuleCPP(graph,fileName,virtual_tensor_flag): } """ % (node.attr.get('axis'), node.inputs[0], node.inputs[1], node.outputs[0])) + elif node.type == 'tile': + f.write( \ +""" + { + vx_node node = vxTileLayer(graph, %s, %s, %s); + ERROR_CHECK_OBJECT(node); + ERROR_CHECK_STATUS(vxReleaseNode(&node)); + } +""" + % (node.inputs[0], node.inputs[1], node.outputs[0])) elif node.type == 'reduce_min': axes = node.attr.get('axes') axes_len = -1 diff --git a/model_compiler/python/onnx_to_nnir.py b/model_compiler/python/onnx_to_nnir.py index 0c4ab4bac3..4fe48225a4 100644 --- a/model_compiler/python/onnx_to_nnir.py +++ b/model_compiler/python/onnx_to_nnir.py @@ -93,6 +93,7 @@ 'Gather' : 'gather', 'TopK' : 'topk', 'ReduceMin' : 'reduce_min', + 'Tile' : 'tile', } onnx2ir_data_type = [