diff --git a/amd_openvx_extensions/amd_nn/CMakeLists.txt b/amd_openvx_extensions/amd_nn/CMakeLists.txt
index 5cdd536593..96627ed09b 100644
--- a/amd_openvx_extensions/amd_nn/CMakeLists.txt
+++ b/amd_openvx_extensions/amd_nn/CMakeLists.txt
@@ -83,6 +83,7 @@ list(APPEND SOURCES
src/gather_layer.cpp
src/topk_layer.cpp
src/reduce_min.cpp
+ src/tile_layer.cpp
src/profiler.cpp
)
diff --git a/amd_openvx_extensions/amd_nn/README.md b/amd_openvx_extensions/amd_nn/README.md
index 5fb06c4167..a8e5e71143 100644
--- a/amd_openvx_extensions/amd_nn/README.md
+++ b/amd_openvx_extensions/amd_nn/README.md
@@ -37,6 +37,7 @@ vx_nn is an OpenVX Neural Network extension module. This implementation supports
| Tensor Min|vxTensorMinNode|com.amd.nn_extension.tensor_min |
| Tensor Multiply|vxTensorMultiplyNode|org.khronos.openvx.tensor_multiply |
| Tensor Subtract|vxTensorSubtractNode|org.khronos.openvx.tensor_subtract |
+| Tile|vxTileLayer|com.amd.nn_extension.tile_layer |
| TopK|vxTopKLayer|com.amd.nn_extension.topk_layer|
| Upsample Nearest Neighborhood|vxUpsampleNearestLayer|com.amd.nn_extension.upsample_nearest_layer |
diff --git a/amd_openvx_extensions/amd_nn/include/vx_amd_nn.h b/amd_openvx_extensions/amd_nn/include/vx_amd_nn.h
index ebf02cf139..3b5e2ec6fe 100644
--- a/amd_openvx_extensions/amd_nn/include/vx_amd_nn.h
+++ b/amd_openvx_extensions/amd_nn/include/vx_amd_nn.h
@@ -357,4 +357,15 @@ VX_API_ENTRY vx_node VX_API_CALL vxTopKLayer(vx_graph graph, vx_tensor x_tensor,
*/
VX_API_ENTRY vx_node VX_API_CALL vxReduceMinLayer(vx_graph graph, vx_tensor data, vx_array axes, vx_int32 keepdims, vx_tensor reduced);
+/*! \brief [Graph] Creates a Tile Layer Node.
+ * \details Constructs a tensor by tiling a given tensor, as in the ONNX Tile operator.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input The input tensor data.
+ * \param [in] repeats A 1-D int64 tensor with one entry per input dimension, giving the number of repeated copies along that dimension.
+ * \param [out] output The output tensor, of the same rank and element type as the input, with output_dim[i] = input_dim[i] * repeats[i].
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus.
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxTileLayer(vx_graph graph, vx_tensor input, vx_tensor repeats, vx_tensor output);
+
#endif
diff --git a/amd_openvx_extensions/amd_nn/src/kernels.cpp b/amd_openvx_extensions/amd_nn/src/kernels.cpp
index 35f81e2648..de31da2766 100644
--- a/amd_openvx_extensions/amd_nn/src/kernels.cpp
+++ b/amd_openvx_extensions/amd_nn/src/kernels.cpp
@@ -253,6 +253,7 @@ SHARED_PUBLIC vx_status VX_API_CALL vxPublishKernels(vx_context context)
ERROR_CHECK_STATUS(publishGatherLayer(context));
ERROR_CHECK_STATUS(publishTopKLayer(context));
ERROR_CHECK_STATUS(publishReduceMinLayer(context));
+ ERROR_CHECK_STATUS(publishTileLayer(context));
// register drama rules
AgoNodeMergeRule softmax_rule = {
diff --git a/amd_openvx_extensions/amd_nn/src/kernels.h b/amd_openvx_extensions/amd_nn/src/kernels.h
index 2c5e2531f8..16f8721c16 100644
--- a/amd_openvx_extensions/amd_nn/src/kernels.h
+++ b/amd_openvx_extensions/amd_nn/src/kernels.h
@@ -115,6 +115,7 @@ enum user_kernel_e
VX_KERNEL_GATHER_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x016,
VX_KERNEL_TOPK_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x017,
VX_KERNEL_REDUCE_MIN_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x018,
+ VX_KERNEL_TILE_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x019,
};
//////////////////////////////////////////////////////////////////////
@@ -173,6 +174,7 @@ vx_status publishNMSLayer(vx_context context);
vx_status publishGatherLayer(vx_context context);
vx_status publishTopKLayer(vx_context context);
vx_status publishReduceMinLayer(vx_context context);
+vx_status publishTileLayer(vx_context context);
//////////////////////////////////////////////////////////////////////
//! \brief The module entry point for publishing/unpublishing kernels
diff --git a/amd_openvx_extensions/amd_nn/src/tile_layer.cpp b/amd_openvx_extensions/amd_nn/src/tile_layer.cpp
new file mode 100644
index 0000000000..652c97d53c
--- /dev/null
+++ b/amd_openvx_extensions/amd_nn/src/tile_layer.cpp
@@ -0,0 +1,167 @@
+#include "kernels.h"
+
+// Validates the tile node: input [0] and output [2] must be FP32/FP16 tensors of
+// matching type and rank, and repeats [1] must be an INT32/INT64 tensor whose
+// length equals the input rank. Repeat values themselves live in tensor data and
+// can only be checked at execution time, not here.
+static vx_status VX_CALLBACK validateTileLayer(vx_node node, const vx_reference *parameters, vx_uint32 num, vx_meta_format metas[]) {
+    vx_enum type, type2, out_type;
+    vx_size num_dims, num_dims2, out_num_dims;
+    vx_size input_dims[4], input_dims2[4], output_dims[4];
+
+    // parameter #0: input tensor (FP32 or FP16)
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
+    if ((type != VX_TYPE_FLOAT32) && (type != VX_TYPE_FLOAT16)) return VX_ERROR_INVALID_TYPE;
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
+
+    // parameter #1: repeats tensor (INT32 or INT64)
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims2, sizeof(num_dims2)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type2, sizeof(type2)));
+    if ((type2 != VX_TYPE_INT32) && (type2 != VX_TYPE_INT64)) return VX_ERROR_INVALID_TYPE;
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, input_dims2, sizeof(input_dims2)));
+
+    // parameter #2: output tensor (FP32 or FP16)
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DATA_TYPE, &out_type, sizeof(out_type)));
+    if ((out_type != VX_TYPE_FLOAT32) && (out_type != VX_TYPE_FLOAT16)) return VX_ERROR_INVALID_TYPE;
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));
+
+    // output must be the same element type as the input (tiling copies values verbatim)
+    if (type != out_type) {
+        printf("validate: tile: input and output tensor data types must match\n");
+        return VX_ERROR_INVALID_TYPE;
+    }
+
+    // ranks must agree, and repeats must carry one entry per input dimension
+    if ((num_dims != out_num_dims) || (num_dims != input_dims2[0])) {
+        printf("validate: tile: Ranks of input, repeat, and output tensors should be equal\n");
+        return VX_ERROR_INVALID_DIMENSION;
+    }
+
+    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_DATA_TYPE, &out_type, sizeof(out_type)));
+    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)));
+    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));
+    return VX_SUCCESS;
+}
+
+// Report that this kernel can only run on the GPU (there is no host implementation).
+static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node,
+    vx_bool use_opencl_1_2,
+    vx_uint32& supported_target_affinity
+)
+{
+    supported_target_affinity = AGO_TARGET_AFFINITY_GPU;
+    return VX_SUCCESS;
+}
+
+// Generates the OpenCL kernel that implements tiling: each output element maps
+// back to input[x % in_w, y % in_h, c % in_c]. Repeat counts are implied by the
+// input/output tensor shapes, so the 'rep' buffer is not read by the kernel.
+static vx_status VX_CALLBACK opencl_codegen(
+    vx_node node,                                  // [input] node
+    const vx_reference parameters[],               // [input] parameters
+    vx_uint32 num,                                 // [input] number of parameters
+    bool opencl_load_function,                     // [input] false: normal OpenCL kernel; true: reserved
+    char opencl_kernel_function_name[64],          // [output] kernel_name for clCreateKernel()
+    std::string& opencl_kernel_code,               // [output] string for clCreateProgramWithSource()
+    std::string& opencl_build_options,             // [output] options for clBuildProgram()
+    vx_uint32& opencl_work_dim,                    // [output] work_dim for clEnqueueNDRangeKernel()
+    vx_size opencl_global_work[],                  // [output] global_work[] for clEnqueueNDRangeKernel()
+    vx_size opencl_local_work[],                   // [output] local_work[] for clEnqueueNDRangeKernel()
+    vx_uint32& opencl_local_buffer_usage_mask,     // [output] reserved: must be ZERO
+    vx_uint32& opencl_local_buffer_size_in_bytes   // [output] reserved: must be ZERO
+)
+{
+    // get tensor dimensions
+    vx_size input_dims[4], output_dims[4];
+    vx_size num_of_dims;
+    vx_enum type;
+
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));
+
+    for (vx_size i = 0; i < 4; i++) { // convert tensors to 4-D: missing higher dims are size 1
+        if (i >= num_of_dims) {
+            input_dims[i] = 1;
+            output_dims[i] = 1;
+        }
+    }
+
+    strcpy(opencl_kernel_function_name, "tile_layer");
+
+    // one work-item per output element; dims 2 and 3 are merged into global id 2
+    opencl_work_dim = 3;
+    opencl_global_work[0] = output_dims[0];
+    opencl_global_work[1] = output_dims[1];
+    opencl_global_work[2] = output_dims[2] * output_dims[3];
+
+    // Setting variables required by the interface
+    opencl_local_buffer_usage_mask = 0;
+    opencl_local_buffer_size_in_bytes = 0;
+
+    if (num_of_dims) {
+        // The FP32 and FP16 kernels differ only in the element type and the
+        // cl_khr_fp16 pragma, which must be enabled before 'half' may be used
+        // as a load/store type.
+        const char * typeName = (type == VX_TYPE_FLOAT32) ? "float" : "half";
+        const char * fp16Pragma = (type == VX_TYPE_FLOAT32) ? "" : "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+        // NOTE(review): nc only wraps the merged id by input_dims[2], so tiling
+        // along dim 3 (batch) appears unsupported -- confirm repeats[3] == 1 upstream.
+        char item[8192];
+        sprintf(item,
+            "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
+            "%s"
+            "__kernel void %s(__global uchar * in, uint in_offset, uint4 in_stride, __global uchar * rep, uint rep_offset, uint4 rep_stride, __global uchar * out, uint out_offset, uint4 out_stride) \n"
+            "{ \n"
+            "    uint x = get_global_id(0);\n"
+            "    uint y = get_global_id(1);\n"
+            "    uint c = get_global_id(2);\n"
+            "    uint nx = x %% %d;\n"
+            "    uint ny = y %% %d;\n"
+            "    uint nc = c %% %d;\n"
+            "    %s value = *(__global %s *)&in[in_offset + nx*in_stride.s0 + ny*in_stride.s1 + nc*in_stride.s2];\n"
+            "    uint offset = out_offset + x*out_stride.s0 + y*out_stride.s1 + c*out_stride.s2;\n"
+            "    out += offset;\n"
+            "    *(__global %s *)&out[0] = value;\n"
+            "}\n", fp16Pragma, opencl_kernel_function_name, (int)input_dims[0], (int)input_dims[1], (int)input_dims[2], typeName, typeName, typeName);
+        opencl_kernel_code = item;
+    }
+    return VX_SUCCESS;
+}
+
+//! \brief The kernel execution.
+static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num)
+{
+    // GPU-only kernel (see query_target_support); no CPU fallback is provided.
+    return VX_ERROR_NOT_IMPLEMENTED;
+}
+
+//! \brief The kernel publisher.
+vx_status publishTileLayer(vx_context context) {
+    // register the kernel with its validator and (stub) host entry point; 3 parameters
+    vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.tile_layer", VX_KERNEL_TILE_LAYER_AMD, host_kernel, 3, validateTileLayer, nullptr, nullptr);
+    ERROR_CHECK_OBJECT(kernel);
+
+    // enable the GPU-affinity query and OpenCL code-generation callbacks
+    amd_kernel_query_target_support_f query_target_support_f = query_target_support;
+    amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen;
+    ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f)));
+    ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f)));
+
+    // parameters: [0] input tensor, [1] repeats tensor, [2] output tensor
+    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
+    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
+    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
+
+    ERROR_CHECK_STATUS(vxFinalizeKernel(kernel));
+    ERROR_CHECK_STATUS(vxReleaseKernel(&kernel));
+    return VX_SUCCESS;
+}
+
+// Public factory: builds a tile-layer node in 'graph'; returns an error object
+// (checkable via vxGetStatus) rather than NULL on failure, matching the other
+// vx*Layer factories in this module.
+VX_API_ENTRY vx_node VX_API_CALL vxTileLayer(vx_graph graph, vx_tensor input, vx_tensor repeats, vx_tensor output) {
+    vx_node node = NULL;
+    vx_context context = vxGetContext((vx_reference)graph);
+    if (vxGetStatus((vx_reference)context) == VX_SUCCESS) {
+        vx_reference params[] = {
+            (vx_reference) input,
+            (vx_reference) repeats,
+            (vx_reference) output,
+        };
+        node = createNode(graph, VX_KERNEL_TILE_LAYER_AMD, params, sizeof(params) / sizeof(params[0]));
+    }
+
+    return node;
+}
+
diff --git a/model_compiler/README.md b/model_compiler/README.md
index daf5fc3df4..37593be058 100644
--- a/model_compiler/README.md
+++ b/model_compiler/README.md
@@ -351,6 +351,7 @@ Test code with preprocessing add / multiply values to normalize the input tensor
|Squeeze||☑|☑|
|Sub||☑|☑|
|Sum||☑||
+|Tile||☑||
|TopK||☑||
|Transpose||☑|☑|
|Unsqueeze||☑|☑|
diff --git a/model_compiler/python/nnir.py b/model_compiler/python/nnir.py
index dbda33988f..8fdc1ce774 100644
--- a/model_compiler/python/nnir.py
+++ b/model_compiler/python/nnir.py
@@ -206,6 +206,7 @@ def __init__(self):
'gather' : 1,
'topk' : 1,
'reduce_min' : 1,
+ 'tile' : 1,
}
def set(self,type,inputs,outputs,attr):
@@ -822,6 +823,25 @@ def updateLocals(self):
local.setInfo(input.type, output_shape)
local.setFormat(input.format)
self.addLocal(local)
+            elif node.type in ['tile']:
+                input = self.tensor_dict[node.inputs[0]]
+                if node.inputs[1] not in self.binaries:
+                    raise ValueError("tile: tile by local tensor is unsupported: " + node.inputs[1])
+
+                # ONNX Tile 'repeats' is a 1-D int64 initializer (was dtype=npType,
+                # an undefined name).
+                repeats = np.frombuffer(self.binaries[node.inputs[1]], dtype=np.int64)
+
+                # reverse to NNIR dimension order and pad to rank 4 with 1s
+                # (np.frombuffer yields a read-only array; np.append returns a new one)
+                repeats = np.flip(repeats)
+                if len(repeats) < 4:
+                    repeats = np.append(repeats, [1] * (4 - len(repeats)))
+
+                # output_dim[i] = input_dim[i] * repeats[i]
+                output_shape = [input.shape[i] * int(repeats[i]) for i in range(len(input.shape))]
+
+                local = IrTensor()
+                local.setName(node.outputs[0])
+                local.setInfo(input.type, output_shape)
+                local.setFormat(input.format)
+                self.addLocal(local)
else:
raise ValueError("Unsupported IR node type: {}".format(node.type))
diff --git a/model_compiler/python/nnir_to_openvx.py b/model_compiler/python/nnir_to_openvx.py
index ae1c53af4c..1901448c59 100644
--- a/model_compiler/python/nnir_to_openvx.py
+++ b/model_compiler/python/nnir_to_openvx.py
@@ -954,6 +954,16 @@ def generateModuleCPP(graph,fileName,virtual_tensor_flag):
}
"""
% (node.attr.get('axis'), node.inputs[0], node.inputs[1], node.outputs[0]))
+ elif node.type == 'tile':
+ f.write( \
+"""
+ {
+ vx_node node = vxTileLayer(graph, %s, %s, %s);
+ ERROR_CHECK_OBJECT(node);
+ ERROR_CHECK_STATUS(vxReleaseNode(&node));
+ }
+"""
+ % (node.inputs[0], node.inputs[1], node.outputs[0]))
elif node.type == 'reduce_min':
axes = node.attr.get('axes')
axes_len = -1
diff --git a/model_compiler/python/onnx_to_nnir.py b/model_compiler/python/onnx_to_nnir.py
index 0c4ab4bac3..4fe48225a4 100644
--- a/model_compiler/python/onnx_to_nnir.py
+++ b/model_compiler/python/onnx_to_nnir.py
@@ -93,6 +93,7 @@
'Gather' : 'gather',
'TopK' : 'topk',
'ReduceMin' : 'reduce_min',
+ 'Tile' : 'tile',
}
onnx2ir_data_type = [