ROCm · kiritigowda · Aug 11, 2020 · Jan 28, 2020 · Jan 29, 2020 · Aug 3, 2020
@@ -83,6 +83,7 @@ list(APPEND SOURCES
     src/gather_layer.cpp
     src/topk_layer.cpp
     src/reduce_min.cpp
+    src/tile_layer.cpp
     src/profiler.cpp
     )
 

@@ -37,6 +37,7 @@ vx_nn is an OpenVX Neural Network extension module. This implementation supports
 | Tensor Min|vxTensorMinNode|com.amd.nn_extension.tensor_min |
 | Tensor Multiply|vxTensorMultiplyNode|org.khronos.openvx.tensor_multiply |
 | Tensor Subtract|vxTensorSubtractNode|org.khronos.openvx.tensor_subtract |
+| Tile|vxTileLayer|com.amd.nn_extension.tile_layer |
 | TopK|vxTopKLayer|com.amd.nn_extension.topk_layer|
 | Upsample Nearest Neighborhood|vxUpsampleNearestLayer|com.amd.nn_extension.upsample_nearest_layer |
 

@@ -357,4 +357,15 @@ VX_API_ENTRY vx_node VX_API_CALL vxTopKLayer(vx_graph graph, vx_tensor x_tensor,
  */
 VX_API_ENTRY vx_node VX_API_CALL vxReduceMinLayer(vx_graph graph, vx_tensor data, vx_array axes, vx_int32 keepdims, vx_tensor reduced);
 
+/* \brief [Graph] Creates a Tile Layer Node.
+ * \details Constructs a tensor by tiling a given tensor.
+ * \param [in]  graph   The handle to the graph.
+ * \param [in]  input   The input tensor data.
+ * \param [in]  repeats 1D int64 tensor of the same length as input's dimension number, includes numbers of repeated copies along input's dimensions.
+ * \param [out] output  Output tensor of the same dimension and type as tensor input. output_dim[i] = input_dim[i] * repeats[i]
+ * \return <tt> vx_node</tt>.
+ * \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxTileLayer(vx_graph graph, vx_tensor input, vx_tensor repeats, vx_tensor output);
+
 #endif
@@ -253,6 +253,7 @@ SHARED_PUBLIC vx_status VX_API_CALL vxPublishKernels(vx_context context)
     ERROR_CHECK_STATUS(publishGatherLayer(context));
     ERROR_CHECK_STATUS(publishTopKLayer(context));
     ERROR_CHECK_STATUS(publishReduceMinLayer(context));
+    ERROR_CHECK_STATUS(publishTileLayer(context));
 
     // register drama rules
     AgoNodeMergeRule softmax_rule = {

@@ -115,6 +115,7 @@ enum user_kernel_e
     VX_KERNEL_GATHER_LAYER_AMD               = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x016,
     VX_KERNEL_TOPK_LAYER_AMD                 = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x017,
     VX_KERNEL_REDUCE_MIN_LAYER_AMD           = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x018,
+    VX_KERNEL_TILE_LAYER_AMD                 = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x019,
 };
 
 //////////////////////////////////////////////////////////////////////
@@ -173,6 +174,7 @@ vx_status publishNMSLayer(vx_context context);
 vx_status publishGatherLayer(vx_context context);
 vx_status publishTopKLayer(vx_context context);
 vx_status publishReduceMinLayer(vx_context context);
+vx_status publishTileLayer(vx_context context);
 
 //////////////////////////////////////////////////////////////////////
 //! \brief The module entry point for publishing/unpublishing kernels

@@ -0,0 +1,167 @@
+#include <kernels.h>
+
+static vx_status VX_CALLBACK validateTileLayer(vx_node node, const vx_reference *parameters, vx_uint32 num, vx_meta_format metas[]) {
+    vx_enum type, type2, out_type;
+    vx_size num_dims, num_dims2, out_num_dims;
+    vx_size input_dims[4], input_dims2[4], output_dims[4];
+
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
+    if ((type != VX_TYPE_FLOAT32) && (type != VX_TYPE_FLOAT16)) return VX_ERROR_INVALID_TYPE;
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
+
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims2, sizeof(num_dims2)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type2, sizeof(type2)));
+    if ((type2 != VX_TYPE_INT32) && (type2 != VX_TYPE_INT64)) return VX_ERROR_INVALID_TYPE;
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, input_dims2, sizeof(input_dims2)));
+
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DATA_TYPE, &out_type, sizeof(out_type)));
+    if ((out_type != VX_TYPE_FLOAT32) && (out_type != VX_TYPE_FLOAT16)) return VX_ERROR_INVALID_TYPE;
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));
+
+    if ((num_dims != out_num_dims) || (num_dims != input_dims2[0])) {
+        printf("validate: tile: Ranks of input, repeat, and output tensors should be equal\n");
+        return VX_ERROR_INVALID_DIMENSION;        
+    }
+
+    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_DATA_TYPE, &out_type, sizeof(out_type)));
+    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(num_dims)));
+    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_DIMS, &output_dims, sizeof(output_dims)));
+    return VX_SUCCESS;
+
+}
+
+static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node,
+    vx_bool use_opencl_1_2,
+    vx_uint32& supported_target_affinity
+)
+{
+    supported_target_affinity = AGO_TARGET_AFFINITY_GPU;
+    return VX_SUCCESS;
+}
+
+static vx_status VX_CALLBACK opencl_codegen(
+    vx_node node,                                  // [input] node
+    const vx_reference parameters[],               // [input] parameters
+    vx_uint32 num,                                 // [input] number of parameters
+    bool opencl_load_function,                     // [input]  false: normal OpenCL kernel; true: reserved
+    char opencl_kernel_function_name[64],          // [output] kernel_name for clCreateKernel()
+    std::string& opencl_kernel_code,               // [output] string for clCreateProgramWithSource()
+    std::string& opencl_build_options,             // [output] options for clBuildProgram()
+    vx_uint32& opencl_work_dim,                    // [output] work_dim for clEnqueueNDRangeKernel()
+    vx_size opencl_global_work[],                  // [output] global_work[] for clEnqueueNDRangeKernel()
+    vx_size opencl_local_work[],                   // [output] local_work[] for clEnqueueNDRangeKernel()
+    vx_uint32& opencl_local_buffer_usage_mask,     // [output] reserved: must be ZERO
+    vx_uint32& opencl_local_buffer_size_in_bytes   // [output] reserved: must be ZERO
+)
+{
+    //get tensor dimensions
+    vx_size input_dims[4], output_dims[4];
+    vx_size num_of_dims;
+    vx_enum type;
+
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
+    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));
+
+    for (int i=0; i<4; i++) {  //convert input tensor to 4-D
+        if(i >= num_of_dims) {
+            input_dims[i] = 1;
+            output_dims[i] = 1;
+        }
+    }
+
+    strcpy(opencl_kernel_function_name, "tile_layer");
+
+    opencl_work_dim = 3;    
+    opencl_global_work[0] = output_dims[0];
+    opencl_global_work[1] = output_dims[1];
+    opencl_global_work[2] = output_dims[2] * output_dims[3];
+
+    // Setting variables required by the interface
+    opencl_local_buffer_usage_mask = 0;
+    opencl_local_buffer_size_in_bytes = 0;
+
+    if (num_of_dims) {
+        char item[8192];
+        if (type == VX_TYPE_FLOAT32) {
+            sprintf(item,
+                "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
+                "__kernel void %s(__global uchar * in, uint in_offset, uint4 in_stride, __global uchar * rep, uint rep_offset, uint4 rep_stride, __global uchar * out, uint out_offset, uint4 out_stride) \n"
+                "{ \n"
+                "   uint x = get_global_id(0);\n"
+                "   uint y = get_global_id(1);\n"
+                "   uint c = get_global_id(2);\n"
+                "   uint nx = x %% %d;\n"
+                "   uint ny = y %% %d;\n"
+                "   uint nc = c %% %d;\n"
+                "   float value = *(__global float *)&in[in_offset + nx*in_stride.s0 + ny*in_stride.s1 + nc*in_stride.s2];\n"
+                "   uint offset = out_offset + x*out_stride.s0 + y*out_stride.s1 + c*out_stride.s2;\n"
+                "   out += offset;\n"
+                "   *(__global float *)&out[0] = value;\n"
+                "}\n", opencl_kernel_function_name, (int)input_dims[0], (int)input_dims[1], (int)input_dims[2]);
+        }
+        else {
+            sprintf(item,
+                "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
+                "__kernel void %s(__global uchar * in, uint in_offset, uint4 in_stride, __global uchar * rep, uint rep_offset, uint4 rep_stride, __global uchar * out, uint out_offset, uint4 out_stride) \n"
+                "{ \n"
+                "   uint x = get_global_id(0);\n"
+                "   uint y = get_global_id(1);\n"
+                "   uint c = get_global_id(2);\n"
+                "   uint nx = x %% %d;\n"
+                "   uint ny = y %% %d;\n"
+                "   uint nc = c %% %d;\n"
+                "   half value = *(__global half *)&in[in_offset + nx*in_stride.s0 + ny*in_stride.s1 + nc*in_stride.s2];\n"
+                "   uint offset = out_offset + x*out_stride.s0 + y*out_stride.s1 + c*out_stride.s2;\n"
+                "   out += offset;\n"
+                "   *(__global half *)&out[0] = value;\n"
+                "}\n", opencl_kernel_function_name, (int)input_dims[0], (int)input_dims[1], (int)input_dims[2]);
+        }
+        opencl_kernel_code = item;
+    }
+    return VX_SUCCESS;
+}
+
+//! \brief The kernel execution.
+static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num) 
+{
+    return VX_ERROR_NOT_IMPLEMENTED;
+}
+
+//! \brief The kernel publisher.
+vx_status publishTileLayer(vx_context context) {
+    vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.tile_layer", VX_KERNEL_TILE_LAYER_AMD, host_kernel, 3, validateTileLayer, nullptr, nullptr);
+    ERROR_CHECK_OBJECT(kernel);
+
+    amd_kernel_query_target_support_f query_target_support_f = query_target_support;
+    amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen;
+    ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f)));
+    ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f)));
+
+    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
+    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
+    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
+
+    ERROR_CHECK_STATUS(vxFinalizeKernel(kernel));
+    ERROR_CHECK_STATUS(vxReleaseKernel(&kernel));
+    return VX_SUCCESS; 
+}
+
+VX_API_ENTRY vx_node VX_API_CALL vxTileLayer(vx_graph graph, vx_tensor input, vx_tensor repeats, vx_tensor output) {
+    vx_node node = NULL;
+    vx_context context = vxGetContext((vx_reference)graph);
+    if (vxGetStatus((vx_reference)context) == VX_SUCCESS) {
+        vx_reference params[] = {
+            (vx_reference) input,
+            (vx_reference) repeats,
+            (vx_reference) output,
+        };
+        node = createNode(graph, VX_KERNEL_TILE_LAYER_AMD, params, sizeof(params) / sizeof(params[0]));
+    }
+
+    return node;
+}
+
@@ -351,6 +351,7 @@ Test code with preprocessing add / multiply values to normalize the input tensor
 |Squeeze||&#9745;|&#9745;|
 |Sub||&#9745;|&#9745;|
 |Sum||&#9745;||
+|Tile||&#9745;||
 |TopK||&#9745;||
 |Transpose||&#9745;|&#9745;|
 |Unsqueeze||&#9745;|&#9745;|

@@ -206,6 +206,7 @@ def __init__(self):
             'gather' : 1,
             'topk'  : 1,
             'reduce_min' : 1,
+            'tile' : 1,
         }
 
     def set(self,type,inputs,outputs,attr):
@@ -822,6 +823,25 @@ def updateLocals(self):
                     local.setInfo(input.type, output_shape)
                     local.setFormat(input.format)
                     self.addLocal(local)
+                elif node.type in ['tile']:
+                    input = self.tensor_dict[node.inputs[0]]
+                    if node.inputs[1] not in self.binaries:
+                        raise ValueError("tile: tile by local tensor is unsupported: " + node.inputs[1])
+
+                    repeats = np.frombuffer(self.binaries[node.inputs[1]], dtype=npType)
+
+                    repeats = np.flip(repeats)
+                    while len(repeats) < 4:
+                        repeats.add(1)
+
+                    for i in range(len(input.shape)):
+                        output_shape = input.shape[i] * repeats[i]
+
+                    local = IrTensor()
+                    local.setName(output)
+                    local.setInfo(input.type, output_shape)
+                    local.setFormat(input.format)
+                    self.addLocal(local)
                 else:
                     raise ValueError("Unsupported IR node type: {}".format(node.type))
 

@@ -954,6 +954,16 @@ def generateModuleCPP(graph,fileName,virtual_tensor_flag):
     }    
 """ 
     % (node.attr.get('axis'), node.inputs[0], node.inputs[1], node.outputs[0]))
+            elif node.type == 'tile':
+                f.write( \
+"""
+    { 
+      vx_node node = vxTileLayer(graph, %s, %s, %s);
+      ERROR_CHECK_OBJECT(node);
+      ERROR_CHECK_STATUS(vxReleaseNode(&node));
+    }    
+""" 
+    % (node.inputs[0], node.inputs[1], node.outputs[0]))
             elif node.type == 'reduce_min':
                 axes = node.attr.get('axes')
                 axes_len = -1

@@ -93,6 +93,7 @@
     'Gather'             : 'gather',
     'TopK'               : 'topk',
     'ReduceMin'          : 'reduce_min',
+    'Tile'               : 'tile',
 }
 
 onnx2ir_data_type = [