Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

H/tile layer #352

Merged
merged 9 commits into from
Aug 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions amd_openvx_extensions/amd_nn/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ list(APPEND SOURCES
src/gather_layer.cpp
src/topk_layer.cpp
src/reduce_min.cpp
src/tile_layer.cpp
src/profiler.cpp
)

Expand Down
1 change: 1 addition & 0 deletions amd_openvx_extensions/amd_nn/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ vx_nn is an OpenVX Neural Network extension module. This implementation supports
| Tensor Min|vxTensorMinNode|com.amd.nn_extension.tensor_min |
| Tensor Multiply|vxTensorMultiplyNode|org.khronos.openvx.tensor_multiply |
| Tensor Subtract|vxTensorSubtractNode|org.khronos.openvx.tensor_subtract |
| Tile|vxTileLayer|com.amd.nn_extension.tile_layer |
| TopK|vxTopKLayer|com.amd.nn_extension.topk_layer|
| Upsample Nearest Neighborhood|vxUpsampleNearestLayer|com.amd.nn_extension.upsample_nearest_layer |

Expand Down
11 changes: 11 additions & 0 deletions amd_openvx_extensions/amd_nn/include/vx_amd_nn.h
Original file line number Diff line number Diff line change
Expand Up @@ -357,4 +357,15 @@ VX_API_ENTRY vx_node VX_API_CALL vxTopKLayer(vx_graph graph, vx_tensor x_tensor,
*/
VX_API_ENTRY vx_node VX_API_CALL vxReduceMinLayer(vx_graph graph, vx_tensor data, vx_array axes, vx_int32 keepdims, vx_tensor reduced);

/* \brief [Graph] Creates a Tile Layer Node.
* \details Constructs a tensor by tiling a given tensor.
* \param [in] graph The handle to the graph.
* \param [in] input The input tensor data.
* \param [in] repeats 1D int64 tensor of the same length as input's dimension number, includes numbers of repeated copies along input's dimensions.
* \param [out] output Output tensor of the same dimension and type as tensor input. output_dim[i] = input_dim[i] * repeats[i]
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*/
VX_API_ENTRY vx_node VX_API_CALL vxTileLayer(vx_graph graph, vx_tensor input, vx_tensor repeats, vx_tensor output);

#endif
1 change: 1 addition & 0 deletions amd_openvx_extensions/amd_nn/src/kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ SHARED_PUBLIC vx_status VX_API_CALL vxPublishKernels(vx_context context)
ERROR_CHECK_STATUS(publishGatherLayer(context));
ERROR_CHECK_STATUS(publishTopKLayer(context));
ERROR_CHECK_STATUS(publishReduceMinLayer(context));
ERROR_CHECK_STATUS(publishTileLayer(context));

// register drama rules
AgoNodeMergeRule softmax_rule = {
Expand Down
2 changes: 2 additions & 0 deletions amd_openvx_extensions/amd_nn/src/kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ enum user_kernel_e
VX_KERNEL_GATHER_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x016,
VX_KERNEL_TOPK_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x017,
VX_KERNEL_REDUCE_MIN_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x018,
VX_KERNEL_TILE_LAYER_AMD = VX_KERNEL_BASE(VX_ID_AMD, NN_EXTENSION_LIBRARY) + 0x019,
};

//////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -173,6 +174,7 @@ vx_status publishNMSLayer(vx_context context);
vx_status publishGatherLayer(vx_context context);
vx_status publishTopKLayer(vx_context context);
vx_status publishReduceMinLayer(vx_context context);
vx_status publishTileLayer(vx_context context);

//////////////////////////////////////////////////////////////////////
//! \brief The module entry point for publishing/unpublishing kernels
Expand Down
167 changes: 167 additions & 0 deletions amd_openvx_extensions/amd_nn/src/tile_layer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#include <kernels.h>

static vx_status VX_CALLBACK validateTileLayer(vx_node node, const vx_reference *parameters, vx_uint32 num, vx_meta_format metas[]) {
vx_enum type, type2, out_type;
vx_size num_dims, num_dims2, out_num_dims;
vx_size input_dims[4], input_dims2[4], output_dims[4];

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
if ((type != VX_TYPE_FLOAT32) && (type != VX_TYPE_FLOAT16)) return VX_ERROR_INVALID_TYPE;
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims2, sizeof(num_dims2)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type2, sizeof(type2)));
if ((type2 != VX_TYPE_INT32) && (type2 != VX_TYPE_INT64)) return VX_ERROR_INVALID_TYPE;
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, input_dims2, sizeof(input_dims2)));

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DATA_TYPE, &out_type, sizeof(out_type)));
if ((out_type != VX_TYPE_FLOAT32) && (out_type != VX_TYPE_FLOAT16)) return VX_ERROR_INVALID_TYPE;
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));

if ((num_dims != out_num_dims) || (num_dims != input_dims2[0])) {
printf("validate: tile: Ranks of input, repeat, and output tensors should be equal\n");
return VX_ERROR_INVALID_DIMENSION;
}

ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_DATA_TYPE, &out_type, sizeof(out_type)));
ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(num_dims)));
ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[2], VX_TENSOR_DIMS, &output_dims, sizeof(output_dims)));
return VX_SUCCESS;

}

static vx_status VX_CALLBACK query_target_support(vx_graph graph, vx_node node,
vx_bool use_opencl_1_2,
vx_uint32& supported_target_affinity
)
{
supported_target_affinity = AGO_TARGET_AFFINITY_GPU;
return VX_SUCCESS;
}

static vx_status VX_CALLBACK opencl_codegen(
vx_node node, // [input] node
const vx_reference parameters[], // [input] parameters
vx_uint32 num, // [input] number of parameters
bool opencl_load_function, // [input] false: normal OpenCL kernel; true: reserved
char opencl_kernel_function_name[64], // [output] kernel_name for clCreateKernel()
std::string& opencl_kernel_code, // [output] string for clCreateProgramWithSource()
std::string& opencl_build_options, // [output] options for clBuildProgram()
vx_uint32& opencl_work_dim, // [output] work_dim for clEnqueueNDRangeKernel()
vx_size opencl_global_work[], // [output] global_work[] for clEnqueueNDRangeKernel()
vx_size opencl_local_work[], // [output] local_work[] for clEnqueueNDRangeKernel()
vx_uint32& opencl_local_buffer_usage_mask, // [output] reserved: must be ZERO
vx_uint32& opencl_local_buffer_size_in_bytes // [output] reserved: must be ZERO
)
{
//get tensor dimensions
vx_size input_dims[4], output_dims[4];
vx_size num_of_dims;
vx_enum type;

ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[2], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));

for (int i=0; i<4; i++) { //convert input tensor to 4-D
if(i >= num_of_dims) {
input_dims[i] = 1;
output_dims[i] = 1;
}
}

strcpy(opencl_kernel_function_name, "tile_layer");

opencl_work_dim = 3;
opencl_global_work[0] = output_dims[0];
opencl_global_work[1] = output_dims[1];
opencl_global_work[2] = output_dims[2] * output_dims[3];

// Setting variables required by the interface
opencl_local_buffer_usage_mask = 0;
opencl_local_buffer_size_in_bytes = 0;

if (num_of_dims) {
char item[8192];
if (type == VX_TYPE_FLOAT32) {
sprintf(item,
"#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
"__kernel void %s(__global uchar * in, uint in_offset, uint4 in_stride, __global uchar * rep, uint rep_offset, uint4 rep_stride, __global uchar * out, uint out_offset, uint4 out_stride) \n"
"{ \n"
" uint x = get_global_id(0);\n"
" uint y = get_global_id(1);\n"
" uint c = get_global_id(2);\n"
" uint nx = x %% %d;\n"
" uint ny = y %% %d;\n"
" uint nc = c %% %d;\n"
" float value = *(__global float *)&in[in_offset + nx*in_stride.s0 + ny*in_stride.s1 + nc*in_stride.s2];\n"
" uint offset = out_offset + x*out_stride.s0 + y*out_stride.s1 + c*out_stride.s2;\n"
" out += offset;\n"
" *(__global float *)&out[0] = value;\n"
"}\n", opencl_kernel_function_name, (int)input_dims[0], (int)input_dims[1], (int)input_dims[2]);
}
else {
sprintf(item,
"#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
"__kernel void %s(__global uchar * in, uint in_offset, uint4 in_stride, __global uchar * rep, uint rep_offset, uint4 rep_stride, __global uchar * out, uint out_offset, uint4 out_stride) \n"
"{ \n"
" uint x = get_global_id(0);\n"
" uint y = get_global_id(1);\n"
" uint c = get_global_id(2);\n"
" uint nx = x %% %d;\n"
" uint ny = y %% %d;\n"
" uint nc = c %% %d;\n"
" half value = *(__global half *)&in[in_offset + nx*in_stride.s0 + ny*in_stride.s1 + nc*in_stride.s2];\n"
" uint offset = out_offset + x*out_stride.s0 + y*out_stride.s1 + c*out_stride.s2;\n"
" out += offset;\n"
" *(__global half *)&out[0] = value;\n"
"}\n", opencl_kernel_function_name, (int)input_dims[0], (int)input_dims[1], (int)input_dims[2]);
}
opencl_kernel_code = item;
}
return VX_SUCCESS;
}

//! \brief The kernel execution.
static vx_status VX_CALLBACK host_kernel(vx_node node, const vx_reference * parameters, vx_uint32 num)
{
return VX_ERROR_NOT_IMPLEMENTED;
}

//! \brief The kernel publisher.
vx_status publishTileLayer(vx_context context) {
vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.tile_layer", VX_KERNEL_TILE_LAYER_AMD, host_kernel, 3, validateTileLayer, nullptr, nullptr);
ERROR_CHECK_OBJECT(kernel);

amd_kernel_query_target_support_f query_target_support_f = query_target_support;
amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen;
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f)));
ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f)));

ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));

ERROR_CHECK_STATUS(vxFinalizeKernel(kernel));
ERROR_CHECK_STATUS(vxReleaseKernel(&kernel));
return VX_SUCCESS;
}

VX_API_ENTRY vx_node VX_API_CALL vxTileLayer(vx_graph graph, vx_tensor input, vx_tensor repeats, vx_tensor output) {
vx_node node = NULL;
vx_context context = vxGetContext((vx_reference)graph);
if (vxGetStatus((vx_reference)context) == VX_SUCCESS) {
vx_reference params[] = {
(vx_reference) input,
(vx_reference) repeats,
(vx_reference) output,
};
node = createNode(graph, VX_KERNEL_TILE_LAYER_AMD, params, sizeof(params) / sizeof(params[0]));
}

return node;
}

1 change: 1 addition & 0 deletions model_compiler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ Test code with preprocessing add / multiply values to normalize the input tensor
|Squeeze||&#9745;|&#9745;|
|Sub||&#9745;|&#9745;|
|Sum||&#9745;||
|Tile||&#9745;||
|TopK||&#9745;||
|Transpose||&#9745;|&#9745;|
|Unsqueeze||&#9745;|&#9745;|
Expand Down
20 changes: 20 additions & 0 deletions model_compiler/python/nnir.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def __init__(self):
'gather' : 1,
'topk' : 1,
'reduce_min' : 1,
'tile' : 1,
}

def set(self,type,inputs,outputs,attr):
Expand Down Expand Up @@ -822,6 +823,25 @@ def updateLocals(self):
local.setInfo(input.type, output_shape)
local.setFormat(input.format)
self.addLocal(local)
elif node.type in ['tile']:
input = self.tensor_dict[node.inputs[0]]
if node.inputs[1] not in self.binaries:
raise ValueError("tile: tile by local tensor is unsupported: " + node.inputs[1])

repeats = np.frombuffer(self.binaries[node.inputs[1]], dtype=npType)

repeats = np.flip(repeats)
while len(repeats) < 4:
repeats.add(1)

for i in range(len(input.shape)):
output_shape = input.shape[i] * repeats[i]

local = IrTensor()
local.setName(output)
local.setInfo(input.type, output_shape)
local.setFormat(input.format)
self.addLocal(local)
else:
raise ValueError("Unsupported IR node type: {}".format(node.type))

Expand Down
10 changes: 10 additions & 0 deletions model_compiler/python/nnir_to_openvx.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,16 @@ def generateModuleCPP(graph,fileName,virtual_tensor_flag):
}
"""
% (node.attr.get('axis'), node.inputs[0], node.inputs[1], node.outputs[0]))
elif node.type == 'tile':
f.write( \
"""
{
vx_node node = vxTileLayer(graph, %s, %s, %s);
ERROR_CHECK_OBJECT(node);
ERROR_CHECK_STATUS(vxReleaseNode(&node));
}
"""
% (node.inputs[0], node.inputs[1], node.outputs[0]))
elif node.type == 'reduce_min':
axes = node.attr.get('axes')
axes_len = -1
Expand Down
1 change: 1 addition & 0 deletions model_compiler/python/onnx_to_nnir.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
'Gather' : 'gather',
'TopK' : 'topk',
'ReduceMin' : 'reduce_min',
'Tile' : 'tile',
}

onnx2ir_data_type = [
Expand Down