From a0aa9339a9c9d7cf5e81b54bb5577205ba0fd4d9 Mon Sep 17 00:00:00 2001 From: Hansel Yang Date: Wed, 17 Feb 2021 23:18:27 -0800 Subject: [PATCH] GPU Fix - multiply gpu (#39) * CMake * multiply fix * code cleanup --- amd_openvx/openvx/ago/ago_kernel_api.cpp | 16 +++++----- amd_openvx/openvx/ago/ago_util_opencl.cpp | 38 +++++++++++------------ 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/amd_openvx/openvx/ago/ago_kernel_api.cpp b/amd_openvx/openvx/ago/ago_kernel_api.cpp index 6e6778a363..4481ee7883 100644 --- a/amd_openvx/openvx/ago/ago_kernel_api.cpp +++ b/amd_openvx/openvx/ago/ago_kernel_api.cpp @@ -8987,14 +8987,14 @@ int agoKernel_Mul_S16_S16S16_Wrap_Round(AgoNode * node, AgoKernelCommand cmd) "void %s (S16x8 * p0, S16x8 p1, S16x8 p2, float p3)\n" "{\n" " S16x8 r;\n" - " r.s0 = ((int)convert_short_rte(p3 * (float)((((int)(p1.s0)) << 16) >> 16) * (float)((((int)(p2.s0)) << 16) >> 16)) & 0x0000ffff) ;\n" - " r.s0 |= ((int)convert_short_rte(p3 * (float)(((int)(p1.s0)) >> 16) * (float)(((int)(p2.s0)) >> 16)) ) << 16;\n" - " r.s1 = ((int)convert_short_rte(p3 * (float)((((int)(p1.s1)) << 16) >> 16) * (float)((((int)(p2.s1)) << 16) >> 16)) & 0x0000ffff);\n" - " r.s1 |= ((int)convert_short_rte(p3 * (float)(((int)(p1.s1)) >> 16) * (float)(((int)(p2.s1)) >> 16)) ) << 16;\n" - " r.s2 = ((int)convert_short_rte(p3 * (float)((((int)(p1.s2)) << 16) >> 16) * (float)((((int)(p2.s2)) << 16) >> 16)) & 0x0000ffff) ;\n" - " r.s2 |= ((int)convert_short_rte(p3 * (float)(((int)(p1.s2)) >> 16) * (float)(((int)(p2.s2)) >> 16)) ) << 16;\n" - " r.s3 = ((int)convert_short_rte(p3 * (float)((((int)(p1.s3)) << 16) >> 16) * (float)((((int)(p2.s3)) << 16) >> 16)) & 0x0000ffff) ;\n" - " r.s3 |= ((int)convert_short_rte(p3 * (float)(((int)(p1.s3)) >> 16) * (float)(((int)(p2.s3)) >> 16)) ) << 16;\n" + " r.s0 = ((int)convert_int_rte(p3 * (float)((((int)(p1.s0)) << 16) >> 16) * (float)((((int)(p2.s0)) << 16) >> 16)) & 0x0000ffff) ;\n" + " r.s0 |= ((int)convert_int_rte(p3 * (float)(((int)(p1.s0)) >> 16) * (float)(((int)(p2.s0)) >> 16)) ) << 16;\n" + " r.s1 = ((int)convert_int_rte(p3 * (float)((((int)(p1.s1)) << 16) >> 16) * (float)((((int)(p2.s1)) << 16) >> 16)) & 0x0000ffff);\n" + " r.s1 |= ((int)convert_int_rte(p3 * (float)(((int)(p1.s1)) >> 16) * (float)(((int)(p2.s1)) >> 16)) ) << 16;\n" + " r.s2 = ((int)convert_int_rte(p3 * (float)((((int)(p1.s2)) << 16) >> 16) * (float)((((int)(p2.s2)) << 16) >> 16)) & 0x0000ffff) ;\n" + " r.s2 |= ((int)convert_int_rte(p3 * (float)(((int)(p1.s2)) >> 16) * (float)(((int)(p2.s2)) >> 16)) ) << 16;\n" + " r.s3 = ((int)convert_int_rte(p3 * (float)((((int)(p1.s3)) << 16) >> 16) * (float)((((int)(p2.s3)) << 16) >> 16)) & 0x0000ffff) ;\n" + " r.s3 |= ((int)convert_int_rte(p3 * (float)(((int)(p1.s3)) >> 16) * (float)(((int)(p2.s3)) >> 16)) ) << 16;\n" " *p0 = r;\n" "}\n" ), node->opencl_name); diff --git a/amd_openvx/openvx/ago/ago_util_opencl.cpp b/amd_openvx/openvx/ago/ago_util_opencl.cpp index c244ccbbc4..b2d94ff28d 100644 --- a/amd_openvx/openvx/ago/ago_util_opencl.cpp +++ b/amd_openvx/openvx/ago/ago_util_opencl.cpp @@ -1738,25 +1738,25 @@ int agoGpuOclSuperNodeFinalize(AgoGraph * graph, AgoSuperNode * supernode) " r |= (p1.s1 >> 17) & 128;\n" " *p0 = r;\n" "}\n" - "void Convert_U8_S16 (U8x8 *p0, S16X8 p1)\n" - "{\n" - " U8x8 r;\n" - " p1.s0 = (~p1.s0) + 1; \n" - " p1.s1 = (~p1.s1) + 1; \n" - " p1.s2 = (~p1.s2) + 1; \n" - " p1.s3 = (~p1.s3) + 1; \n" - " r = p1.s0 & 0x000000ff;\n" - " r |= (p1.s0 >> 15) & 0x0000ff00;\n" - " r |= (p1.s1) & 0x000000ff;\n" - " r |= (p1.s1 >> 15) & 0x0000ff00;\n" - " p0.s0 = r;\n" - " U8x8 q;\n" - " q = p1.s2 & 0x000000ff;\n" - " q |= (p1.s2 >> 15) & 0x0000ff00;\n" - " q |= (p1.s3) & 0x000000ff;\n" - " q |= (p1.s3 >> 15) & 0x0000ff00;\n" - " p0.s1 = q;\n" - "}\n" + // "void Convert_U8_S16 (U8x8 *p0, S16X8 p1)\n" + // "{\n" + // " U8x8 r;\n" + // " p1.s0 = (~p1.s0) + 1; \n" + // " p1.s1 = (~p1.s1) + 1; \n" + // " p1.s2 = (~p1.s2) + 1; \n" + // " p1.s3 = (~p1.s3) + 1; \n" + // " r = p1.s0 & 0x000000ff;\n" + // " r |= (p1.s0 >> 15) & 0x0000ff00;\n" + // " r |= (p1.s1) & 0x000000ff;\n" + // " r |= (p1.s1 >> 15) & 0x0000ff00;\n" + // " p0.s0 = r;\n" + // " U8x8 q;\n" + // " q = p1.s2 & 0x000000ff;\n" + // " q |= (p1.s2 >> 15) & 0x0000ff00;\n" + // " q |= (p1.s3) & 0x000000ff;\n" + // " q |= (p1.s3 >> 15) & 0x0000ff00;\n" + // " p0.s1 = q;\n" + // "}\n" ); for (size_t index = 0; index < supernode->nodeList.size(); index++) { // get node and set node name