diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 61a8ee8a57b6..fa4ca7d34ba8 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -370,17 +370,19 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType std::string CodeGenOpenCL::CastTo(std::string value, DataType target) { std::ostringstream os; - if (target.lanes() == 1) { - os << "(("; + if (target == DataType::Bool()) { + os << "("; + os << "("; this->PrintType(target, os); os << ")" << value << ")"; - } else { // convert vector type + return os.str(); + } else { os << "("; os << "convert_"; this->PrintType(target, os); os << "(" << value << "))"; + return os.str(); } - return os.str(); } void CodeGenOpenCL::VisitStmt_(const AllocateNode* op) { diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py index 67dc37363ea9..4a426c952b56 100644 --- a/tests/python/unittest/test_target_codegen_opencl.py +++ b/tests/python/unittest/test_target_codegen_opencl.py @@ -32,10 +32,12 @@ def check_if_then_else(dev, n, dtype): max_lhs = tvm.tir.const(2, dtype=dtype) max_rhs = tvm.tir.if_then_else(A[0] > 0, true_value, false_value) C = te.compute((n,), lambda i: tvm.te.max(max_lhs, max_rhs), name="C") - s = te.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) - fun = tvm.build(s, [A, C], target) + func = te.create_prim_func([A, C]) + sch = tvm.tir.Schedule(func) + (x,) = sch.get_loops(sch.get_block("C")) + sch.bind(x, "threadIdx.x") + fun = tvm.build(sch.mod, target=target) a = tvm.nd.empty((n,), A.dtype, dev) c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here @@ -48,9 +50,11 @@ def check_select(dev, n, dtype): max_lhs = tvm.tir.const(2, dtype=dtype) max_rhs = tvm.tir.Select(A[0] > 0, true_value, false_value) C = te.compute((n,), lambda i: tvm.te.max(max_lhs, max_rhs), name="C") - s = te.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) - fun = tvm.build(s, [A, C], target) + func = te.create_prim_func([A, C]) + sch = tvm.tir.Schedule(func) + (x,) = sch.get_loops(sch.get_block("C")) + sch.bind(x, "threadIdx.x") + fun = tvm.build(sch.mod, target=target) a = tvm.nd.empty((n,), A.dtype, dev) c = tvm.nd.empty((n,), A.dtype, dev) @@ -76,9 +80,11 @@ def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") - s = te.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) - fun = tvm.build(s, [A, C], target) + func = te.create_prim_func([A, C]) + sch = tvm.tir.Schedule(func) + (x,) = sch.get_loops(sch.get_block("C")) + sch.bind(x, "threadIdx.x") + fun = tvm.build(sch.mod, target=target) a = tvm.nd.empty((n,), A.dtype, dev) c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here @@ -102,9 +108,11 @@ def check_max(dev, n, dtype): max_lhs = A[0] + tvm.tir.const(1, dtype=dtype) max_rhs = tvm.tir.const(0, dtype=dtype) C = te.compute((n,), lambda i: tvm.te.max(max_lhs, max_rhs), name="C") - s = te.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) - fun = tvm.build(s, [A, C], target) + func = te.create_prim_func([A, C]) + sch = tvm.tir.Schedule(func) + (x,) = sch.get_loops(sch.get_block("C")) + sch.bind(x, "threadIdx.x") + fun = tvm.build(sch.mod, target=target) a = tvm.nd.empty((n,), A.dtype, dev) c = tvm.nd.empty((n,), A.dtype, dev) @@ -150,7 +158,7 @@ def check_type_casting(ctx, n, dtype): tvm.tir.all( *[ i // block_size == tvm.tir.const(3, "int32"), - i % block_size == tvm.tir.const(3, "int32"), + i % 3 == tvm.tir.const(1, "int32"), ] ), tvm.tir.const(1, dtype), @@ -158,42 +166,26 @@ def check_type_casting(ctx, n, dtype): ), name="C", ) - s = te.create_schedule(C.op) - (tx, vx) = s[C].split(s[C].op.axis[0], factor=block_size) - s[C].vectorize(vx) - thrx = te.thread_axis("threadIdx.x") - - s[C].bind(tx, thrx) - fun = tvm.build(s, [C], target) - + # NOTE: test simple convert pattern + func = te.create_prim_func([C]) + sch = tvm.tir.Schedule(func) + (x,) = sch.get_loops(sch.get_block("C")) + tx, vx = sch.split(x, factors=[None, block_size]) + sch.bind(tx, "threadIdx.x") + sch.vectorize(vx) + + fun = tvm.build(sch.mod, target=target) c = tvm.nd.empty((n,), dtype, ctx) assembly = fun.imported_modules[0].get_source() - - if dtype == "float32": - false_branch = "((float4)(0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f))" - true_branch = "((float4)(1.000000e+00f, 1.000000e+00f, 1.000000e+00f, 1.000000e+00f))" - lcond = "convert_int4(((convert_uint4(((uint4)((((int)get_local_id(0)) == 3), (((int)get_local_id(0)) == 3), (((int)get_local_id(0)) == 3), (((int)get_local_id(0)) == 3)))))" - rcond = "(convert_uint4((((int4)((0)+(1*0), (0)+(1*1), (0)+(1*2), (0)+(1*3))) == ((int4)(3, 3, 3, 3)))))" - cond = "({} && {})".format(lcond, rcond) - select = "select({}, {}, {})".format(false_branch, true_branch, cond) - count = assembly.count(select) - assert count == 1 - fun(c) - - elif dtype == "float16": - false_branch = "((half4)((half)0.000000e+00f, (half)0.000000e+00f, (half)0.000000e+00f, (half)0.000000e+00f))" - true_branch = "((half4)((half)1.000000e+00f, (half)1.000000e+00f, (half)1.000000e+00f, (half)1.000000e+00f))" - lcond = "convert_short4(((convert_uint4(((uint4)((((int)get_local_id(0)) == 3), (((int)get_local_id(0)) == 3), (((int)get_local_id(0)) == 3), (((int)get_local_id(0)) == 3)))))" - rcond = "(convert_uint4((((int4)((0)+(1*0), (0)+(1*1), (0)+(1*2), (0)+(1*3))) == ((int4)(3, 3, 3, 3)))))))" - cond = "({} && {})".format(lcond, rcond) - select = "select({}, {}, {})".format(false_branch, true_branch, cond) - count = assembly.count(select) - assert count == 1 - fun(c) + lcond = "convert_int4(((convert_uint4(((uint4)(((convert_int(get_local_id(0))) == 3), ((convert_int(get_local_id(0))) == 3), ((convert_int(get_local_id(0))) == 3), ((convert_int(get_local_id(0))) == 3)))))" + rcond = "(convert_uint4(((((int4)(((convert_int(get_local_id(0))))+(1*0), ((convert_int(get_local_id(0))))+(1*1), ((convert_int(get_local_id(0))))+(1*2), ((convert_int(get_local_id(0))))+(1*3))) % ((int4)(3, 3, 3, 3))) == ((int4)(1, 1, 1, 1))))))))" + pattern_cond = "({} && {})".format(lcond, rcond) + assert assembly.count(pattern_cond) != 0 + fun(c) dev = tvm.device(target, 0) - check_type_casting(dev, 16, "float32") + check_type_casting(dev, 32, "float32") # fp16 is not yet supported in ci # check_type_casting(dev, 16, "float16") diff --git a/tests/python/unittest/test_target_texture_codegen_opencl.py b/tests/python/unittest/test_target_texture_codegen_opencl.py index 639159c495f0..5681dcf9e6a8 100644 --- a/tests/python/unittest/test_target_texture_codegen_opencl.py +++ b/tests/python/unittest/test_target_texture_codegen_opencl.py @@ -1466,8 +1466,8 @@ class TestSimpleTextureToScalarFP16: ["global.texture", (1, 1, 40, 40, 4)], ["", (1, 4, 40, 40)], [ - "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)((((int)get_local_id(0)) % 40), (((((int)get_group_id(0)) & 1) * 20) + (((int)get_local_id(0)) / 40)))));", - "out[((((int)get_group_id(0)) * 800) + ((int)get_local_id(0)))] = ((half)((float*)&v_)[(((int)get_group_id(0)) >> 1)]);", + "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));", + "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]));", ], ), # 2. Buffer (NCHW4c) -> Cast(FP16) -> Buffer (NCHW) @@ -1475,7 +1475,7 @@ class TestSimpleTextureToScalarFP16: ["", (1, 1, 40, 40, 4)], ["", (1, 4, 40, 40)], [ - "out[((((int)get_group_id(0)) * 800) + ((int)get_local_id(0)))] = ((half)p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))]);" + "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]));" ], ), # 3. Texture (NCHW4c) -> Cast(FP16) -> Texture (NCHW4c) @@ -1483,8 +1483,8 @@ class TestSimpleTextureToScalarFP16: ["global.texture", (1, 1, 40, 40, 4)], ["global.texture", (1, 1, 40, 40, 4)], [ - "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)((((((int)get_group_id(0)) * 24) + ((int)get_local_id(0))) % 40), (((((int)get_group_id(0)) * 8) + (((int)get_local_id(0)) >> 3)) / 5))));", - "write_imageh(out, (int2)((((((int)get_group_id(0)) * 24) + ((int)get_local_id(0))) % 40), (((((int)get_group_id(0)) * 8) + (((int)get_local_id(0)) >> 3)) / 5)), (convert_half4(v_)));", + "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5))));", + "write_imageh(out, (int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5)), (convert_half4(v_)));", ], ), ) @@ -1507,8 +1507,8 @@ class TestSimpleTextureToScalarFP32: ["global.texture", (1, 1, 40, 40, 4)], ["", (1, 4, 40, 40)], [ - "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)((((int)get_local_id(0)) % 40), (((((int)get_group_id(0)) & 1) * 20) + (((int)get_local_id(0)) / 40)))));", - "out[((((int)get_group_id(0)) * 800) + ((int)get_local_id(0)))] = ((float*)&v_)[(((int)get_group_id(0)) >> 1)];", + "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));", + "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((float*)&v_)[((convert_int(get_group_id(0))) >> 1)];", ], ), # 2. Buffer (NCHW4c) -> Buffer (NCHW) @@ -1516,7 +1516,7 @@ class TestSimpleTextureToScalarFP32: ["", (1, 1, 40, 40, 4)], ["", (1, 4, 40, 40)], [ - "out[((((int)get_group_id(0)) * 800) + ((int)get_local_id(0)))] = p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))];" + "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))];" ], ), ) @@ -1619,8 +1619,8 @@ class TestTextureToScalarReuseSSAFP16: ["global.texture", (1, 1, 40, 40, 4)], ["", (1, 4, 40, 40)], [ - "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)((((int)get_local_id(0)) % 40), (((((int)get_group_id(0)) & 1) * 20) + (((int)get_local_id(0)) / 40)))));", - "out_sum[((((int)get_group_id(0)) * 800) + ((int)get_local_id(0)))] = (((half)((float*)&v_)[(((int)get_group_id(0)) >> 1)]) + (((half)((float*)&v_)[(((int)get_group_id(0)) >> 1)]) + ((half)((float*)&v_)[(((int)get_group_id(0)) >> 1)])));", + "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));", + "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)])) + ((convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)])) + (convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]))));", ], ), # 2. Buffer (NCHW4c) -> Cast(FP16) -> Buffer (NCHW) @@ -1628,7 +1628,7 @@ class TestTextureToScalarReuseSSAFP16: ["", (1, 1, 40, 40, 4)], ["", (1, 4, 40, 40)], [ - "out_sum[((((int)get_group_id(0)) * 800) + ((int)get_local_id(0)))] = (((half)p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))]) + (((half)p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))]) + ((half)p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))])));" + " out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))])) + ((convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))])) + (convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]))));" ], ), # 3. Texture (NCHW4c) -> Cast(FP16) -> Texture (NCHW4c) @@ -1636,8 +1636,8 @@ class TestTextureToScalarReuseSSAFP16: ["global.texture", (1, 1, 40, 40, 4)], ["global.texture", (1, 1, 40, 40, 4)], [ - "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)((((((int)get_group_id(0)) * 24) + ((int)get_local_id(0))) % 40), (((((int)get_group_id(0)) * 8) + (((int)get_local_id(0)) >> 3)) / 5))));", - "write_imageh(out_sum, (int2)((((((int)get_group_id(0)) * 24) + ((int)get_local_id(0))) % 40), (((((int)get_group_id(0)) * 8) + (((int)get_local_id(0)) >> 3)) / 5)), ((convert_half4(v_)) + ((convert_half4(v_)) + (convert_half4(v_)))));", + "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5))));", + "write_imageh(out_sum, (int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5)), ((convert_half4(v_)) + ((convert_half4(v_)) + (convert_half4(v_)))));", ], ), ) @@ -1660,8 +1660,8 @@ class TestTextureToScalarReuseSSAFP32: ["global.texture", (1, 1, 40, 40, 4)], ["", (1, 4, 40, 40)], [ - "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)((((int)get_local_id(0)) % 40), (((((int)get_group_id(0)) & 1) * 20) + (((int)get_local_id(0)) / 40)))));", - "out_sum[((((int)get_group_id(0)) * 800) + ((int)get_local_id(0)))] = (((float*)&v_)[(((int)get_group_id(0)) >> 1)] + (((float*)&v_)[(((int)get_group_id(0)) >> 1)] + ((float*)&v_)[(((int)get_group_id(0)) >> 1)]));", + "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));", + "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (((float*)&v_)[((convert_int(get_group_id(0))) >> 1)] + (((float*)&v_)[((convert_int(get_group_id(0))) >> 1)] + ((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]));", ], ), # 2. Buffer (NCHW4c) -> Buffer (NCHW) @@ -1669,7 +1669,7 @@ class TestTextureToScalarReuseSSAFP32: ["", (1, 1, 40, 40, 4)], ["", (1, 4, 40, 40)], [ - "out_sum[((((int)get_group_id(0)) * 800) + ((int)get_local_id(0)))] = (p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))] + (p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))] + p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))]));" + "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))] + (p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))] + p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]));" ], ), ) @@ -1693,10 +1693,10 @@ class TestLocalArrayToTexture: (1, 2, 38, 38, 4), [ "float out_local[4];", - "float4 v_ = READ_IMAGEF(p1_comp, image_sampler, ((int2)((((((int)get_group_id(0)) * 14) + ((int)get_local_id(0))) % 38), ((((((int)get_group_id(0)) * 64) + (((int)get_local_id(0)) >> 1)) % 722) / 19))));", - "float4 v__1 = READ_IMAGEF(p2_comp, image_sampler, ((int2)(rw, ((((((((int)get_group_id(0)) * 32) + (((int)get_local_id(0)) >> 2)) / 361) * 12) + (rcb * 3)) + rh))));", + "float4 v_ = READ_IMAGEF(p1_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 14) + (convert_int(get_local_id(0)))) % 38), (((((convert_int(get_group_id(0))) * 64) + ((convert_int(get_local_id(0))) >> 1)) % 722) / 19))));", + "float4 v__1 = READ_IMAGEF(p2_comp, image_sampler, ((int2)(rw, (((((((convert_int(get_group_id(0))) * 32) + ((convert_int(get_local_id(0))) >> 2)) / 361) * 12) + (rcb * 3)) + rh))));", "out_local[cb_c] = (out_local[cb_c] + (((float*)&v_)[rcb] * ((float*)&v__1)[cb_c]));", - "write_imagef(out, (int2)((((((int)get_group_id(0)) * 14) + ((int)get_local_id(0))) % 38), (((((int)get_group_id(0)) * 64) + (((int)get_local_id(0)) >> 1)) / 19)), vload4(0, out_local + 0));", + "write_imagef(out, (int2)(((((convert_int(get_group_id(0))) * 14) + (convert_int(get_local_id(0)))) % 38), ((((convert_int(get_group_id(0))) * 64) + ((convert_int(get_local_id(0))) >> 1)) / 19)), vload4(0, out_local + 0));", ], ), )