diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py index a537fa1e7b90..9429fd71e1d9 100644 --- a/python/tvm/relay/op/strategy/adreno.py +++ b/python/tvm/relay/op/strategy/adreno.py @@ -36,8 +36,10 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): raise ValueError("dilation should be positive value") if groups == 1: - if (data_layout == "NCHW" and kernel_layout == "OIHW") or ( - data_layout == "NCHW4c" and kernel_layout == "OIHW4o" + if ( + (data_layout == "NCHW" and kernel_layout == "OIHW") + or (data_layout == "NCHW4c" and kernel_layout == "OIHW4o") + or (data_layout == "NCHW" and kernel_layout == "OIHW4o") ): if len(kernel.shape) == 4: _, _, kh, kw = get_const_tuple(kernel.shape) @@ -47,35 +49,24 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): (2 < kh < 8 and 2 < kw < 8 and kh == kw) and (stride_h == 1 and stride_w == 1) and (dilation_h == 1 and dilation_w == 1) + and not (data_layout == "NCHW" and kernel_layout == "OIHW4o") ): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd), - name="conv2d_nchw_winograd.image2d", - plevel=5, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_acc32), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd_acc32), - name="conv2d_nchw_winograd_acc32.image2d", - plevel=7, - ) - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchwc), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc), - name="conv2d_nchwc.image2d", - plevel=10, + wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.image2d", + plevel=5, ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchwc_acc32), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc_acc32), - name="conv2d_nchwc_acc32.image2d", - plevel=20, + wrap_compute_conv2d(topi.adreno.conv2d_nchwc), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc), + name="conv2d_nchwc.image2d", + plevel=10, ) - elif (data_layout == "NHWC" and kernel_layout == "HWIO") or ( - data_layout == "NHWC4c" and kernel_layout == "HWIO4o" + elif ( + (data_layout == "NHWC" and kernel_layout == "HWIO") + or (data_layout == "NHWC4c" and kernel_layout == "HWIO4o") + or (data_layout == "NHWC" and kernel_layout == "HWIO4o") ): if len(kernel.shape) == 4: kh, kw, _, _ = get_const_tuple(kernel.shape) @@ -85,32 +76,19 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): (2 < kh < 8 and 2 < kw < 8 and kh == kw) and (stride_h == 1 and stride_w == 1) and (dilation_h == 1 and dilation_w == 1) + and not (data_layout == "NHWC" and kernel_layout == "HWIO4o") ): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd), - name="conv2d_nhwc_winograd.image2d", - plevel=5, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_acc32), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd_acc32), - name="conv2d_nhwc_winograd_acc32.image2d", - plevel=7, - ) - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc), - name="conv2d_nhwc.image2d", - plevel=10, + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd), + name="conv2d_nhwc_winograd.image2d", + plevel=5, ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_acc32), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_acc32), - name="conv2d_nhwc_acc32.image2d", - plevel=20, + wrap_compute_conv2d(topi.adreno.conv2d_nhwc), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc), + name="conv2d_nhwc.image2d", + plevel=10, ) else: raise RuntimeError( @@ -149,35 +127,21 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): if (data_layout == "NCHW" and kernel_layout == "OIHW") or ( data_layout == "NCHW4c" and kernel_layout == "OIHW4o" ): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc), - wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc), - name="depthwise_conv2d_nchwc.image2d", - plevel=10, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc_acc32), - wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc_acc32), - name="depthwise_conv2d_nchwc_acc32.image2d", - plevel=20, + wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc), + wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc), + name="depthwise_conv2d_nchwc.image2d", + plevel=10, ) elif (data_layout == "NHWC" and kernel_layout == "HWOI") or ( data_layout == "NHWC4c" and kernel_layout == "HWOI4o" ): if data.shape[-1] >= 4: - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc), - wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc), - name="depthwise_conv2d_nhwc.image2d", - plevel=10, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc_acc32), - wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc_acc32), - name="depthwise_conv2d_nhwc_acc32.image2d", - plevel=20, + wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.image2d", + plevel=10, ) else: strategy.add_implementation( @@ -208,40 +172,18 @@ def conv2d_winograd_without_weight_transfrom_strategy_adreno(attrs, inputs, out_ assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout in ("NCHW", "NCHW4c"): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform), - wrap_topi_schedule( - topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform - ), - name="conv2d_nchw_winograd_without_weight_transform.image2d", - plevel=5, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform_acc32), - wrap_topi_schedule( - topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform_acc32 - ), - name="conv2d_nchw_winograd_without_weight_transform_acc32.image2d", - plevel=7, + wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform), + name="conv2d_nchw_winograd_without_weight_transform.image2d", + plevel=5, ) elif layout in ("NHWC", "NHWC4c"): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform), - wrap_topi_schedule( - topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform - ), - name="conv2d_nhwc_winograd_without_weight_transform.image2d", - plevel=5, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform_acc32), - wrap_topi_schedule( - topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform_acc32 - ), - name="conv2d_nhwc_winograd_without_weight_transform_acc32.image2d", - plevel=7, + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform), + name="conv2d_nhwc_winograd_without_weight_transform.image2d", + plevel=5, ) else: raise RuntimeError( diff --git a/python/tvm/topi/adreno/conv2d_alter_op.py b/python/tvm/topi/adreno/conv2d_alter_op.py index 16573991e09c..6cf749a62b27 100644 --- a/python/tvm/topi/adreno/conv2d_alter_op.py +++ b/python/tvm/topi/adreno/conv2d_alter_op.py @@ -304,7 +304,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): num_filter_block = 4 # no support yet for tensors that cannot be divisible by factor 4 - if in_channel_block != 4 or num_filter_block != 4: + if num_filter_block != 4: return None batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) @@ -312,16 +312,22 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): # update new attrs new_attrs["channels"] = out_channel - new_attrs["data_layout"] = "NCHW%dc" % in_channel_block + if in_channel_block == 4: + new_attrs["data_layout"] = "NCHW%dc" % in_channel_block + else: + new_attrs["data_layout"] = "NCHW" # (oc, ic, h, w) -> (OC, ic, h, w, oc) new_attrs["kernel_layout"] = "OIHW%do" % num_filter_block new_attrs["out_layout"] = "NCHW%dc" % num_filter_block # Store altered operator's config for applying of tuned AutoTVM statistics - new_data = te.placeholder( - (batch_size, in_channel // in_channel_block, height, width, in_channel_block), - dtype=data_dtype, - ) + if in_channel_block == 4: + new_data = te.placeholder( + (batch_size, in_channel // in_channel_block, height, width, in_channel_block), + dtype=data_dtype, + ) + else: + new_data = data_tensor new_kernel = te.placeholder( (out_channel // num_filter_block, in_filter_channel, kh, kw, num_filter_block), dtype=kernel_tensor.dtype, @@ -361,12 +367,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): num_filter_block = 4 # no support yet for tensors cannot be divisible by factor 4 - if in_channel_block != 4 or num_filter_block != 4: + if num_filter_block != 4: return None # update new attrs new_attrs["channels"] = out_channles - new_attrs["data_layout"] = "NHWC%dc" % in_channel_block + if in_channel_block == 4: + new_attrs["data_layout"] = "NHWC%dc" % in_channel_block + else: + new_attrs["data_layout"] = "NHWC" # (h, w, ic, oc) -> (h, w, ic, OC, oc) if kernel_layout == "HWIO": new_attrs["kernel_layout"] = "HWIO%do" % num_filter_block @@ -375,16 +384,19 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs["out_layout"] = "NHWC%dc" % num_filter_block # Store altered operator's config for applying of tuned AutoTVM statistics - new_data = te.placeholder( - ( - batch_size, - in_height, - in_width, - in_channels // in_channel_block, - in_channel_block, - ), - dtype=data_dtype, - ) + if in_channel_block == 4: + new_data = te.placeholder( + ( + batch_size, + in_height, + in_width, + in_channels // in_channel_block, + in_channel_block, + ), + dtype=data_dtype, + ) + else: + new_data = data_tensor if kernel_layout == "HWIO": new_kernel = te.placeholder( ( diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py index 65cd8e0150a8..082f71364af8 100644 --- a/python/tvm/topi/adreno/conv2d_nchw.py +++ b/python/tvm/topi/adreno/conv2d_nchw.py @@ -33,48 +33,22 @@ ) -@autotvm.register_topi_compute("conv2d_nchwc.image2d") -def conv2d_nchwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float16"} - return compute_conv2d_NCHWc_KCRSk( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - -@autotvm.register_topi_compute("conv2d_nchwc_acc32.image2d") -def conv2d_nchwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float32"} - return compute_conv2d_NCHWc_KCRSk( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - @autotvm.register_topi_schedule("conv2d_nchwc.image2d") def schedule_conv2d_nchwc(cfg, outs): - return schedule_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("conv2d_nchwc_acc32.image2d") -def schedule_conv2d_nchwc_acc32(cfg, outs): - return schedule_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc32") - - -def schedule_conv2d_nchwc_impl(cfg, outs, tag): """Create the schedule for conv2d_nchw""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == tag: + if op.tag == "adreno_conv2d_latest_op": schedule_conv2d_NCHWc_KCRSk(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype, args): +@autotvm.register_topi_compute("conv2d_nchwc.image2d") +def conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype): """ Convolution operator in NCHWc layout. Algo: @@ -109,18 +83,12 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty convert_from4d = False if len(Input.shape) == 4: batch, in_channels, in_height, in_width = Input.shape - out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape - in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4) - out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) if autotvm.GLOBAL_SCOPE.in_tuning: dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block) Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder") - kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block) - Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") else: - convert_from4d = True Input = pack_input( Input, "NCHW", @@ -131,6 +99,18 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty in_height, in_width, ) + else: + batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape + + if len(Filter.shape) == 4: + out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) + + if autotvm.GLOBAL_SCOPE.in_tuning: + kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block) + Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") + else: + convert_from4d = True Filter = pack_filter( Filter, "OIHW", @@ -144,9 +124,7 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty kernel_h, kernel_w, ) - else: - batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block = Filter.shape out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions( @@ -178,7 +156,7 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty ( temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] * Filter[ffc, rcc * in_channel_block + rcb, ry, rx, ffb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[rcc, rcb, ry, rx], ), tag="conv2d_nchwc", @@ -193,13 +171,13 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty return te.compute( (batch, out_channles, out_height_orig, out_width_orig), lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block], - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_conv2d_latest_op", ) else: return te.compute( (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block), lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype), - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_conv2d_latest_op", ) @@ -234,6 +212,20 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): conv = output.op.input_tensors[0] latest_blocked = latest + pad_data, kernel = s[conv].op.input_tensors + filter_pack_rt = bool( + isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + ) + + if "pad_temp" in pad_data.op.name: + input_pad_temp = pad_data.op.input_tensors[0] + else: + input_pad_temp = pad_data + + input_pack_rt = bool( + isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag + ) + ##### space definition begin ##### n, fc, y, x, fb = s[conv].op.axis rcc, rcb, ry, rx = s[conv].op.reduce_axis @@ -274,37 +266,40 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): ##### space definition end ##### pad_data, kernel = s[conv].op.input_tensors - if ( - isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag - ): # len(latest.op.axis) == 4: - # manage scheduling of datacopy - pad_data, kernel = s[conv].op.input_tensors - if "pad_temp" in pad_data.op.name: - pack_data = pad_data.op.input_tensors[0] - bind_data_copy(s[pack_data]) + # There are several conditions that have to be handled: + # 1. If we are in the tuning, we always add cache read for data to main conv kernel + # to get texture in tuning opencl kernel + # 2. If we are repacking input in runtime, we should always explicit schedule this one more + # stage of data copy from 4d to 5d (referred as pack_data). + # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the + # cache_read("texture") + if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt: + if autotvm.GLOBAL_SCOPE.in_tuning: + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() else: - bind_data_copy(s[pad_data]) - bind_data_copy(s[kernel]) - - pad_data, kernel = s[conv].op.input_tensors + if "pad_temp" in pad_data.op.name: + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + s[pad_data].compute_inline() + else: + pack_data = pad_data + bind_data_copy(s[pack_data]) - if ( - autotvm.GLOBAL_SCOPE.in_tuning - or isinstance(kernel.op, tvm.te.ComputeOp) - and "filter_pack" in kernel.op.tag - ): - if "pad_temp" in pad_data.op.name: - s[pad_data].compute_inline() AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[WT]) elif "pad_temp" in pad_data.op.name: s[pad_data].compute_inline() # create cache stage AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) + if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt: + if not autotvm.GLOBAL_SCOPE.in_tuning: + bind_data_copy(s[kernel]) + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) + s[conv].set_scope("local") if latest_blocked == latest and output != latest: s[output].compute_inline() diff --git a/python/tvm/topi/adreno/conv2d_nchw_winograd.py b/python/tvm/topi/adreno/conv2d_nchw_winograd.py index 16f7cb8b19d9..0ddc0e7f2c0d 100644 --- a/python/tvm/topi/adreno/conv2d_nchw_winograd.py +++ b/python/tvm/topi/adreno/conv2d_nchw_winograd.py @@ -27,62 +27,32 @@ @autotvm.register_topi_compute("conv2d_nchw_winograd.image2d") def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): - args = {"shared": False, "accumulator": "float16"} return conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False - ) - - -@autotvm.register_topi_compute("conv2d_nchw_winograd_acc32.image2d") -def conv2d_nchw_winograd_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype): - args = {"shared": False, "accumulator": "float32"} - return conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False ) @autotvm.register_topi_schedule("conv2d_nchw_winograd.image2d") def schedule_conv2d_nchw_winograd(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("conv2d_nchw_winograd_acc32.image2d") -def schedule_conv2d_nchw_winograd_acc32(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32") + return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at") @autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform.image2d") def conv2d_nchw_winograd_without_weight_transform( cfg, data, kernel, strides, padding, dilation, out_dtype ): - args = {"shared": False, "accumulator": "float16"} return conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True - ) - - -@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform_acc32.image2d") -def conv2d_nchw_winograd_without_weight_transform_acc32( - cfg, data, kernel, strides, padding, dilation, out_dtype -): - args = {"shared": False, "accumulator": "float32"} - return conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True ) @autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.image2d") def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16", pre_computed=True) - - -@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform_acc32.image2d") -def schedule_conv2d_nchw_winograd_without_weight_transform_acc32(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32", pre_computed=True) + return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at", pre_computed=True) def conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed ): """Compute declaration for winograd @@ -111,9 +81,6 @@ def conv2d_nchw_winograd_comp( out_dtype: str The output type. This is used for mixed precision. - args: dict - Dictionary with additional arguments, e.g. accumulator type - pre_computed: bool Flag if weights were pre computed if true or the weights should be computed in runtime @@ -124,5 +91,5 @@ def conv2d_nchw_winograd_comp( 4-D or 5-D with shape NCHW or NCHW4c """ return conv2d_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, "NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, "NCHW" ) diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py index b377169ca8c9..993b63252531 100644 --- a/python/tvm/topi/adreno/conv2d_nhwc.py +++ b/python/tvm/topi/adreno/conv2d_nhwc.py @@ -33,44 +33,22 @@ ) -@autotvm.register_topi_compute("conv2d_nhwc.image2d") -def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float16"} - return compute_conv2d_NHWC_HWIO(data, kernel, strides, padding, dilation, out_dtype, args=args) - - -@autotvm.register_topi_compute("conv2d_nhwc_acc32.image2d") -def conv2d_nhwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float32"} - return compute_conv2d_NHWC_HWIO(data, kernel, strides, padding, dilation, out_dtype, args=args) - - @autotvm.register_topi_schedule("conv2d_nhwc.image2d") def schedule_conv2d_nhwc(cfg, outs): - return schedule_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("conv2d_nhwc_acc32.image2d") -def schedule_conv2d_nhwc_acc32(cfg, outs): - return schedule_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc32") - - -def schedule_conv2d_nhwc_impl(cfg, outs, tag): """Create the schedule for conv2d_nhwc""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == tag: + if op.tag == "adreno_conv2d_latest_op": schedule_conv2d_NHWC(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype, args): +@autotvm.register_topi_compute("conv2d_nhwc.image2d") +def conv2d_nhwc(cfg, Input, Filter, stride, padding, dilation, out_dtype): """ Convolution operator in NHWC layout. Algo: @@ -105,18 +83,12 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype convert_from4d = False if len(Input.shape) == 4: batch, in_height, in_width, in_channels = Input.shape - kernel_h, kernel_w, in_filter_channels, out_channles = Filter.shape - in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4) - out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) if autotvm.GLOBAL_SCOPE.in_tuning: dshape = (batch, in_height, in_width, in_channel_chunks, in_channel_block) Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder") - kshape = (kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block) - Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") else: - convert_from4d = True Input = pack_input( Input, "NHWC", @@ -127,6 +99,17 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype in_height, in_width, ) + else: + batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape + + if len(Filter.shape) == 4: + kernel_h, kernel_w, in_filter_channels, out_channles = Filter.shape + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) + if autotvm.GLOBAL_SCOPE.in_tuning: + kshape = (kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block) + Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") + else: + convert_from4d = True Filter = pack_filter( Filter, "HWIO", @@ -140,9 +123,7 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype kernel_h, kernel_w, ) - else: - batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block = Filter.shape out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions( @@ -173,7 +154,7 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype ( temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcc, rcb] * Filter[ry, rx, rcc * in_channel_block + rcb, fc, fb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[ry, rx, rcc, rcb], ), tag="conv2d_nhwc", @@ -188,13 +169,13 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype return te.compute( (batch, out_height_orig, out_width_orig, out_channles), lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block], - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_conv2d_latest_op", ) else: return te.compute( (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block), lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype), - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_conv2d_latest_op", ) @@ -229,6 +210,19 @@ def schedule_conv2d_NHWC(cfg, s, output): conv = output.op.input_tensors[0] latest_blocked = latest + pad_data, kernel = s[conv].op.input_tensors + filter_pack_rt = bool( + isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + ) + + if "pad_temp" in pad_data.op.name: + input_pad_temp = pad_data.op.input_tensors[0] + else: + input_pad_temp = pad_data + + input_pack_rt = bool( + isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag + ) ##### space definition begin ##### n, y, x, fc, fb = s[conv].op.axis ry, rx, rcc, rcb = s[conv].op.reduce_axis @@ -270,37 +264,40 @@ def schedule_conv2d_NHWC(cfg, s, output): ##### space definition end ##### pad_data, kernel = s[conv].op.input_tensors - if ( - isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag - ): # len(latest.op.axis) == 4: - # manage scheduling of datacopy - pad_data, kernel = s[conv].op.input_tensors - if "pad_temp" in pad_data.op.name: - pack_data = pad_data.op.input_tensors[0] - bind_data_copy(s[pack_data]) + # There are several conditions that have to be handled: + # 1. If we are in the tuning, we always add cache read for data to main conv kernel + # to get texture in tuning opencl kernel + # 2. If we are repacking input in runtime, we should always explicit schedule this one more + # stage of data copy from 4d to 5d (referred as pack_data). + # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the + # cache_read("texture") + if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt: + if autotvm.GLOBAL_SCOPE.in_tuning: + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() else: - bind_data_copy(s[pad_data]) - bind_data_copy(s[kernel]) - - pad_data, kernel = s[conv].op.input_tensors + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + else: + pack_data = pad_data + bind_data_copy(s[pack_data]) - if ( - autotvm.GLOBAL_SCOPE.in_tuning - or isinstance(kernel.op, tvm.te.ComputeOp) - and "filter_pack" in kernel.op.tag - ): - if "pad_temp" in pad_data.op.name: - s[pad_data].compute_inline() AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[WT]) elif "pad_temp" in pad_data.op.name: s[pad_data].compute_inline() # create cache stage AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) + if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt: + if not autotvm.GLOBAL_SCOPE.in_tuning: + bind_data_copy(s[kernel]) + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) + s[conv].set_scope("local") if latest_blocked == latest and output != latest: s[output].compute_inline() diff --git a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py index bfe385f210a4..b055b388e1a7 100644 --- a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py +++ b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py @@ -27,62 +27,32 @@ @autotvm.register_topi_compute("conv2d_nhwc_winograd.image2d") def conv2d_nhwc_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): - args = {"shared": False, "accumulator": "float16"} return conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False - ) - - -@autotvm.register_topi_compute("conv2d_nhwc_winograd_acc32.image2d") -def conv2d_nhwc_winograd_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype): - args = {"shared": False, "accumulator": "float32"} - return conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False ) @autotvm.register_topi_schedule("conv2d_nhwc_winograd.image2d") def schedule_conv2d_nhwc_winograd(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("conv2d_nhwc_winograd_acc32.image2d") -def schedule_conv2d_nhwc_winograd_acc32(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32") + return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at") @autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform.image2d") def conv2d_nhwc_winograd_without_weight_transform( cfg, data, kernel, strides, padding, dilation, out_dtype ): - args = {"shared": False, "accumulator": "float16"} return conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True - ) - - -@autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform_acc32.image2d") -def conv2d_nhwc_winograd_without_weight_transform_acc32( - cfg, data, kernel, strides, padding, dilation, out_dtype -): - args = {"shared": False, "accumulator": "float32"} - return conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True ) @autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform.image2d") def schedule_conv2d_nhwc_winograd_without_weight_transform(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16", pre_computed=True) - - -@autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform_acc32.image2d") -def schedule_conv2d_nhwc_winograd_without_weight_transform_acc32(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32", pre_computed=True) + return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at", pre_computed=True) def conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed ): """Compute declaration for winograd @@ -111,9 +81,6 @@ def conv2d_nhwc_winograd_comp( out_dtype: str The output type. This is used for mixed precision. - args: dict - Dictionary with additional arguments, e.g. accumulator type - pre_computed: bool Flag if weights were pre computed if true or the weights should be computed in runtime @@ -124,5 +91,5 @@ def conv2d_nhwc_winograd_comp( 4-D or 5-D with shape NCHW or NCHW4c """ return conv2d_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, "NHWC" + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, "NHWC" ) diff --git a/python/tvm/topi/adreno/conv2d_winograd_common.py b/python/tvm/topi/adreno/conv2d_winograd_common.py index b0cec0f70280..501773ad46fa 100644 --- a/python/tvm/topi/adreno/conv2d_winograd_common.py +++ b/python/tvm/topi/adreno/conv2d_winograd_common.py @@ -35,7 +35,7 @@ def conv2d_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, layout + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, layout ): """Compute declaration for winograd @@ -64,9 +64,6 @@ def conv2d_winograd_comp( out_dtype: str The output type. This is used for mixed precision. - args: dict - Dictionary with additional arguments, e.g. accumulator type - pre_computed: bool Flag if weights were pre computed if true or the weights should be computed in runtime @@ -186,7 +183,7 @@ def conv2d_winograd_comp( r = KW m = tile_size - A, B, G = winograd_transform_matrices(m, r, out_dtype) + A, B, G = winograd_transform_matrices(m, r, data.dtype) H = (H + pt + pb - KH) // HSTR + 1 W = (W + pl + pr - KW) // WSTR + 1 @@ -268,7 +265,7 @@ def conv2d_winograd_comp( lambda eps, nu, co, p, cob: te.sum( ( kernel_pack[eps][nu][ci * CB + cb][co][cob] * data_pack_trans[eps][nu][ci][p][cb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[ci, cb], ), name="bgemm", @@ -280,7 +277,7 @@ def conv2d_winograd_comp( inverse = te.compute( (CO, P, m, m, COB), lambda co, p, vh, vw, cob: te.sum( - bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(args["accumulator"]), + bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(out_dtype), axis=[r_a, r_b], ), name="inverse", @@ -295,7 +292,7 @@ def conv2d_winograd_comp( idxmod(h, m) ][idxmod(w, m)][c % CB].astype(out_dtype), name="output", - tag="cast_from_acc" + args["accumulator"][-2:], + tag="dummy_compute_at", ) else: output = te.compute( @@ -304,7 +301,7 @@ def conv2d_winograd_comp( n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m) ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype), name="output", - tag="cast_from_acc" + args["accumulator"][-2:], + tag="dummy_compute_at", ) else: if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False: @@ -314,7 +311,7 @@ def conv2d_winograd_comp( idxmod(h, m) ][idxmod(w, m)][c % CB].astype(out_dtype), name="output", - tag="cast_from_acc" + args["accumulator"][-2:], + tag="dummy_compute_at", ) else: output = te.compute( @@ -323,7 +320,7 @@ def conv2d_winograd_comp( n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m) ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype), name="output", - tag="cast_from_acc" + args["accumulator"][-2:], + tag="dummy_compute_at", ) if isinstance(N, int): diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py index 37713b4584b9..eb998bdbcd6e 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py @@ -33,50 +33,22 @@ ) -@autotvm.register_topi_compute("depthwise_conv2d_nchwc.image2d") -def depthwise_conv2d_nchwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute depthwise_conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float16"} - return compute_depthwise_conv2d_NCHWc_KCRSk( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - -@autotvm.register_topi_compute("depthwise_conv2d_nchwc_acc32.image2d") -def depthwise_conv2d_nchwc_acc32( - cfg, data, kernel, strides, padding, dilation, out_dtype="float16" -): - """Compute depthwise_conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float32"} - return compute_depthwise_conv2d_NCHWc_KCRSk( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - @autotvm.register_topi_schedule("depthwise_conv2d_nchwc.image2d") def schedule_depthwise_conv2d_nchwc(cfg, outs): - return schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("depthwise_conv2d_nchwc_acc32.image2d") -def schedule_depthwise_conv2d_nchwc_acc32(cfg, outs): - return schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc32") - - -def schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag): """Create the schedule for depthwise conv2d_nchw4c_ohwi4o""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == tag: + if op.tag == "adreno_dw_conv2d_latest_op": schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype, args): +@autotvm.register_topi_compute("depthwise_conv2d_nchwc.image2d") +def depthwise_conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype): """ Depthwise convolution operator in NCHWc layout. Algo: @@ -183,10 +155,10 @@ def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilatio ffb, ] * Filter[ffc // in_filter_channels, ffc % in_filter_channels, ry, rx, ffb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[ry, rx], ), - tag="depthwise_conv2d_nchwc_kcrsk", + tag="depthwise_conv2d_nchwc", ) if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning: @@ -198,13 +170,13 @@ def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilatio return te.compute( (batch, out_channles, out_height_orig, out_width_orig), lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block], - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_dw_conv2d_latest_op", ) else: return te.compute( (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block), lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype), - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_dw_conv2d_latest_op", ) diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py index 2b228b444fca..c27f2a9eae7c 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py @@ -33,48 +33,22 @@ ) -@autotvm.register_topi_compute("depthwise_conv2d_nhwc.image2d") -def depthwise_conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute depthwise_conv2d with NHWC layout""" - args = {"shared": False, "accumulator": "float16"} - return compute_depthwise_conv2d_NHWC_HWOI( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - -@autotvm.register_topi_compute("depthwise_conv2d_nhwc_acc32.image2d") -def depthwise_conv2d_nhwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute depthwise_conv2d with NHWC layout""" - args = {"shared": False, "accumulator": "float32"} - return compute_depthwise_conv2d_NHWC_HWOI( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - @autotvm.register_topi_schedule("depthwise_conv2d_nhwc.image2d") def schedule_depthwise_conv2d_nhwc(cfg, outs): - return schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("depthwise_conv2d_nhwc_acc32.image2d") -def schedule_depthwise_conv2d_nhwc_acc32(cfg, outs): - return schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc32") - - -def schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag): """Create the schedule for depthwise conv2d_nchw4c_ohwi4o""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == tag: + if op.tag == "adreno_dw_conv2d_latest_op": schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, out_dtype, args): +@autotvm.register_topi_compute("depthwise_conv2d_nhwc.image2d") +def depthwise_conv2d_nhwc(cfg, Input, Filter, stride, padding, dilation, out_dtype): """ Depthwise convolution operator in NCHWc layout. Algo: @@ -175,7 +149,7 @@ def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, ( temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffc, ffb] * Filter[ry, rx, ffc, 0, ffb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[ry, rx], ), tag="depthwise_conv2d_nhwc", @@ -190,13 +164,13 @@ def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, return te.compute( (batch, out_height_orig, out_width_orig, out_channles), lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block], - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_dw_conv2d_latest_op", ) else: return te.compute( (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block), lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype), - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_dw_conv2d_latest_op", ) diff --git a/tests/python/relay/test_conv2d_nchw_texture.py b/tests/python/relay/test_conv2d_nchw_texture.py index 6eadd8fc1c7a..ab12e40b39cb 100644 --- a/tests/python/relay/test_conv2d_nchw_texture.py +++ b/tests/python/relay/test_conv2d_nchw_texture.py @@ -437,7 +437,7 @@ def test_conv2d_vgg16_winograd_4d(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd_acc32.image2d", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file @@ -486,7 +486,7 @@ def test_conv2d_winograd_conv(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd_acc32.image2d", [["TENSOR", [1, 4, 3, 3], "float16"], ["TENSOR", [8, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "float16"], ["TENSOR", [8, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file diff --git a/tests/python/relay/test_conv2d_nhwc_texture.py b/tests/python/relay/test_conv2d_nhwc_texture.py index be5cefd46038..cf8116c076cc 100644 --- a/tests/python/relay/test_conv2d_nhwc_texture.py +++ b/tests/python/relay/test_conv2d_nhwc_texture.py @@ -598,7 +598,7 @@ def test_conv2d_vgg16_winograd_4d(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd_acc32.image2d", [["TENSOR", [1, 28, 28, 512], "float16"], ["TENSOR", [3, 3, 512, 512], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "float16"], ["TENSOR", [3, 3, 512, 512], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file