From 1cd86afd26e5649fd19850c5caf1c7b238ce2171 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 13 Sep 2018 14:23:14 -0700 Subject: [PATCH] [AUTOTVM] End2End autotvm support for vta (#18) * support tuning a whole network * pass unit test * update tune resnet * update all --- python/tvm/autotvm/measure/measure_methods.py | 13 +- python/tvm/autotvm/task/nnvm_integration.py | 33 +- python/tvm/target.py | 7 + src/codegen/build_module.cc | 11 +- topi/python/topi/__init__.py | 4 + topi/python/topi/util.py | 4 + vta/python/vta/__init__.py | 3 +- vta/python/vta/build_module.py | 36 ++ vta/python/vta/environment.py | 24 +- vta/python/vta/top/__init__.py | 9 +- vta/python/vta/top/arm_conv2d.py | 24 - vta/python/vta/top/op.py | 153 ++++++ vta/python/vta/top/testing/__init__.py | 3 - vta/python/vta/top/testing/vta_conv2d_test.py | 220 -------- vta/python/vta/top/vta_conv2d.py | 472 ++---------------- vta/scripts/tune_conv.py | 210 ++------ vta/scripts/tune_resnet.py | 224 +++++++++ vta/src/runtime.cc | 4 +- .../integration/test_benchmark_topi_conv2d.py | 447 ++++++++++++----- vta/tutorials/resnet.py | 15 +- 20 files changed, 874 insertions(+), 1042 deletions(-) delete mode 100644 vta/python/vta/top/arm_conv2d.py create mode 100644 vta/python/vta/top/op.py delete mode 100644 vta/python/vta/top/testing/__init__.py delete mode 100644 vta/python/vta/top/testing/vta_conv2d_test.py create mode 100644 vta/scripts/tune_resnet.py diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index f1e884d8df14b..69a309216a4e4 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -207,7 +207,18 @@ def set_task(self, task): for x in arg_bufs] func = build(s, arg_bufs, "llvm") tvm_buf = [nd.array(x) for x in self.ref_input] - func(*tvm_buf) + + def _run_func(): + """Run tvm function in a thread. + Because there is some issues with python multiprocessing and the thread pool in tvm + """ + func(*tvm_buf) + + thread = threading.Thread(target=_run_func) + thread.start() + thread.join() + del thread + self.ref_output = [x.asnumpy() for x in tvm_buf] def get_build_kwargs(self): diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index cd7337586519f..e1c039d678e03 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -5,6 +5,7 @@ """ import warnings import logging +import sys from ... import target as _target @@ -18,8 +19,7 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): """ Extract tuning tasks from a nnvm graph. - This function collects tuning tasks by building the graph - with a "tracing" target and tracing all the calls to topi. + This function collects tuning tasks by building the graph and trace all the calls to topi. 
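# Note: the set_task() change above runs the freshly built reference function on a
# short-lived worker thread instead of calling it directly, because the fork-based
# multiprocessing that autotvm uses can interact badly with TVM's internal thread
# pool. A minimal standalone sketch of the same pattern; `ref_func` and `bufs` are
# placeholder names, not identifiers from this patch.
import threading

def run_in_thread(ref_func, bufs):
    """Run ref_func(*bufs) on a worker thread and wait for it to finish."""
    worker = threading.Thread(target=lambda: ref_func(*bufs))
    worker.start()
    worker.join()

# e.g. run_in_thread(func, tvm_buf) mirrors the _run_func helper added above.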
Parameters ---------- @@ -45,7 +45,7 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): import nnvm import topi - env = TaskExtractEnv.get() + env = TaskExtractEnv(symbols) #NOTE: To add more symbols, you only need to change the following lists #nnvm symbol -> topi compute @@ -63,26 +63,23 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): else: warnings.warn("Symbol %s is not tunable, ignored" % sym_name) - # run compiler to collect all TOPI calls during compilation - env.reset(topi_funcs) + # run compiler to collect all TOPI calls during compilation + nnvm.compiler.engine.clear_cache() + nnvm.compiler.build(graph, target=target, shape=shape, dtype=dtype) + nnvm.compiler.engine.clear_cache() - # disable logger temporarily - old_state = logger.disabled - logger.disabled = True - - # use a "tracing" target to do a fake compile for collecting topi calls - tracing_target = _target.create("llvm -device=tracing") - nnvm.compiler.engine.clear_cache() - nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype) - - logger.disabled = old_state + logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): - tasks.append(create(task_name, args, - target=target, target_host=target_host, - template_key='direct')) + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("shape error") return tasks diff --git a/python/tvm/target.py b/python/tvm/target.py index 75f82743f9fae..23bec3adddda7 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -473,6 +473,13 @@ def rasp(options=None): return arm_cpu('rasp3b', options) +def vta(model='unknown', options=None): + opts = ["-device=vta", '-keys=cpu', '-model=%s' % model] + opts = _merge_opts(opts, options) + ret = _api_internal._TargetCreate("ext_dev", *opts) + return ret + + def create(target_str): """Get a target given target string. diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 859fdb2bc86c4..b2aba0ec032e8 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -39,6 +39,7 @@ Target CreateTarget(const std::string& target_name, std::string libs_flag = "-libs="; std::string device_flag = "-device="; + std::string keys_flag = "-keys="; for (auto& item : options) { t->options_array.push_back(ir::StringImm::make(item)); @@ -50,12 +51,16 @@ Target CreateTarget(const std::string& target_name, } } else if (item.find(device_flag) == 0) { t->device_name = item.substr(device_flag.length()); + t->keys_array.push_back(ir::StringImm::make(t->device_name)); + } else if (item.find(keys_flag) == 0) { + std::stringstream ss(item.substr(keys_flag.length())); + std::string key_item; + while (std::getline(ss, key_item, ',')) { + t->keys_array.push_back(ir::StringImm::make(key_item)); + } } } - if (t->device_name.length() > 0) { - t->keys_array.push_back(ir::StringImm::make(t->device_name)); - } t->device_type = kDLCPU; t->thread_warp_size = 1; if (target_name == "c" || target_name == "llvm") { diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index 2eb460d151ae5..ed03da110c1f3 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -34,6 +34,10 @@ from . import image from . import sparse from . 
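# Sketch: the new tvm.target.vta() helper above produces an "ext_dev" target whose
# -device=vta and -keys=cpu flags are parsed by the build_module.cc change into the
# target's key list, so VTA can fall back to generic CPU schedules where needed.
# Assumes the patched tvm package is importable; the model string is arbitrary.
import tvm

target = tvm.target.vta(model='ultra96')
print(str(target))            # e.g. "ext_dev -device=vta -keys=cpu -model=ultra96"
print(target.device_name)     # "vta", derived from the -device= flag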
import hls + +# some short cut +from .util import InvalidShapeError + # not import testing by default # because testing can have extra deps that are not necessary # we can import them from test cases explicitly diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index edfb0e467e1fe..9fd32827d8236 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -6,6 +6,10 @@ import tvm from . import tag +class InvalidShapeError(ValueError): + """Invalid shape for a topi function. i.e. call winograd template for non-3x3 kernel)""" + pass + def traverse_inline(s, final_op, callback): """Traverse computation graph and do auto inline diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py index 18f501f4ebb1a..e765fa39e18bb 100644 --- a/vta/python/vta/__init__.py +++ b/vta/python/vta/__init__.py @@ -18,6 +18,7 @@ # to maintain minimum dependency on the board if sys.argv[0] not in ("-c", "-m"): from . import top - from .build_module import build_config, lower, build from . import graph + + from .build_module import build_config, lower, build, vta_autotvm_build_func from .ptr_alias import reinterpret diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index a1d2299ba7aa6..0d6ee7fcf6a56 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -102,3 +102,39 @@ def build(*args, **kwargs): with build_config(): return tvm.build(*args, **kwargs) return tvm.build(*args, **kwargs) + + +def vta_autotvm_build_func(measure_input, tmp_dir, **kwargs): + """Custom build func for VTA. Used for autotvm""" + + import time + import os + from random import getrandbits + from tvm.autotvm.util import get_const_tuple + from tvm.autotvm.measure.measure_methods import BuildResult, InstantiationError + + tic = time.time() + try: + filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) + target, task, config = measure_input + + with target: + s, args = task.instantiate(config) + if not config.valid(): + raise InstantiationError(config.errors) + + func = build(s, args, target_host=task.target_host) + func2 = build(s, args) + + arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args) + func.export_library(filename) + + # check by local simulator + ctx = tvm.context(str(target)) + args = [tvm.nd.empty(x[0], dtype=x[1], ctx=ctx) for x in arg_info] + func2(*args) + + except Exception as e: # pylint: disable=broad-except + return BuildResult(None, None, e, time.time() - tic) + return BuildResult(filename, arg_info, None, time.time() - tic) + diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index d8590ce74d31c..362adf69686b8 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -227,35 +227,25 @@ def gemm(self): """GEMM intrinsic""" return self.dev.gemm - # TODO get rid of it @property - def target_host(self): - """The target host""" - return "llvm " + self.llvm_triple + def target(self): + return tvm.target.vta(model=self.TARGET) @property - def target_vta_cpu(self): + def target_host(self): """The target host""" if self.TARGET == "pynq": - return "llvm -device=arm_cpu -model=pynq {}".format(self.llvm_triple) + return "llvm -target=armv7-none-linux-gnueabihf" elif self.TARGET == "ultra96": - return "llvm -device=arm_cpu -model=ultra96 {}".format(self.llvm_triple) + return "llvm -target=aarch64-linux-gnu" elif self.TARGET == "sim": return "llvm" else: raise ValueError("Unknown target %s" % self.TARGET) @property - def llvm_triple(self): - """The llvm flags for the 
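# Sketch: vta_autotvm_build_func above replaces autotvm's stock build step. It
# compiles the schedule once for the board and once more for the local functional
# simulator, runs the simulator copy as a smoke test, and returns the usual
# BuildResult. It is wired in through LocalBuilder exactly as the tuning scripts
# later in this patch do; the tracker name 'fleet' and port 9190 are just the
# values those scripts use.
from tvm import autotvm
import vta

env = vta.get_env()
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func),
    runner=autotvm.RPCRunner(env.TARGET, 'fleet', 9190,
                             number=4, repeat=3, timeout=60,
                             check_correctness=True))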
target platform""" - if self.TARGET == "pynq": - return "-target=armv7-none-linux-gnueabihf" - elif self.TARGET == "ultra96": - return "-target=aarch64-linux-gnu" - elif self.TARGET == "sim": - return "" - else: - raise ValueError("Unknown target %s" % self.TARGET) + def target_vta_cpu(self): + return tvm.target.arm_cpu(model=self.TARGET) def get_env(): diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 3e2563285434d..ac1e7c60ebdc6 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -1,11 +1,4 @@ """TVM TOPI connector, eventually most of these should go to TVM repo""" from . import vta_conv2d -from . import arm_conv2d -from . import testing - -from .bitpack import bitpack -from .vta_dense import packed_dense, schedule_packed_dense -from .vta_conv2d import packed_conv2d, schedule_packed_conv2d -from .vta_group_conv2d import packed_group_conv2d, schedule_packed_group_conv2d -from .vta_conv2d_transpose import packed_conv2d_transpose, schedule_packed_conv2d_transpose +from . import op diff --git a/vta/python/vta/top/arm_conv2d.py b/vta/python/vta/top/arm_conv2d.py deleted file mode 100644 index 506a47a4c3443..0000000000000 --- a/vta/python/vta/top/arm_conv2d.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Reuse conv2d schedule from ARM CPU""" - -import tvm - -from topi.nn import conv2d, conv2d_alter_layout -from topi import generic - -@conv2d.register(["vta"]) -def compute(*args, **kwargs): - target = tvm.target.current_target() - with tvm.target.arm_cpu(model=target.model): - return conv2d(*args, **kwargs) - -@generic.schedule_conv2d_nchw.register(["vta"]) -def schedule(*args, **kwargs): - target = tvm.target.current_target() - with tvm.target.arm_cpu(model=target.model): - return generic.schedule_conv2d_nchw(*args, **kwargs) - -@conv2d_alter_layout.register(["vta"]) -def alter(*args, **kwargs): - target = tvm.target.current_target() - with tvm.target.arm_cpu(model=target.model): - return conv2d_alter_layout(*args, **kwargs) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py new file mode 100644 index 0000000000000..34897ac837801 --- /dev/null +++ b/vta/python/vta/top/op.py @@ -0,0 +1,153 @@ +"""Namespace for supporting packed_conv2d + ewise variant of nnvm.""" +from __future__ import absolute_import as _abs + +from collections import namedtuple + +import logging + +import tvm +from tvm import autotvm +import topi + +from nnvm.top import registry as reg, OpPattern +from nnvm.top import nn as _nn + +from ..environment import get_env +from ..ptr_alias import reinterpret + +from .vta_group_conv2d import packed_group_conv2d, schedule_packed_group_conv2d +from .vta_conv2d_transpose import packed_conv2d_transpose, schedule_packed_conv2d_transpose + +@reg.register_compute("clip", level=15) +def compute_clip(attrs, inputs, _): + """ Clip operator. 
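# Sketch: compute_clip here splits clip into two element-wise stages (min, then
# max) instead of topi.clip's single stage, so each stage maps to one VTA ALU
# instruction. A quick llvm-side check that the two stages still agree with
# numpy.clip; all names below are illustrative only.
import numpy as np
import tvm

def two_stage_clip(x, a_min, a_max):
    const_min = tvm.const(a_min, x.dtype)
    const_max = tvm.const(a_max, x.dtype)
    y = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA")
    return tvm.compute(y.shape, lambda *i: tvm.max(y(*i), const_min), name="clipB")

a = tvm.placeholder((16,), name="a", dtype="int32")
b = two_stage_clip(a, 0, 127)
f = tvm.build(tvm.create_schedule(b.op), [a, b], "llvm")
x = np.random.randint(-200, 200, size=16).astype("int32")
y = tvm.nd.empty((16,), "int32")
f(tvm.nd.array(x), y)
assert np.array_equal(y.asnumpy(), np.clip(x, 0, 127))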
""" + x = inputs[0] + a_min = attrs.get_float("a_min") + a_max = attrs.get_float("a_max") + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + with tvm.tag_scope(topi.tag.ELEMWISE): + x = tvm.compute( + x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute( + x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + +# override to force partition at copy +reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) + +def is_packed_layout(layout): + """Check if layout is packed layout""" + if layout == "NCHW": + return False + if "n" in layout and "c" in layout: + return True + return False + +@reg.register_compute("conv2d", level=15) +def compute_conv2d(attrs, inputs, out): + """ 2D convolution algorithm. + """ + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs["layout"] + out_dtype = attrs['out_dtype'] + + assert dilation == (1, 1), "not support dilate now" + if is_packed_layout(layout): + if groups == 1: + assert groups == 1 + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" + inputs = list(inputs) + w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) + assert inputs[1].dtype == "int8" + + # Apply bit packing if necessary + if w_pack_factor != 1: + kshape = list(topi.util.get_const_tuple(inputs[1].shape)) + kshape[-1] *= w_pack_factor + inputs[1] = reinterpret(inputs[1], kshape, dtype=env.wgt_dtype) + + return topi.nn.conv2d(inputs[0], inputs[1], strides, padding, layout, out_dtype) + else: + return packed_group_conv2d(inputs[0], inputs[1], padding, strides, groups, out_dtype) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_conv2d(attrs, inputs, out) + + +@reg.register_schedule("conv2d", level=15) +def schedule_conv2d(attrs, outs, target): + """ 2D convolution schedule. + """ + layout = attrs["layout"] + groups = attrs.get_int('groups') + + if is_packed_layout(layout): + target = tvm.target.create(target) + if target.device_name == "vta": + if groups == 1: + return topi.generic.schedule_conv2d_nchw(outs) + else: + return schedule_packed_group_conv2d(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("not support target %s" % target) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_conv2d(attrs, outs, tvm.target.current_target()) + + +@reg.register_alter_op_layout("conv2d", level=15) +def alter_conv2d_layout(attrs, inputs, out): + layout = attrs['layout'] + if is_packed_layout(layout): + return None + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.alter_conv2d_layout(attrs, inputs, out) + + +@reg.register_compute("conv2d_transpose", level=15) +def compute_conv2d_transpose(attrs, inputs, out): + """ 2D convolution algorithm. 
+ """ + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + layout = attrs["layout"] + out_dtype = attrs['out_dtype'] + + assert dilation == (1, 1), "not support dilate now" + if is_packed_layout(layout): + return packed_conv2d_transpose(inputs[0], inputs[1], + padding, strides, + out_dtype=out_dtype) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.compute_conv2d_transpose(attrs, inputs, out) + + +@reg.register_schedule("conv2d_transpose", level=15) +def schedule_conv2d_transpose(attrs, outs, target): + """ 2D convolution schedule. + """ + layout = attrs["layout"] + + if is_packed_layout(layout): + target = tvm.target.create(target) + if target.device_name == "vta": + return schedule_packed_conv2d_transpose(outs) + elif str(target).startswith("llvm"): + return tvm.create_schedule([x.op for x in outs]) + else: + raise RuntimeError("not support target %s" % target) + + with tvm.target.arm_cpu(tvm.target.current_target().model): + return _nn.schedule_conv2d_transpose(attrs, outs, target) + diff --git a/vta/python/vta/top/testing/__init__.py b/vta/python/vta/top/testing/__init__.py deleted file mode 100644 index 0ba1567d21d25..0000000000000 --- a/vta/python/vta/top/testing/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""TVM TOPI test, used by integration tests""" - -from .vta_conv2d_test import run_vta_conv2d, my_clip \ No newline at end of file diff --git a/vta/python/vta/top/testing/vta_conv2d_test.py b/vta/python/vta/top/testing/vta_conv2d_test.py deleted file mode 100644 index 5116d1f260238..0000000000000 --- a/vta/python/vta/top/testing/vta_conv2d_test.py +++ /dev/null @@ -1,220 +0,0 @@ -import tvm -from tvm import autotvm -from tvm.contrib import util -from tvm.contrib.pickle_memoize import memoize -import topi -import topi.testing -import vta -import vta.testing -from vta.testing import simulator -import numpy as np - -import os -import json - -def _sign_extend(value, bits): - sign_bit = 1 << (bits - 1) - return (value & (sign_bit - 1)) - (value & sign_bit) - -_vector_sign_extend = np.vectorize(_sign_extend) - -def _pack(x, width): - assert(len(x.shape)==6) - assert(x.dtype=="int8") - pack_factor = 8 // width - mask = ((1 << width) - 1) - - s = x.shape - s_reshape = s[:-1] + (s[-1] // pack_factor, pack_factor) - s_pack = s[:-1] + (s[-1] // pack_factor,) - x_reshape = x.reshape(s_reshape) - x_packed = np.zeros(s_pack, dtype="int8") - for i in range(0, pack_factor): - x_packed |= (x_reshape[:,:,:,:,:,:,i] & mask) << (i * width) - - return x_packed - -def _unpack(x, width): - assert(len(x.shape)==6) - assert(x.dtype=="int8") - pack_factor = 8 // width - mask = ((1 << width) - 1) - - s = x.shape - x_unpack = np.zeros(s[:] + (pack_factor,), dtype=x.dtype) - for i in range(0, pack_factor): - x_unpack[:,:,:,:,:,:,i] = _vector_sign_extend(((x >> (i * width)) & mask), width) - - return x_unpack.reshape(s[:-1] + (s[-1] * pack_factor,)) - -@tvm.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") - return x - -def run_vta_conv2d(env, remote, wl, print_ir=False, - plan_str=None, samples=5, profileOnly=False, - skip_load_inp=False, skip_load_wgt=False, skip_load_acc=False, - 
skip_store_out=False, skip_alu=False, skip_gemm=False): - data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, - wl.height, wl.width, env.BATCH, env.BLOCK_IN) - kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, - wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) - bias_shape = (wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT, - 1, 1, env.BATCH, env.BLOCK_OUT) - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - bias = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype) - - # Handle quantized inputs (less than 8 bits) - # x_pack_factor = 1 << (3 - env.LOG_INP_WIDTH) - # data_shape_pack = data_shape[:-1] + (data_shape[-1]//x_pack_factor,) - # data_arg = tvm.placeholder( - # data_shape_pack, - # dtype="int8", name="data_arg") - # data = vta.reinterpret(data_arg, data_shape, dtype=env.inp_dtype) - - # Handle quantized kernels (less than 8 bits) - w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) - kernel_shape_pack = kernel_shape[:-1] + (kernel_shape[-1]//w_pack_factor,) - kernel_arg = tvm.placeholder( - kernel_shape_pack, - dtype="int8", name="kernel_arg") - kernel = vta.reinterpret(kernel_arg, kernel_shape, dtype=env.wgt_dtype) - - res_conv = vta.top.packed_conv2d( - data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride)) - res = topi.right_shift(res_conv, 8) - res = topi.add(res, bias) - res = my_clip(res, 0, (1 << env.OUT_WIDTH-1)-1) - res = topi.cast(res, "int8") - - # Handle quantized outputs (less than 8 bits) - # o_pack_factor = 1 << (3 - env.LOG_OUT_WIDTH) - res_shape = topi.util.get_const_tuple(res.shape) - # res_shape_pack = res_shape[:-1] + (res_shape[-1]//o_pack_factor,) - # res_arg = vta.reinterpret(res, res_shape_pack, dtype="int8") - - # To compute number of ops, use a x2 factor for FMA - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - - a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - stride = (wl.hstride, wl.wstride) - data_dtype = data.dtype - kernel_dtype = kernel.dtype - acc_dtype = env.acc_dtype - assert wl.hpad == wl.wpad - padding = wl.hpad - - # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") - def get_ref_data(): - # derive min max for input and weight types (max non inclusive) - a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) - w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) - a_np = np.random.randint( - a_min, a_max, size=a_shape).astype("int8") - w_np = np.random.randint( - w_min, w_max, size=w_shape).astype("int8") - b_np = topi.testing.conv2d_nchw_python( - a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) - return a_np, w_np, b_np - - def verify(s, check_correctness): - mod = vta.build(s, - [data, kernel_arg, bias, res], - "ext_dev", - env.target_host, name="conv2d") - temp = util.tempdir() - - mod.save(temp.relpath("conv2d.o")) - remote.upload(temp.relpath("conv2d.o")) - f = remote.load_module("conv2d.o") - # verify - ctx = remote.ext_dev(0) - # Data in original format - data_orig, kernel_orig, res_ref = get_ref_data() - bias_orig = (np.random.uniform(size=(wl.batch, wl.out_filter,)) * (1 << (env.INP_WIDTH + env.WGT_WIDTH - 2))) - bias_orig = bias_orig.astype("int32") - bias_orig = np.abs(bias_orig) - - data_packed = data_orig.reshape( - wl.batch//env.BATCH, env.BATCH, - wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, 
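# Sketch: the verify() helper here converts plain NCHW arrays into VTA's packed 6-D
# layout with a reshape + transpose (the call continues just below). A shape-only
# numpy illustration of the same transformation, assuming the stock 1x16 packing
# factors.
import numpy as np

BATCH, BLOCK_IN = 1, 16                      # assumed VTA packing factors
N, C, H, W = 1, 64, 56, 56
nchw = np.random.randint(-128, 128, size=(N, C, H, W)).astype("int8")
packed = nchw.reshape(N // BATCH, BATCH,
                      C // BLOCK_IN, BLOCK_IN,
                      H, W).transpose((0, 2, 4, 5, 1, 3))
assert packed.shape == (N // BATCH, C // BLOCK_IN, H, W, BATCH, BLOCK_IN)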
- wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) - kernel_packed = kernel_orig.reshape( - wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, - wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, - wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) - bias_packed = bias_orig.reshape( - wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, - 1, 1, env.BATCH, env.BLOCK_OUT) - - # Quantized packing - data_qpacked = _pack(data_packed, env.INP_WIDTH) - kernel_qpacked = _pack(kernel_packed, env.WGT_WIDTH) - - res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_qpacked, ctx) - kernel_arr = tvm.nd.array(kernel_qpacked, ctx) - bias_arr = tvm.nd.array(bias_packed, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=1, repeat=samples) - - # In sim mode, collect simulator runtime statistics - stats = {} - cost = None - if env.TARGET == "sim": - # Check if we're in local RPC mode (allows us to rebuild the - # runtime on the fly when varying the VTA designs) - local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) - if local_rpc: - remote.get_function("vta.simulator.profiler_clear")() - if profileOnly: - remote.get_function("vta.simulator.profiler_debug_mode")(1) - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) - else: - simulator.clear_stats() - if profileOnly: - simulator.debug_mode(1) - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - stats = simulator.stats() - else: - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - - # Check correctness - correct = False - if check_correctness: - res_unpack = res_arr.asnumpy() - res_unpack = _unpack(res_unpack.astype("int8"), env.OUT_WIDTH) - res_unpack = res_unpack.transpose( - (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width) - assert wl.hpad == wl.wpad - stride = (wl.hstride, wl.wstride) - padding = wl.hpad - res_ref = res_ref >> 8 - res_ref += bias_orig.reshape(wl.out_filter, 1, 1) - res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH-1)-1) - res_ref = res_ref.astype("int8") - correct = np.allclose(res_unpack, res_ref) - - return (correct, cost, stats) - - with vta.build_config(): - s = vta.top.schedule_packed_conv2d([res], - planStr=plan_str, - skip_load_inp=skip_load_inp, - skip_load_wgt=skip_load_wgt, - skip_load_acc=skip_load_acc, - skip_store_out=skip_store_out, - skip_alu=skip_alu, - skip_gemm=skip_gemm) - if print_ir: - print(vta.lower(s, [data, kernel_arg, bias, res], simple_mode=True)) - return verify(s, profileOnly is False) - diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 767586a998124..3e517f98e9c7a 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -1,182 +1,29 @@ """Namespace for supporting packed_conv2d + ewise variant of nnvm.""" -from __future__ import absolute_import as _abs -from collections import namedtuple - -import logging import tvm +from tvm import autotvm import topi -import re - -from nnvm.top import registry as reg, OpPattern -from nnvm.top import nn as _nn -from ..environment import get_env -from ..ptr_alias import reinterpret -from .vta_group_conv2d import packed_group_conv2d, schedule_packed_group_conv2d -from .vta_conv2d_transpose import packed_conv2d_transpose, schedule_packed_conv2d_transpose - -Workload = namedtuple("Conv2DWorkload", - ['batch', 'height', 'width', 'in_filter', 'out_filter', - 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) - 
-_SCHEDULE_STR_MAP = {} - - -def find_schedules(layer, vt_only=False, best_only=False): - """ Returns a schedule for a given a layer. - - Parameters - ---------- - layer : Workload - Convolutional layer description. - vt_only : Boolean - Produce a schedule plan with virtual threading. - best_only : Boolean - Return the "best" schedule plan. - - Returns - ------- - fil_sched : list - List of valid schedules. - - """ - # pylint: disable=too-many-nested-blocks - env = get_env() - - # Helper function to get factors - def _find_factors(n): - factors = [] - for f in range(1, n + 1): - if n % f == 0: - factors.append(f) - return factors - - def _get_data_movement_byte(schedule, layer): - """ Estimate data movement in bytes for the schedule plan - """ - env = get_env() - b_f = schedule.b_factor - h_f = schedule.h_factor - w_f = schedule.w_factor - ci_f = schedule.ic_factor - co_f = schedule.oc_factor - # Derive data movement - inp_elem_sizeb = env.BATCH * env.BLOCK_IN * env.INP_WIDTH - wgt_elem_sizeb = env.BLOCK_IN * env.BLOCK_OUT * env.WGT_WIDTH - out_elem_sizeb = env.BATCH * env.BLOCK_OUT * env.OUT_WIDTH - input_tile_elems = b_f * \ - ((h_f - 1) * layer.hstride + layer.hkernel) * \ - ((w_f - 1) * layer.wstride + layer.wkernel) * ci_f - weight_tile_elems = layer.hkernel * layer.wkernel * ci_f - output_tile_elems = b_f * h_f * w_f * co_f - # Derive tiling factors - b_factor = layer.batch // (b_f * env.BATCH) - h_factor = (layer.height // layer.hstride) // h_f - w_factor = (layer.width // layer.wstride) // w_f - ci_factor = layer.in_filter // (ci_f * env.BLOCK_IN) - co_factor = layer.out_filter // (co_f * env.BLOCK_OUT) - # Compute input transaction count - input_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor - weight_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor - output_xfers = b_factor * h_factor * w_factor * co_factor - # Compute total transfer sizes - input_xfer_byte = input_tile_elems * input_xfers * inp_elem_sizeb // 8 - weight_xfer_byte = weight_tile_elems * weight_xfers * wgt_elem_sizeb // 8 - output_xfer_byte = output_tile_elems * output_xfers * out_elem_sizeb // 8 - total_xfer_byte = input_xfer_byte + weight_xfer_byte + output_xfer_byte - return total_xfer_byte - - # Scheduling exploration - OH = (layer.height + 2 * layer.hpad - layer.hkernel) // layer.hstride + 1 - OW = (layer.width + 2 * layer.wpad - layer.wkernel) // layer.wstride + 1 - batch_factors = _find_factors(layer.batch // env.BATCH) - height_factors = _find_factors(OH) - width_factors = _find_factors(OW) - cin_factors = _find_factors(layer.in_filter // env.BLOCK_IN) - cout_factors = _find_factors(layer.out_filter // env.BLOCK_OUT) - ht_factors = [1, 2] - cot_factors = [1, 2] - - # Explore schedules - schedules = [] - for b_f in batch_factors: - for h_f in height_factors: - for w_f in width_factors: - for ci_f in cin_factors: - for co_f in cout_factors: - # FIXME: 2D load pattern matching imposes restrictions on schedule - valid = (w_f == layer.width // layer.wstride) or \ - (w_f != layer.width // layer.wstride and co_f == 1) and \ - ci_f == 1 - if valid: - schedules.append([b_f, h_f, w_f, ci_f, co_f]) - # Filter the schedules that wouldn't work in the available BRAM sizes - inp_elem_sizeb = env.BATCH * env.BLOCK_IN * env.INP_WIDTH - wgt_elem_sizeb = env.BLOCK_IN * env.BLOCK_OUT * env.WGT_WIDTH - out_elem_sizeb = env.BATCH * env.BLOCK_OUT * env.OUT_WIDTH - inp_brams_sizeb = env.INP_BUFF_SIZE * 8 - wgt_brams_sizeb = env.WGT_BUFF_SIZE * 8 - out_brams_sizeb = env.OUT_BUFF_SIZE * 8 - fil_sched = [] 
- xfer_size = [] - for sched in schedules: - b_f, h_f, w_f, ci_f, co_f = sched - for h_t in ht_factors: - for co_t in cot_factors: - # Make sure to filter cases where we apply threading on two axes - # or cases where the threading factors for h and co are not - # factors of h and co - if (h_t == 2 and co_t == 2) or (h_f % h_t != 0) or (co_f % co_t != 0): - continue - # Adjust tile sizes if threading is applied - h_f //= h_t - co_f //= co_t - # Derive tile sizes - input_tile_elems = b_f * \ - ((h_f - 1) * layer.hstride + layer.hkernel) * \ - ((w_f - 1) * layer.wstride + layer.wkernel) * ci_f - weight_tile_elems = layer.hkernel * layer.wkernel * ci_f * co_f - output_tile_elems = b_f * h_f * w_f * co_f +import numpy as np - # Derive valid schedule filter - valid = True - # If in vitrual-threaded mode, only allow for threaded plans - valid &= (vt_only and (h_t == 2 or co_t == 2)) or not vt_only - # Check that we don't exceed input/weight/output capacity - valid &= input_tile_elems * inp_elem_sizeb <= inp_brams_sizeb // (co_t * h_t) - valid &= weight_tile_elems * wgt_elem_sizeb <= wgt_brams_sizeb // (co_t * h_t) - valid &= output_tile_elems * out_elem_sizeb <= out_brams_sizeb // (co_t * h_t) - # Make sure that we don't write to the same acc location within 2 consecutive cycles - valid &= h_f > 2 and w_f > 2 - # TODO: check that we don't exceed instruction or micro-op count - - if valid: - schedule = Schedule(b_factor=b_f, oc_factor=co_f, ic_factor=ci_f, h_factor=h_f, - w_factor=w_f, oc_nthread=co_t, h_nthread=h_t) - fil_sched.append(schedule) - xfer_size.append(_get_data_movement_byte(schedule, layer)) +from ..environment import get_env +from .op import is_packed_layout - if best_only: - return [fil_sched[xfer_size.index(min(xfer_size))]] - return fil_sched +@autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') +def packed_conv2d(cfg, data, kernel, strides, padding, layout, out_dtype): + """ Packed conv2d function.""" + if not is_packed_layout(layout): + raise topi.InvalidShapeError() -def packed_conv2d(data, - kernel, - padding, - strides, - out_dtype="int32"): - """ Packed conv2d function. 
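# Sketch: the new packed_conv2d in this hunk derives the output extent from the
# padded input and kernel sizes, and reports the convolution's FLOP count to
# autotvm (cfg.add_flop) so the tuner can print GFLOPS. Worked numbers for the
# 3x3 / stride-1 ResNet layer used by the tuning script later in this patch:
N, CI, H, W, CO, KH, KW = 1, 64, 56, 56, 64, 3, 3
stride, pad = 1, 1
OH = (H + 2 * pad - KH) // stride + 1         # 56
OW = (W + 2 * pad - KW) // stride + 1         # 56
flops = 2 * N * CO * OH * OW * CI * KH * KW   # 2 ops per multiply-accumulate
print(OH, OW, flops)                          # 56 56 231211008  (~0.23 GFLOP)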
- """ if padding[0]: pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") else: pad_data = data assert len(data.shape) == 6 assert len(kernel.shape) == 6 - oheight = topi.util.simplify((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) - owidth = topi.util.simplify((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) + oheight = topi.util.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) + owidth = topi.util.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4]) ishape = topi.util.get_const_tuple(data.shape) @@ -193,200 +40,16 @@ def packed_conv2d(data, kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), name="res", tag="packed_conv2d") - return res - -@tvm.register_func("nnvm.compiler.build_target", override=True) -def _build(funcs, target, target_host): - tvm_t = tvm.target.create(target) - if tvm_t.device_name == "vta": - return tvm.build(funcs, target="ext_dev", target_host=target_host) - elif tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu": - return tvm.build(funcs, target=target_host) - return tvm.build(funcs, target=target) - - -@tvm.register_func("nnvm.compiler.lower", override=True) -def _lower(sch, inputs, func_name, graph): - import traceback - # pylint: disable=broad-except - try: - f = tvm.lower(sch, inputs, name=func_name) - if "quantized_conv2d" in func_name: - logging.info(graph.ir(join_entry_attrs=["shape"])) - except Exception: - msg = traceback.format_exc() - msg += "Error during compile graph\n" - msg += "--------------------------\n" - msg += graph.ir(join_entry_attrs=["shape"]) - raise RuntimeError(msg) - return f if isinstance( - f, (tvm.container.Array, tuple, list)) else [f] - - -@reg.register_compute("clip", level=15) -def compute_clip(attrs, inputs, _): - """ Clip operator. - """ - x = inputs[0] - a_min = attrs.get_float("a_min") - a_max = attrs.get_float("a_max") - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - with tvm.tag_scope(topi.tag.ELEMWISE): - x = tvm.compute( - x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute( - x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") - return x - -# override to force partition at copy -reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) - -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False - -@reg.register_alter_op_layout("conv2d", level=15) -def alter_conv2d_layout(attrs, inputs, out): - layout = attrs['layout'] - if is_packed_layout(layout): - return None - return _nn.alter_conv2d_layout(attrs, inputs, out) - - -@reg.register_compute("conv2d", level=15) -def compute_conv2d(attrs, inputs, out): - """ 2D convolution algorithm. 
- """ - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - layout = attrs["layout"] - out_dtype = attrs['out_dtype'] - - assert dilation == (1, 1), "not support dilate now" - if is_packed_layout(layout): - if groups == 1: - assert groups == 1 - env = get_env() - assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" - assert env.LOG_OUT_WIDTH == 3, "only support 8bit inp for now" - inputs = list(inputs) - w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) - assert inputs[1].dtype == "int8" - - # Apply bit packing if necessary - if w_pack_factor != 1: - kshape = list(topi.util.get_const_tuple(inputs[1].shape)) - kshape[-1] *= w_pack_factor - inputs[1] = reinterpret(inputs[1], kshape, dtype=env.wgt_dtype) - - return packed_conv2d(inputs[0], inputs[1], - padding, strides, out_dtype=out_dtype) - else: - return packed_group_conv2d(inputs[0], inputs[1], - padding, strides, groups, out_dtype=out_dtype) - return _nn.compute_conv2d(attrs, inputs, out) - - -@reg.register_schedule("conv2d", level=15) -def schedule_conv2d(attrs, outs, target): - """ 2D convolution schedule. - """ - layout = attrs["layout"] - groups = attrs.get_int('groups') - - if is_packed_layout(layout): - target = tvm.target.create(target) - if target.device_name == "vta": - if groups == 1: - return schedule_packed_conv2d(outs) - else: - return schedule_packed_group_conv2d(outs) - elif str(target).startswith("llvm"): - return tvm.create_schedule([x.op for x in outs]) - else: - raise RuntimeError("not support target %s" % target) - return _nn.schedule_conv2d(attrs, outs, target) - - -@reg.register_compute("conv2d_transpose", level=15) -def compute_conv2d_transpose(attrs, inputs, out): - """ 2D convolution algorithm. - """ - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - layout = attrs["layout"] - out_dtype = attrs['out_dtype'] - - assert dilation == (1, 1), "not support dilate now" - if is_packed_layout(layout): - return packed_conv2d_transpose(inputs[0], inputs[1], - padding, strides, - out_dtype=out_dtype) - return _nn.compute_conv2d_transpose(attrs, inputs, out) + cfg.add_flop(2 * np.prod(topi.util.get_const_tuple(oshape)) * + kshape[2] * kshape[3] * ishape[1] * ishape[-1]) + return res -@reg.register_schedule("conv2d_transpose", level=15) -def schedule_conv2d_transpose(attrs, outs, target): - """ 2D convolution schedule. - """ - layout = attrs["layout"] - if is_packed_layout(layout): - target = tvm.target.create(target) - if target.device_name == "vta": - return schedule_packed_conv2d_transpose(outs) - elif str(target).startswith("llvm"): - return tvm.create_schedule([x.op for x in outs]) - else: - raise RuntimeError("not support target %s" % target) - return _nn.schedule_conv2d_transpose(attrs, outs, target) - -def _get_workload(data, pad_data, kernel, output): - """ Get the workload structure. 
- """ - o_shape = topi.util.get_const_tuple(output.shape) - d_shape = topi.util.get_const_tuple(data.shape) - k_shape = topi.util.get_const_tuple(kernel.shape) - o_b, o_c, o_h, o_w, ob_blk, o_blk = o_shape - i_b, i_c, i_h, i_w, ib_blk, i_blk = d_shape - k_o, k_i, k_h, k_w, ko_blk, ki_blk = k_shape - # For now we need to assume that input channel blocking is the same - # as the output channel blocking - assert o_blk == i_blk - assert ob_blk == ib_blk - # Make sure that dimensions match - assert o_b == i_b - assert o_blk == ko_blk - assert i_blk == ki_blk - assert k_o == o_c - assert k_i == i_c - # Scale the channel size - i_b *= ib_blk - i_c *= i_blk - o_c *= o_blk - if pad_data is not None: - p_shape = topi.util.get_const_tuple(pad_data.shape) - h_pad = (p_shape[2] - d_shape[2]) // 2 - w_pad = (p_shape[3] - d_shape[3]) // 2 - else: - h_pad, w_pad = 0, 0 - h_str = (i_h + h_pad*2 - k_h) // (o_h - 1) - w_str = (i_w + w_pad*2 - k_w) // (o_w - 1) - return Workload(i_b, i_h, i_w, i_c, o_c, k_h, k_w, h_pad, w_pad, h_str, w_str) - -def schedule_packed_conv2d(outs, planStr=None, skip_load_inp=False, skip_load_wgt=False, - skip_load_acc=False, skip_store_out=False, skip_alu=False, - skip_gemm=False): - """ Schedule the packed conv2d. - """ +@autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') +def schedule_packed_conv2d(cfg, outs, + skip_load_inp=False, skip_load_wgt=False, skip_load_acc=False, + skip_store_out=False, skip_alu=False, skip_gemm=False): assert len(outs) == 1 output = outs[0] ewise_inputs = [] @@ -410,6 +73,19 @@ def _traverse(op): _traverse(output.op) assert len(conv2d_res) == 1 conv2d_stage = conv2d_res[0].output(0) + s = tvm.create_schedule(output.op) + + ##### space definition begin ##### + b, co, h, w, bi, ci = s[conv2d_stage].op.axis + ci, kh, kw, bci = s[conv2d_stage].op.reduce_axis + cfg.define_split('tile_b', b, num_outputs=2) + cfg.define_split('tile_h', h, num_outputs=2) + cfg.define_split('tile_w', w, num_outputs=2) + cfg.define_split('tile_ci', ci, num_outputs=2) + cfg.define_split('tile_co', co, num_outputs=2) + cfg.define_knob('oc_nthread', [1, 2]) + cfg.define_knob('h_nthread', [1, 2]) + ###### space definition end ###### data, kernel = conv2d_stage.op.input_tensors if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: @@ -418,34 +94,9 @@ def _traverse(op): data = temp else: pad_data = None - wrkld = _get_workload(data, pad_data, kernel, output) - - if wrkld in _SCHEDULE_STR_MAP and planStr is None: - planStr = _SCHEDULE_STR_MAP[wrkld] - logging.info("Apply pre-cached schedule for %s->%s", str(wrkld) , planStr) - if planStr: - matchObj = re.match( r'b(\d+)_oc(\d+)_ic(\d+)_h(\d+)_w(\d+)_oct(\d+)_ht(\d+)', planStr) - b_factor = int(matchObj.group(1)) - oc_factor = int(matchObj.group(2)) - ic_factor = int(matchObj.group(3)) - h_factor = int(matchObj.group(4)) - w_factor = int(matchObj.group(5)) - oc_nthread = int(matchObj.group(6)) - h_nthread = int(matchObj.group(7)) - plan = Schedule(b_factor=b_factor, - oc_factor=oc_factor, - ic_factor=ic_factor, - h_factor=h_factor, - w_factor=w_factor, - oc_nthread=oc_nthread, - h_nthread=h_nthread) - else: - plan = find_schedules(wrkld, vt_only=True, best_only=True)[0] - logging.info("Trying to find plan for %s", wrkld) env = get_env() mock = env.mock - load_inp = mock.dma_copy if skip_load_inp else env.dma_copy load_wgt = mock.dma_copy if skip_load_wgt else env.dma_copy load_acc = mock.dma_copy if skip_load_acc else env.dma_copy @@ -453,9 +104,8 @@ def _traverse(op): alu = mock.alu if 
skip_alu else env.alu gemm = mock.gemm if skip_gemm else env.gemm - # schedule1 + # schedule oshape = topi.util.get_const_tuple(output.shape) - s = tvm.create_schedule(output.op) # setup pad if pad_data is not None: @@ -465,28 +115,23 @@ def _traverse(op): cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) s[conv2d_stage].set_scope(env.acc_scope) + # cache read input cache_read_ewise = [] - for consumer, tensor in ewise_inputs: cache_read_ewise.append( s.cache_read(tensor, env.acc_scope, [consumer])) + # set ewise scope for op in ewise_ops: s[op].set_scope(env.acc_scope) s[op].pragma(s[op].op.axis[0], alu) - # tile - oc_factor = (plan.oc_factor if plan.oc_factor - else plan.out_filter // env.BLOCK_OUT) - h_factor = (plan.h_factor if plan.h_factor else oshape[2]) - w_factor = (plan.w_factor if plan.w_factor else oshape[3]) - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - x_co0, x_co1 = s[output].split(x_co, factor=oc_factor) - x_i0, x_i1 = s[output].split(x_i, factor=h_factor) - x_j0, x_j1 = s[output].split(x_j, factor=w_factor) + x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) + x_i0, x_i1 = cfg['tile_h'].apply(s, output, x_i) + x_j0, x_j1 = cfg['tile_w'].apply(s, output, x_j) s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) store_pt = x_j0 @@ -500,14 +145,14 @@ def _traverse(op): s[tensor].pragma(s[tensor].op.axis[0], load_acc) # virtual threading along output channel axes - if plan.oc_nthread > 1: - _, v_t = s[output].split(x_co0, factor=plan.oc_nthread) + if cfg['oc_nthread'].val > 1: + _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) s[output].bind(v_t, tvm.thread_axis("cthread")) # virtual threading along spatial rows - if plan.h_nthread > 1: - _, v_t = s[output].split(x_i0, factor=plan.h_nthread) + if cfg['h_nthread'].val > 1: + _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) s[output].reorder(v_t, x_bo) s[output].bind(v_t, tvm.thread_axis("cthread")) @@ -515,10 +160,9 @@ def _traverse(op): k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis s[conv2d_stage].reorder(x_bo, k_o, x_j, d_j, d_i, x_co, x_i, x_bi, x_ci, k_i) - if plan.ic_factor: - k_o, _ = s[conv2d_stage].split(k_o, factor=plan.ic_factor) - s[cdata].compute_at(s[conv2d_stage], k_o) - s[ckernel].compute_at(s[conv2d_stage], k_o) + k_o, _ = cfg['tile_ci'].apply(s, conv2d_stage, k_o) + s[cdata].compute_at(s[conv2d_stage], k_o) + s[ckernel].compute_at(s[conv2d_stage], k_o) # Use VTA instructions s[cdata].pragma(s[cdata].op.axis[0], load_inp) @@ -527,31 +171,3 @@ def _traverse(op): s[output].pragma(x_co1, store_out) return s -class Conv2DSchedule(object): - """ 2D convolution schedule object. 
- """ - def __init__(self, - b_factor=1, - oc_factor=1, - ic_factor=1, - h_factor=1, - w_factor=0, - oc_nthread=0, - h_nthread=0, - debug_sync=False): - self.b_factor = b_factor - self.oc_factor = oc_factor - self.ic_factor = ic_factor - self.h_factor = h_factor - self.w_factor = w_factor - self.oc_nthread = oc_nthread - self.h_nthread = h_nthread - self.debug_sync = debug_sync - - def __str__(self): - return "{}.{}.{}.{}.{}.{}.{}".format( - self.b_factor, self.oc_factor, self.ic_factor, - self.h_factor, self.w_factor, - self.oc_nthread, self.h_nthread) - -Schedule = Conv2DSchedule diff --git a/vta/scripts/tune_conv.py b/vta/scripts/tune_conv.py index 95f13cc54f5a3..6b1740092b160 100644 --- a/vta/scripts/tune_conv.py +++ b/vta/scripts/tune_conv.py @@ -1,168 +1,25 @@ -"""Tuning a conv2d operator """ -import tvm -import sys +"""Tuning a single conv2d operator""" import logging + +import tvm from tvm import autotvm from tvm.contrib.util import get_lower_ir import topi import vta import vta.testing -from vta.top.testing import my_clip env = vta.get_env() -def vta_build_func(measure_input, tmp_dir, **kwargs): - import time - import os - from tvm.autotvm.measure.measure_methods import BuildResult - from random import getrandbits - from tvm.autotvm.util import get_const_tuple - tic = time.time() - try: - filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) - target, task, config = measure_input - - with target: - s, args = task.instantiate(config) - if not config.valid(): - raise InstantiationError(config.errors) - - func = vta.build(s, args, target='ext_dev', target_host=task.target_host) - - arg_info = tuple((get_const_tuple(x.shape), x.dtype) for x in args) - func.export_library(filename) - except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) - - -def schedule_packed_conv2d(cfg, outs, - skip_load_inp=False, skip_load_wgt=False, skip_load_acc=False, - skip_store_out=False, skip_alu=False, skip_gemm=False): - """Schedule the packed conv2d. 
- """ - assert len(outs) == 1 - output = outs[0] - ewise_inputs = [] - ewise_ops = [] - conv2d_res = [] - assert output.op.input_tensors[0].dtype == "int32" - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - ewise_ops.append(op) - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.PlaceholderOp): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - assert op.tag == "packed_conv2d" - conv2d_res.append(op) - - _traverse(output.op) - assert len(conv2d_res) == 1 - conv2d_stage = conv2d_res[0].output(0) - s = tvm.create_schedule(output.op) - - ##### space definition begin ##### - b, co, h, w, bi, ci = s[conv2d_stage].op.axis - ci, kh, kw, bci = s[conv2d_stage].op.reduce_axis - cfg.define_split('tile_b', b, num_outputs=2) - cfg.define_split('tile_h', h, num_outputs=2) - cfg.define_split('tile_w', w, num_outputs=2) - cfg.define_split('tile_ci', ci, num_outputs=2) - cfg.define_split('tile_co', co, num_outputs=2) - cfg.define_knob('oc_nthread', [1, 2]) - cfg.define_knob('h_nthread', [1, 2]) - ###### space definition end ###### - - data, kernel = conv2d_stage.op.input_tensors - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - temp = data.op.input_tensors[0] - pad_data = data - data = temp - else: - pad_data = None - - mock = env.mock - load_inp = mock.dma_copy if skip_load_inp else env.dma_copy - load_wgt = mock.dma_copy if skip_load_wgt else env.dma_copy - load_acc = mock.dma_copy if skip_load_acc else env.dma_copy - store_out = mock.dma_copy if skip_store_out else env.dma_copy - alu = mock.alu if skip_alu else env.alu - gemm = mock.gemm if skip_gemm else env.gemm - - # schedule - oshape = topi.util.get_const_tuple(output.shape) - - # setup pad - if pad_data is not None: - cdata = pad_data - s[pad_data].set_scope(env.inp_scope) - else: - cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) - ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) - s[conv2d_stage].set_scope(env.acc_scope) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - cache_read_ewise.append( - s.cache_read(tensor, env.acc_scope, [consumer])) - - # set ewise scope - for op in ewise_ops: - s[op].set_scope(env.acc_scope) - s[op].pragma(s[op].op.axis[0], alu) - - # tile - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) - x_i0, x_i1 = cfg['tile_h'].apply(s, output, x_i) - x_j0, x_j1 = cfg['tile_w'].apply(s, output, x_j) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - - # set all compute scopes - s[conv2d_stage].compute_at(s[output], store_pt) - for op in ewise_ops: - s[op].compute_at(s[output], store_pt) - - for tensor in cache_read_ewise: - s[tensor].compute_at(s[output], store_pt) - s[tensor].pragma(s[tensor].op.axis[0], load_acc) - - # virtual threading along output channel axes - if cfg['oc_nthread'].val > 1: - _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) - - # virtual threading along spatial rows - if cfg['h_nthread'].val > 1: - _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) - - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis - k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis - s[conv2d_stage].reorder(x_bo, k_o, x_j, d_j, d_i, x_co, x_i, x_bi, x_ci, k_i) - - k_o, 
_ = cfg['tile_ci'].apply(s, conv2d_stage, k_o) - s[cdata].compute_at(s[conv2d_stage], k_o) - s[ckernel].compute_at(s[conv2d_stage], k_o) - - # Use VTA instructions - s[cdata].pragma(s[cdata].op.axis[0], load_inp) - s[ckernel].pragma(s[ckernel].op.axis[0], load_wgt) - s[conv2d_stage].tensorize(x_bi, gemm) - s[output].pragma(x_co1, store_out) - return s - -@autotvm.template +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype): data_shape = (N//env.BATCH, CI//env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN) kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) @@ -172,41 +29,42 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype): OW = (W + 2 * padding[1] - KW) // strides[1] + 1 data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - bias = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype) - - w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) - kernel_shape_pack = kernel_shape[:-1] + (kernel_shape[-1] // w_pack_factor,) - kernel_arg = tvm.placeholder(kernel_shape_pack, dtype="int8", name="kernel_arg") - kernel = vta.reinterpret(kernel_arg, kernel_shape, dtype=env.wgt_dtype) - - res_conv = vta.top.packed_conv2d(data, kernel, padding=padding, strides=strides) - res = topi.right_shift(res_conv, 8) - res = topi.add(res, bias) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") + bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + + with tvm.target.vta(): + res = topi.nn.conv2d(data, kernel, padding=padding, strides=strides, + layout='NCHW%dn%dc' % (env.BATCH, env.BLOCK_IN), out_dtype='int32') + res = topi.add(res, bias) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) - cfg = autotvm.get_config() - s = schedule_packed_conv2d(cfg, [res]) - cfg.add_flop(2 * N * CI * OH * OW * CO * KH * KW) - return s, [data, kernel_arg, bias, res] + return s, [data, kernel, bias, res] if __name__ == '__main__': + model = env.TARGET N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype = \ 1, 64, 56, 56, 64, 3, 3, (1, 1), (1, 1), 'int8', 'int32' task = autotvm.task.create(conv2d, args=(N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype), - target='ext_dev', target_host=env.target_host) + target=tvm.target.vta(model), target_host=env.target_host, template_key='direct') print(task.config_space) # logging config (for printing tuning log to the screen) + logging.basicConfig() logging.getLogger('autotvm').setLevel(logging.DEBUG) - logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout)) measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func=vta_build_func), - runner=autotvm.RPCRunner( - 'ultra96', 'fleet', 9190)) + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(model, 'fleet', 9190, number=4, repeat=3, timeout=30, + 
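# Sketch: tuner.tune() just below writes its measurements to a log file when an
# autotvm.callback.log_to_file callback is attached. Afterwards the best record
# can be applied when building the operator for real; 'conv2d.log' is an assumed
# file name, everything else comes from the script above.
with autotvm.apply_history_best('conv2d.log'):
    s, arg_bufs = conv2d(N, CI, H, W, CO, KH, KW, strides, padding,
                         in_dtype, out_dtype)
    mod = vta.build(s, arg_bufs, target=env.target, target_host=env.target_host)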
check_correctness=True)) tuner = autotvm.tuner.RandomTuner(task) tuner.tune(n_trial=len(task.config_space), diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py new file mode 100644 index 0000000000000..f3a64323723e7 --- /dev/null +++ b/vta/scripts/tune_resnet.py @@ -0,0 +1,224 @@ +import os +import argparse +import time + +import numpy as np + +import tvm +from tvm import rpc, autotvm +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.contrib import graph_runtime, util +from tvm.contrib.download import download + +import topi +import nnvm.compiler +import vta +import vta.testing + +env = vta.get_env() + +def register_vta_tuning_tasks(): + from tvm.autotvm.task.nnvm_integration import TaskExtractEnv, deserialize_args + + @tvm.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + # init old env + TaskExtractEnv([]) + + @autotvm.task.register("topi_nn_conv2d", override=True) + def _topi_nn_conv2d(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + + with tvm.target.vta(): + res = topi.nn.conv2d(*args, **kwargs) + res = topi.right_shift(res, 8) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.current_target().device_name == 'vta': + s = topi.generic.schedule_conv2d_nchw([res]) + else: + s = tvm.create_schedule([res.op]) + return s, [A, W, res] + + +def generate_graph(sym, params, target, target_host): + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + # Compile NNVM graph + with nnvm.compiler.build_config(opt_level=3): + with vta.build_config(): + graph, lib, params = nnvm.compiler.build( + sym, target, shape_dict, dtype_dict, + params=params, target_host=target_host) + + return graph, lib, params + + +def extract_tasks(sym, params, target, target_host): + # Populate the shape and data type dictionary + shape_dict = {"data": (1, 3, 224, 224)} + dtype_dict = {"data": 'float32'} + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Apply NNVM graph optimization passes + sym = vta.graph.clean_cast(sym) + sym = vta.graph.clean_conv_fuse(sym) + assert env.BLOCK_IN == env.BLOCK_OUT + sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT) + + with vta.build_config(): + tasks = autotvm.task.extract_from_graph(sym, target=target, target_host=target_host, + shape=shape_dict, dtype=dtype_dict, symbols=(nnvm.sym.conv2d,)) + return tasks + + +def download_model(): + url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" + categ_fn = 'synset.txt' + graph_fn = 'resnet18_qt8.json' + params_fn = 'resnet18_qt8.params' + data_dir = 
'_data' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + for file in [categ_fn, graph_fn, params_fn]: + if not os.path.isfile(file): + download(os.path.join(url, file), os.path.join(data_dir, file)) + + sym = nnvm.graph.load_json(open(os.path.join(data_dir, graph_fn)).read()) + params = nnvm.compiler.load_param_dict(open(os.path.join(data_dir, params_fn), 'rb').read()) + + return sym, params + + +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=1000, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True, + try_winograd=True): + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + n_trial_ = min(n_trial, len(tsk.config_space)) + tuner_obj.tune(n_trial_, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial_, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file)]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + +if __name__ == '__main__': + device_key = env.TARGET + + tuning_opt = { + 'log_filename': 'resnet-18.log', + + 'tuner': 'random', + 'n_trial': 1e9, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func), + runner=autotvm.RPCRunner(device_key, 'fleet', 9190, + number=4, repeat=3, timeout=60, + check_correctness=True)) + } + + # download model + sym, params = download_model() + + # extract tasks + register_vta_tuning_tasks() + + print("Extract tasks...") + target = tvm.target.vta(device_key) + target_host = env.target_host + tasks = extract_tasks(sym, params, target, target_host) + + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.tophub.context(target, extra_files=[tuning_opt['log_filename']]): + print("Compile...") + graph, lib, params = generate_graph(sym, params, target, target_host) + input_shape = (1, 3, 224, 224) + dtype = 'float32' + + # export library + tmp = util.tempdir() + filename = "net.tar" + lib.export_library(tmp.relpath(filename)) + + # upload module to device + print("Upload...") + remote = autotvm.measure.request_remote(device_key, 'fleet', 9190, timeout=10000) + remote.upload(tmp.relpath(filename)) + rlib = remote.load_module(filename) + + # upload parameters to device + ctx = remote.context(str(target), 0) + rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module = graph_runtime.create(graph, rlib, ctx) + module.set_input('data', data_tvm) + module.set_input(**rparams) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=3, repeat=3) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond 
+ print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc index e159cdfe77a5f..1c377fea61188 100644 --- a/vta/src/runtime.cc +++ b/vta/src/runtime.cc @@ -901,12 +901,12 @@ class CommandQueue { insn_queue_.InitSpace(); device_ = VTADeviceAlloc(); CHECK(device_ != nullptr); - printf("Initialize VTACommandHandle...\n"); + //printf("Initialize VTACommandHandle...\n"); } ~CommandQueue() { VTADeviceFree(device_); - printf("Close VTACommandhandle...\n"); + //printf("Close VTACommandhandle...\n"); } uint32_t GetElemBytes(uint32_t memory_id) { diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index fb3b350f129b9..21dfe26974038 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -1,5 +1,11 @@ """Testing if we can generate code in topi style""" +import os +import json +from collections import namedtuple + +import numpy as np + import tvm from tvm import autotvm from tvm.contrib import util @@ -8,155 +14,333 @@ import topi.testing import vta import vta.testing -from vta.top.testing import run_vta_conv2d, my_clip -import numpy as np -import os -import json -Workload = vta.top.vta_conv2d.Workload +Workload = namedtuple("Conv2DWorkload", + ['batch', 'height', 'width', 'in_filter', 'out_filter', + 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) +# ResNet18 workloads +resnet_wkls = [ + # Workloads of resnet18 on imagenet + # Workload(env.BATCH, 224, 224, env.BLOCK_IN, 64, 7, 7, 3, 3, 2, 2), + ('resnet-18.C2', Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), + #('resnet-18.C3', Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1)), # this is a workload from a wrong model + ('resnet-18.C4', Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C5', Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C6', Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C7', Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C8', Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C9', Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)), + ('resnet-18.C10', Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)), + ('resnet-18.C11', Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)), + ('resnet-18.C12', Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)), +] -def test_cpu_conv2d(): - def run_cpu_conv2d(env, remote, key, batch_size, wl): - data_shape = (batch_size, wl.in_filter, wl.height, wl.width) - kernel_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) +def run_cpu_conv2d(env, remote, wl, target): + data_shape = (wl.batch, wl.in_filter, wl.height, wl.width) + kernel_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) + fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 + fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + + with target: res_conv = topi.nn.conv2d( - data, kernel, padding=(wl.hpad, wl.wpad), - 
strides=(wl.hstride, wl.wstride), - out_dtype="int32") + data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), "NCHW", "int32") res = topi.right_shift(res_conv, 8) res = my_clip(res, 0, 127) res = topi.cast(res, "int8") + s = topi.generic.schedule_conv2d_nchw([res]) - # To compute number of ops, use a x2 factor for FMA - num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter + # To compute number of ops, use a x2 factor for FMA + num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter - a_shape = (batch_size, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - stride = (wl.hstride, wl.wstride) - data_dtype = data.dtype - kernel_dtype = kernel.dtype - acc_dtype = env.acc_dtype + # get reference data + a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) + w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) + stride = (wl.hstride, wl.wstride) + data_dtype = data.dtype + kernel_dtype = kernel.dtype + acc_dtype = env.acc_dtype + padding = wl.hpad + + @memoize("vta.tests.test_benchmark_topi.conv2d.cpu.verify_nhwc") + def get_ref_data(): + a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) + w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) + a_np = np.abs(a_np) + w_np = np.abs(w_np) + b_np = topi.testing.conv2d_nchw_python( + a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) + return a_np, w_np, b_np + + check_correctness = True + print_ir = False + + # build + mod = tvm.build(s, [data, kernel, res], target=target, target_host=env.target_host, name="conv2d") + temp = util.tempdir() + mod.save(temp.relpath("conv2d.o")) + remote.upload(temp.relpath("conv2d.o")) + f = remote.load_module("conv2d.o") + + # run + ctx = remote.context(str(target)) + data_orig, kernel_orig, res_ref = get_ref_data() + res_shape = topi.util.get_const_tuple(res.shape) + res_np = np.zeros(res_shape).astype(res.dtype) + + data_arr = tvm.nd.array(data_orig, ctx) + kernel_arr = tvm.nd.array(kernel_orig, ctx) + res_arr = tvm.nd.array(res_np, ctx) + time_f = f.time_evaluator("conv2d", ctx, number=4) + cost = time_f(data_arr, kernel_arr, res_arr) + res_unpack = res_arr.asnumpy() + + # verify + if check_correctness: assert wl.hpad == wl.wpad + stride = (wl.hstride, wl.wstride) padding = wl.hpad + res_ref = res_ref >> 8 + res_ref = np.clip(res_ref, 0, 127) + res_ref = res_ref.astype("int8") + np.testing.assert_allclose(res_unpack, res_ref) - @memoize("vta.tests.test_benchmark_topi.conv2d.cpu.verify_nhwc") - def get_ref_data(): - a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype) - w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype) - a_np = np.abs(a_np) - w_np = np.abs(w_np) - b_np = topi.testing.conv2d_nchw_python( - a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) - return a_np, w_np, b_np - - - def verify(s, check_correctness): - mod = tvm.build(s, [data, kernel, res], - target_host=env.target_host, - name="conv2d") - temp = util.tempdir() - mod.save(temp.relpath("conv2d.o")) - remote.upload(temp.relpath("conv2d.o")) - f = remote.load_module("conv2d.o") - # verify - ctx = remote.cpu(0) - # Data in original format - data_orig, kernel_orig, res_ref = get_ref_data() - res_shape = topi.util.get_const_tuple(res.shape) - res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_orig, ctx) - kernel_arr = 
tvm.nd.array(kernel_orig, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=5) - cost = time_f(data_arr, kernel_arr, res_arr) - res_unpack = res_arr.asnumpy() - if check_correctness: - assert wl.hpad == wl.wpad - stride = (wl.hstride, wl.wstride) - padding = wl.hpad - res_ref = res_ref >> 8 - res_ref = np.clip(res_ref, 0, 127) - res_ref = res_ref.astype("int8") - np.testing.assert_allclose(res_unpack, res_ref) - return cost - - def conv_normal(print_ir): - print("----- CONV2D CPU End-to-End Test -------") - s = topi.generic.schedule_conv2d_nchw([res]) - if print_ir: - print(tvm.lower(s, [data, kernel, res], simple_mode=True)) - cost = verify(s, True) - gops = (num_ops / cost.mean) / float(10 ** 9) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) - - conv_normal(False) + if print_ir: + print(tvm.lower(s, [data, kernel, res], simple_mode=True)) + gops = (num_ops / cost.mean) / float(10 ** 9) + print("CPU TEST: Time cost = %g sec/op, %g GOPS\n" % (cost.mean, gops)) + +def test_cpu_conv2d(): def _run(env, remote): - # ResNet18 workloads - resnet = { - # Workloads of resnet18 on imagenet - 0: Workload(1, 224, 224, 16, 64, 7, 7, 3, 3, 2, 2), - 1: Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - 2: Workload(1, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - 3: Workload(1, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), - 4: Workload(1, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - 5: Workload(1, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - 6: Workload(1, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - 7: Workload(1, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - 8: Workload(1, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - 9: Workload(1, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - 10: Workload(1, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - 11: Workload(1, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), - } - batch_size = 1 - for i in range(1, len(resnet)): - wl = resnet[i] - key = "resnet-cfg[%d]" % i - print("key=%s" % key) - print(wl) - with tvm.target.create(env.target_vta_cpu): - run_cpu_conv2d(env, remote, key, batch_size, wl) - - # load pre-tuned operator parameters for ARM CPU - with autotvm.tophub.context('llvm -device=arm_cpu'): - vta.testing.run(_run) + target = env.target_vta_cpu + + with autotvm.tophub.context(target): # load pre-tuned operator parameters for ARM CPU + for name, wl in resnet_wkls: + print(name, wl) + run_cpu_conv2d(env, remote, wl, target) + + vta.testing.run(_run) + + +def _sign_extend(value, bits): + sign_bit = 1 << (bits - 1) + return (value & (sign_bit - 1)) - (value & sign_bit) +_vector_sign_extend = np.vectorize(_sign_extend) + + +def _pack(x, width): + assert(len(x.shape)==6) + assert(x.dtype=="int8") + pack_factor = 8 // width + mask = ((1 << width) - 1) + + s = x.shape + s_reshape = s[:-1] + (s[-1] // pack_factor, pack_factor) + s_pack = s[:-1] + (s[-1] // pack_factor,) + x_reshape = x.reshape(s_reshape) + x_packed = np.zeros(s_pack, dtype="int8") + for i in range(0, pack_factor): + x_packed |= (x_reshape[:,:,:,:,:,:,i] & mask) << (i * width) + + return x_packed + +def _unpack(x, width): + assert(len(x.shape)==6) + assert(x.dtype=="int8") + pack_factor = 8 // width + mask = ((1 << width) - 1) + + s = x.shape + x_unpack = np.zeros(s[:] + (pack_factor,), dtype=x.dtype) + for i in range(0, pack_factor): + x_unpack[:,:,:,:,:,:,i] = _vector_sign_extend(((x >> (i * width)) & mask), width) + + return x_unpack.reshape(s[:-1] + (s[-1] * pack_factor,)) + +@tvm.tag_scope(tag=topi.tag.ELEMWISE) +def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + 
const_min = tvm.const(a_min, x.dtype) + const_max = tvm.const(a_max, x.dtype) + x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") + x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + return x + + +def run_vta_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, + plan_str=None, samples=4, profileOnly=False, + skip_load_inp=False, skip_load_wgt=False, skip_load_acc=False, + skip_store_out=False, skip_alu=False, skip_gemm=False): + + data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, + wl.height, wl.width, env.BATCH, env.BLOCK_IN) + kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, + wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) + bias_shape = (wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT, + 1, 1, env.BATCH, env.BLOCK_OUT) + data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) + bias = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype) + + # Handle quantized inputs (less than 8 bits) + # x_pack_factor = 1 << (3 - env.LOG_INP_WIDTH) + # data_shape_pack = data_shape[:-1] + (data_shape[-1]//x_pack_factor,) + # data_arg = tvm.placeholder( + # data_shape_pack, + # dtype="int8", name="data_arg") + # data = vta.reinterpret(data_arg, data_shape, dtype=env.inp_dtype) + + # Handle quantized kernels (less than 8 bits) + w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) + kernel_shape_pack = kernel_shape[:-1] + (kernel_shape[-1]//w_pack_factor,) + kernel_arg = tvm.placeholder(kernel_shape_pack, dtype="int8", name="kernel_arg") + kernel = vta.reinterpret(kernel_arg, kernel_shape, dtype=env.wgt_dtype) + + with target: + res_conv = topi.nn.conv2d( + data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), + "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN), 'int32') + res = topi.right_shift(res_conv, 8) + res = topi.add(res, bias) + res = my_clip(res, 0, (1 << env.OUT_WIDTH-1)-1) + res = topi.cast(res, "int8") + + s = topi.generic.schedule_conv2d_nchw([res]) + #planStr=plan_str, + #skip_load_inp=skip_load_inp, + #skip_load_wgt=skip_load_wgt, + #skip_load_acc=skip_load_acc, + #skip_store_out=skip_store_out, + #skip_alu=skip_alu, + #skip_gemm=skip_gemm) + if print_ir: + print(vta.lower(s, [data, kernel_arg, bias, res], simple_mode=True)) + + # Handle quantized outputs (less than 8 bits) + # o_pack_factor = 1 << (3 - env.LOG_OUT_WIDTH) + res_shape = topi.util.get_const_tuple(res.shape) + # res_shape_pack = res_shape[:-1] + (res_shape[-1]//o_pack_factor,) + # res_arg = vta.reinterpret(res, res_shape_pack, dtype="int8") + + # generate referene data + fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 + fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 + num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter + + a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) + w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) + stride = (wl.hstride, wl.wstride) + data_dtype, kernel_dtype, acc_dtype = data.dtype, kernel.dtype, env.acc_dtype + padding = wl.hpad + INP_WIDTH, WGT_WIDTH = env.INP_WIDTH, env.WGT_WIDTH + + @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") + def get_ref_data(): + # derive min max for input and weight types (max non inclusive) + a_min, a_max = 0 - (1 << (INP_WIDTH - 1)), (1 << (INP_WIDTH - 1)) + w_min, w_max = 0 - (1 << (WGT_WIDTH - 1)), (1 << (WGT_WIDTH - 1)) + a_np = np.random.randint(a_min, a_max, size=a_shape).astype("int8") + w_np = 
np.random.randint(w_min, w_max, size=w_shape).astype("int8") + b_np = topi.testing.conv2d_nchw_python( + a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) + return a_np, w_np, b_np + + mod = vta.build(s, + [data, kernel_arg, bias, res], + target, + env.target_host, name="conv2d") + + # Data in original format + data_orig, kernel_orig, res_ref = get_ref_data() + bias_orig = (np.random.uniform(size=(wl.batch, wl.out_filter,)) * (1 << (env.INP_WIDTH + env.WGT_WIDTH - 2))) + bias_orig = bias_orig.astype("int32") + bias_orig = np.abs(bias_orig) + + data_packed = data_orig.reshape( + wl.batch//env.BATCH, env.BATCH, + wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, + wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) + kernel_packed = kernel_orig.reshape( + wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, + wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, + wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3)) + bias_packed = bias_orig.reshape( + wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, + 1, 1, env.BATCH, env.BLOCK_OUT) + + # Quantized packing + data_qpacked = _pack(data_packed, env.INP_WIDTH) + kernel_qpacked = _pack(kernel_packed, env.WGT_WIDTH) + + # Upload + temp = util.tempdir() + mod.save(temp.relpath("conv2d.o")) + remote.upload(temp.relpath("conv2d.o")) + f = remote.load_module("conv2d.o") + + ctx = remote.context(str(target)) + res_np = np.zeros(res_shape).astype(res.dtype) + data_arr = tvm.nd.array(data_qpacked, ctx) + kernel_arr = tvm.nd.array(kernel_qpacked, ctx) + bias_arr = tvm.nd.array(bias_packed, ctx) + res_arr = tvm.nd.array(res_np, ctx) + time_f = f.time_evaluator("conv2d", ctx, number=samples) + + # In sim mode, collect simulator runtime statistics + stats = {} + cost = None + if env.TARGET == "sim": + # Check if we're in local RPC mode (allows us to rebuild the + # runtime on the fly when varying the VTA designs) + local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) + if local_rpc: + remote.get_function("vta.simulator.profiler_clear")() + if profileOnly: + remote.get_function("vta.simulator.profiler_debug_mode")(1) + cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) + else: + simulator.clear_stats() + if profileOnly: + simulator.debug_mode(1) + cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + stats = simulator.stats() + else: + cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + + # Check correctness + correct = False + if check_correctness: + res_unpack = res_arr.asnumpy() + res_unpack = _unpack(res_unpack.astype("int8"), env.OUT_WIDTH) + res_unpack = res_unpack.transpose( + (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width) + assert wl.hpad == wl.wpad + stride = (wl.hstride, wl.wstride) + padding = wl.hpad + res_ref = res_ref >> 8 + res_ref += bias_orig.reshape(wl.out_filter, 1, 1) + res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH-1)-1) + res_ref = res_ref.astype("int8") + correct = np.allclose(res_unpack, res_ref) + + gops = (num_ops / cost.mean) / float(10 ** 9) + print("VTA TEST: Time cost = %g sec/op, %g GOPS\n" % (cost.mean, gops)) + + return correct, cost, stats + def test_vta_conv2d(): def _run(env, remote): - # ResNet18 workloads - resnet = { - # Workloads of resnet18 on imagenet - 0: Workload(env.BATCH, 224, 224, env.BLOCK_IN, 64, 7, 7, 3, 3, 2, 2), - 1: Workload(env.BATCH, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), - 2: Workload(env.BATCH, 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), - 3: Workload(env.BATCH, 56, 56, 64, 128, 
3, 3, 1, 1, 2, 2), - 4: Workload(env.BATCH, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), - 5: Workload(env.BATCH, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), - 6: Workload(env.BATCH, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), - 7: Workload(env.BATCH, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), - 8: Workload(env.BATCH, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), - 9: Workload(env.BATCH, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), - 10: Workload(env.BATCH, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), - 11: Workload(env.BATCH, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), - } - - for i in range(0, len(resnet)): - wl = resnet[i] - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter - print("----- CONV2D End-to-End Test -------") - print(wl) - (correct, cost, stats) = run_vta_conv2d(env, remote, wl) - assert(correct) - gops = (num_ops / cost.mean) / float(10 ** 9) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) + target = env.target + + with autotvm.tophub.context(target): # load pre-tuned operator parameters for ARM CPU + for name, wl in resnet_wkls: + print(name, wl) + run_vta_conv2d(env, remote, wl, target) vta.testing.run(_run) @@ -164,3 +348,4 @@ def _run(env, remote): if __name__ == "__main__": # test_cpu_conv2d() test_vta_conv2d() + diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py index 2900b383ac8c3..faa24e137c8d5 100644 --- a/vta/tutorials/resnet.py +++ b/vta/tutorials/resnet.py @@ -59,7 +59,7 @@ def process_image(image): # Takes in the graph runtime, and an image, and returns top result and time def classify(m, image): m.set_input('data', image) - timer = m.module.time_evaluator("run", ctx, number=1) + timer = m.module.time_evaluator("run", ctx, number=3) tcost = timer() tvm_output = m.get_output(0) top = np.argmax(tvm_output.asnumpy()[0]) @@ -108,7 +108,6 @@ def generate_graph(graph_fn, params_fn, target): params=params, target_host=target_host) # Save the compiled inference graph library - assert tvm.module.enabled("rpc") temp = util.tempdir() lib.save(temp.relpath("graphlib.o")) @@ -193,15 +192,11 @@ def generate_graph(graph_fn, params_fn, target): # ------------------------ # Build the ResNet graph runtime, and configure the parameters. -# llvm_command = 'llvm -device=arm_cpu -model=pynq {}'.format(env.llvm_triple) # run arm cpu on pynq -# llvm_command = 'llvm -device=arm_cpu -model=ultra96 {}'.format(env.llvm_triple) # run arm cpu on ultra96 -# llvm_command = 'llvm -device=vta -model=pynq {}'.format(env.llvm_triple) # run vta cpu on pynq -llvm_command = 'llvm -device=vta -model=ultra96 {}'.format(env.llvm_triple) # run vta cpu on ultra96 - -target = tvm.target.create(llvm_command) +#target = tvm.target.arm_cpu(env.TARGET) # run on arm cpu +target = tvm.target.vta(env.TARGET) # Device context -ctx = remote.ext_dev(0) if target.device_name == "vta" else remote.cpu(0) +ctx = remote.context(str(target)) # Build the graph runtime graph, lib, params = generate_graph(os.path.join(data_dir, graph_fn), @@ -231,7 +226,7 @@ def generate_graph(graph_fn, params_fn, target): m.set_input('data', image) # Perform inference -timer = m.module.time_evaluator("run", ctx, number=1) +timer = m.module.time_evaluator("run", ctx, number=4) tcost = timer() # Get classification results
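
Note on usage (not part of the patch itself): the single-operator tuning flow added here can be driven roughly as in the sketch below. It condenses the __main__ sections of vta/scripts/tune_conv.py and vta/scripts/tune_resnet.py above; it assumes the conv2d template function defined in tune_conv.py is in scope, that a board is registered with the RPC tracker at fleet:9190 under the key env.TARGET, and the log file name is illustrative.

    # Hedged usage sketch, distilled from vta/scripts/tune_conv.py in this patch.
    import tvm
    from tvm import autotvm
    import vta

    env = vta.get_env()

    # 'ext_dev' target carrying -device=vta, built by the new tvm.target.vta() helper.
    target = tvm.target.vta(env.TARGET)

    # One conv2d tuning task from the template in tune_conv.py
    # (args are N, CI, H, W, CO, KH, KW, strides, padding, in_dtype, out_dtype).
    task = autotvm.task.create(
        conv2d,                       # template function from tune_conv.py (assumed in scope)
        args=(1, 64, 56, 56, 64, 3, 3, (1, 1), (1, 1), 'int8', 'int32'),
        target=target, target_host=env.target_host, template_key='direct')

    # Build locally with the VTA-aware build function, measure remotely via the tracker.
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func=vta.vta_autotvm_build_func),
        runner=autotvm.RPCRunner(env.TARGET, 'fleet', 9190,
                                 number=4, repeat=3, timeout=30,
                                 check_correctness=True))

    # Random search over the full config space, logging every measurement.
    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=len(task.config_space),
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('conv2d.log')])  # illustrative log name

The whole-network path follows the same pattern, except that tasks come from autotvm.task.extract_from_graph(..., symbols=(nnvm.sym.conv2d,)) after the NNVM graph has been cleaned and packed with vta.graph.pack, as tune_resnet.py does, and the best records are then loaded through autotvm.tophub.context before compiling the graph.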