From 41f1a52d1d6739dde194708c84a85a658f5bb4db Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Thu, 11 Aug 2022 16:49:58 +0300 Subject: [PATCH 1/3] [Profiler] Fix graph_executor_debug hang For some operations, such as `__nop` or `__copy`, the measured inference time is equal to 0. In this case we end up in an infinite loop and never exit from it. Added a new parameter, `max_repeat_num`, which specifies the maximum number of repeats when the inference time is equal to 0. When this value is exceeded, we exit the loop. --- include/tvm/runtime/profiling.h | 4 +- python/tvm/contrib/debugger/debug_executor.py | 24 +++++++-- python/tvm/contrib/graph_executor.py | 7 +++ python/tvm/runtime/module.py | 6 +++ python/tvm/runtime/vm.py | 7 +++ src/runtime/crt/common/crt_runtime_api.c | 13 +++-- .../debug/graph_executor_debug.cc | 52 +++++++++++-------- src/runtime/profiling.cc | 10 ++-- src/runtime/rpc/rpc_module.cc | 34 ++++++------ web/emcc/tvmjs_support.cc | 17 +++--- web/src/runtime.ts | 8 +++ 11 files changed, 123 insertions(+), 59 deletions(-) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 5f6f36e3b279..4334858bfb67 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -573,6 +573,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i * minimum duration requirement of one `repeat`. * i.e., When the run time of one `repeat` falls below this time, * the `number` parameter will be automatically increased. + * \param max_repeat_ms The maximum number of repeats when measured time is equal to 0. + * It helps to avoid hanging during measurements. * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats * defined by `repeats_to_cooldown`. * \param repeats_to_cooldown The number of repeats before the * cooldown is activated. * \return f_timer A timer function. */ PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown, + int max_repeat_num, int cooldown_interval_ms, int repeats_to_cooldown, PackedFunc f_preproc = nullptr); } // namespace profiling diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py index 5ce378965246..712ced1f2f3c 100644 --- a/python/tvm/contrib/debugger/debug_executor.py +++ b/python/tvm/contrib/debugger/debug_executor.py @@ -223,7 +223,7 @@ def _run_per_layer(self): output_tensors.append(self._get_node_output(i, j)) self.debug_datum.update_output_tensors(output_tensors) - def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown): + def _run_debug(self, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown): """Execute the node specified with index will be executed. Each debug output will be copied to the buffer Time consumed for each execution will be set as debug output. 
@@ -233,6 +233,7 @@ def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeat number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, + max_repeat_num=max_repeat_num, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, ) @@ -272,6 +273,7 @@ def run( number=10, repeat=1, min_repeat_ms=1, + max_repeat_num=100, cooldown_interval_ms=0, repeats_to_cooldown=1, **input_dict, @@ -299,6 +301,10 @@ def run( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + max_repeat_num: int, optional + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + cooldown_interval_ms: int, optional The cooldown interval in milliseconds between the number of repeats defined by `repeats_to_cooldown`. @@ -317,6 +323,7 @@ def run( number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, + max_repeat_num=max_repeat_num, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, ) @@ -328,7 +335,7 @@ def run( self.debug_datum.display_debug_result() def run_individual( - self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1 + self, number, repeat=1, min_repeat_ms=0, max_repeat_num=100, cooldown_interval_ms=0, repeats_to_cooldown=1 ): """Run each operation in the graph and get the time per op for all ops. @@ -351,6 +358,10 @@ def run_individual( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + max_repeat_num: int, optional + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + cooldown_interval_ms: int, optional The cooldown interval in milliseconds between the number of repeats defined by `repeats_to_cooldown`. @@ -364,7 +375,7 @@ def run_individual( the repeat of the measurement. """ res = self._run_individual( - number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown + number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown ) results = [] offset = 0 @@ -384,6 +395,7 @@ def run_individual_node( number=10, repeat=1, min_repeat_ms=0, + max_repeat_num=100, cooldown_interval_ms=0, repeats_to_cooldown=1, ): @@ -415,6 +427,10 @@ def run_individual_node( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + max_repeat_num: int, optional + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + cooldown_interval_ms: int, optional The cooldown interval in milliseconds between the number of repeats defined by `repeats_to_cooldown`. 
@@ -428,7 +444,7 @@ def run_individual_node( """ # Results are returned as serialized strings which we deserialize res = self._run_individual_node( - index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown + index, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown ) fmt = "@" + ("d" * repeat) results = struct.unpack(fmt, res) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index a4b90baf1d38..3cc412954a47 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -355,6 +355,7 @@ def benchmark( repeat=5, number=5, min_repeat_ms=None, + max_repeat_num=100, end_to_end=False, cooldown_interval_ms=0, repeats_to_cooldown=1, @@ -402,6 +403,10 @@ def benchmark( milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. + max_repeat_num : Optional[int] + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + end_to_end : bool If set, include time to transfer input tensors to the device and time to transfer returned tensors in the total runtime. This will give accurate timings for end to end @@ -437,6 +442,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, + max_repeat_num=max_repeat_num, )(device.device_type % rpc_base.RPC_SESS_MASK, device.device_id, *args) if kwargs: self.set_input(**kwargs) @@ -446,6 +452,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, + max_repeat_num=max_repeat_num, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, )() diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index e2af556413b4..d67a7da59dbd 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -277,6 +277,7 @@ def time_evaluator( number=10, repeat=1, min_repeat_ms=0, + max_repeat_num=100, cooldown_interval_ms=0, repeats_to_cooldown=1, f_preproc="", @@ -310,6 +311,10 @@ def time_evaluator( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + max_repeat_num: int, optional + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + cooldown_interval_ms: int, optional The cooldown interval in milliseconds between the number of repeats defined by `repeats_to_cooldown`. @@ -340,6 +345,7 @@ def time_evaluator( number, repeat, min_repeat_ms, + max_repeat_num, cooldown_interval_ms, repeats_to_cooldown, f_preproc, diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index 83f1656a0dd8..efec3ba83be9 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -583,6 +583,7 @@ def benchmark( repeat=5, number=5, min_repeat_ms=None, + max_repeat_num=100, end_to_end=False, cooldown_interval_ms=0, repeats_to_cooldown=1, @@ -630,6 +631,10 @@ def benchmark( milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. + max_repeat_num : Optional[int] + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + end_to_end : bool If set, include time to transfer input tensors to the device and time to transfer returned tensors in the total runtime. 
This will give accurate timings for end to end @@ -672,6 +677,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, + max_repeat_num=max_repeat_num, )(func_name, device.device_type % RPC_SESS_MASK, device.device_id, *packed_args) if args or kwargs: self.set_input(func_name, *args, **kwargs) @@ -681,6 +687,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, + max_repeat_num=max_repeat_num, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, )(func_name) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 23ab5af08a7f..baae12bcfa29 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -477,6 +477,7 @@ typedef struct { int number; int repeat; int min_repeat_ms; + int max_repeat_num; int cooldown_interval_ms; int repeats_to_cooldown; } time_evaluator_state_t; @@ -487,14 +488,14 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re int* ret_type_code) { ret_val[0].v_handle = NULL; ret_type_code[0] = kTVMNullptr; - if (num_args < 10) { + if (num_args < 11) { TVMAPIErrorf("not enough args"); return kTvmErrorFunctionCallNumArguments; } if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr || type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt || type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt || - type_codes[8] != kTVMArgInt || type_codes[9] != kTVMStr) { + type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) { TVMAPIErrorf("one or more invalid arg types"); return kTvmErrorFunctionCallWrongArgType; } @@ -506,8 +507,9 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re g_time_evaluator_state.number = args[4].v_int64; g_time_evaluator_state.repeat = args[5].v_int64; g_time_evaluator_state.min_repeat_ms = args[6].v_int64; - g_time_evaluator_state.cooldown_interval_ms = args[7].v_int64; - g_time_evaluator_state.repeats_to_cooldown = args[8].v_int64; + g_time_evaluator_state.min_repeat_num = args[7].v_int64; + g_time_evaluator_state.cooldown_interval_ms = args[8].v_int64; + g_time_evaluator_state.repeats_to_cooldown = args[9].v_int64; int ret_code = TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time); @@ -556,6 +558,7 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* double* iter = (double*)result_byte_arr->data; for (int i = 0; i < g_time_evaluator_state.repeat; i++) { double curr_res_seconds = 0.0; + int absolute_zero_times = 0; // do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0). 
do { if (curr_res_seconds > 0.0) { @@ -588,6 +591,8 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* if (err != kTvmErrorNoError) { goto release_and_return; } + if (std::fpclassify(curr_res_seconds) == FP_ZERO) absolute_zero_times++; + if (absolute_zero_times >= max_repeat_num) break; } while (curr_res_seconds < min_repeat_seconds); double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number; *iter = mean_exec_seconds; diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index cf98141037b7..8ff21c585481 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -56,6 +56,8 @@ class GraphExecutorDebug : public GraphExecutor { * By default, one `repeat` contains `number` runs. If this parameter is set, * the parameters `number` will be dynamically adjusted to meet the * minimum duration requirement of one `repeat`. + * \param max_repeat_ms The maximum number of repeats when measured time is equal to 0. + * It helps to avoid hanging during measurements. * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats * defined by `repeats_to_cooldown`. * \param repeats_to_cooldown The number of repeats before the @@ -64,8 +66,8 @@ class GraphExecutorDebug : public GraphExecutor { * representing the number of layers. Next the encoded real numbers are float32_t in the number of * repeat multiplied by the number of layers. */ - std::string RunIndividual(int number, int repeat, int min_repeat_ms, int cooldown_interval_ms, - int repeats_to_cooldown) { + std::string RunIndividual(int number, int repeat, int min_repeat_ms, int max_repeat_num, + int cooldown_interval_ms, int repeats_to_cooldown) { // warmup run GraphExecutor::Run(); std::string tkey = module_->type_key(); @@ -73,14 +75,15 @@ class GraphExecutorDebug : public GraphExecutor { if (tkey == "rpc") { // RPC modules rely on remote timing which implements the logic from the else branch. for (size_t index = 0; index < op_execs_.size(); ++index) { - time_sec_per_op[index] = RunOpRPC(index, number, repeat, min_repeat_ms, + time_sec_per_op[index] = RunOpRPC(index, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown); } } else { int op = 0; for (size_t index = 0; index < op_execs_.size(); ++index) { - std::string result_str = RunIndividualNode(index, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown); + std::string result_str = + RunIndividualNode(index, number, repeat, min_repeat_ms, max_repeat_num, + cooldown_interval_ms, repeats_to_cooldown); const double* blob_ptr = reinterpret_cast(result_str.data()); for (int i = 0; i < repeat; ++i, ++blob_ptr) { time_sec_per_op[index].push_back(*blob_ptr); @@ -110,14 +113,15 @@ class GraphExecutorDebug : public GraphExecutor { } std::string RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown) { + int max_repeat_num, int cooldown_interval_ms, + int repeats_to_cooldown) { std::string tkey = module_->type_key(); if (tkey == "rpc") { LOG(FATAL) << "RPC measurements should not use RunIndividualNode!"; } - if (!op_execs_[node_index] || nodes_[node_index].param.func_name == "__nop") { + if (!op_execs_[node_index]) { // don't return anything... 
std::ostringstream os; double zero = 0; @@ -131,12 +135,13 @@ class GraphExecutorDebug : public GraphExecutor { Device& d = devices_[0]; PackedFunc time_evaluator = profiling::WrapTimeEvaluator( TypedPackedFunc([this, node_index]() { this->RunOpHost(node_index); }), d, number, - repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown); + repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown); return time_evaluator(); } std::vector RunOpRPC(int index, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown) { + int max_repeat_num, int cooldown_interval_ms, + int repeats_to_cooldown) { std::vector results(repeat, 0); // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes // which represent inputs/parameters to the graph. Other types may be supported in the @@ -152,11 +157,6 @@ class GraphExecutorDebug : public GraphExecutor { return results; } - if (nodes_[index].param.func_name == "__nop") { - LOG_INFO << "Skipping __nop function"; - return results; - } - const Device& dev = data_entry_[entry_id(index, 0)]->device; TVMOpParam param = nodes_[index].param; std::string name = param.func_name; @@ -167,7 +167,8 @@ class GraphExecutorDebug : public GraphExecutor { runtime::Registry::Get("runtime.RPCTimeEvaluator") -> operator()(module_, name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, ""); + repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, + repeats_to_cooldown, ""); int num_flat_args = num_inputs + num_outputs; std::unique_ptr values(new TVMValue[num_flat_args]); @@ -390,15 +391,17 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, int number = args[0]; int repeat = args[1]; int min_repeat_ms = args[2]; - int cooldown_interval_ms = args[3]; - int repeats_to_cooldown = args[4]; + int max_repeat_num = args[3]; + int cooldown_interval_ms = args[4]; + int repeats_to_cooldown = args[5]; ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); + ICHECK_GE(max_repeat_num, 0); ICHECK_GE(cooldown_interval_ms, 0); ICHECK_GT(repeats_to_cooldown, 0); - std::string blob = this->RunIndividual(number, repeat, min_repeat_ms, cooldown_interval_ms, - repeats_to_cooldown); + std::string blob = this->RunIndividual(number, repeat, min_repeat_ms, max_repeat_num, + cooldown_interval_ms, repeats_to_cooldown); TVMByteArray arr; arr.size = blob.length(); arr.data = blob.data(); @@ -410,17 +413,20 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, int number = args[1]; int repeat = args[2]; int min_repeat_ms = args[3]; - int cooldown_interval_ms = args[4]; - int repeats_to_cooldown = args[5]; + int max_repeat_num = args[4]; + int cooldown_interval_ms = args[5]; + int repeats_to_cooldown = args[6]; ICHECK_GE(node_index, 0); ICHECK_LT(node_index, nodes_.size()); ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); + ICHECK_GE(max_repeat_num, 0); ICHECK_GE(cooldown_interval_ms, 0); ICHECK_GT(repeats_to_cooldown, 0); - std::string blob = this->RunIndividualNode(node_index, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown); + std::string blob = + this->RunIndividualNode(node_index, number, repeat, min_repeat_ms, max_repeat_num, + cooldown_interval_ms, repeats_to_cooldown); TVMByteArray arr; arr.size = blob.length(); arr.data = blob.data(); diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 
187a98964af2..fb345a721053 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -848,7 +848,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") }); PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown, + int max_repeat_num, int cooldown_interval_ms, int repeats_to_cooldown, PackedFunc f_preproc) { ICHECK(pf != nullptr); @@ -858,8 +858,8 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, return (*get_micro_time_evaluator)(pf, dev, number, repeat); } - auto ftimer = [pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, - f_preproc](TVMArgs args, TVMRetValue* rv) mutable { + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, + repeats_to_cooldown, f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. @@ -872,7 +872,7 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, f_preproc.CallPacked(args, &temp); } double duration_ms = 0.0; - + int absolute_zero_times = 0; do { if (duration_ms > 0.0) { const double golden_ratio = 1.618; @@ -887,6 +887,8 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, } t->Stop(); int64_t t_nanos = t->SyncAndGetElapsedNanos(); + if (t_nanos == 0) absolute_zero_times++; + if (absolute_zero_times >= max_repeat_num) break; duration_ms = t_nanos / 1e6; } while (duration_ms < min_repeat_ms); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index ff5889500592..28c7e3654ddc 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -191,8 +191,8 @@ class RPCModuleNode final : public ModuleNode { } PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat, - int min_repeat_ms, int cooldown_interval_ms, int repeats_to_cooldown, - const std::string& f_preproc_name) { + int min_repeat_ms, int max_repeat_num, int cooldown_interval_ms, + int repeats_to_cooldown, const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass dev by parts. 
ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index()) @@ -200,13 +200,15 @@ class RPCModuleNode final : public ModuleNode { dev = RemoveRPCSessionMask(dev); if (module_handle_ != nullptr) { - return remote_get_time_evaluator_( - GetRef(this), name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); + return remote_get_time_evaluator_(GetRef(this), name, + static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, + repeats_to_cooldown, f_preproc_name); } else { - return remote_get_time_evaluator_( - Optional(nullptr), name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); + return remote_get_time_evaluator_(Optional(nullptr), name, + static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, + repeats_to_cooldown, f_preproc_name); } } @@ -244,7 +246,7 @@ class RPCModuleNode final : public ModuleNode { // The local channel std::shared_ptr sess_; // remote function to get time evaluator - TypedPackedFunc, std::string, int, int, int, int, int, int, int, + TypedPackedFunc, std::string, int, int, int, int, int, int, int, int, std::string)> remote_get_time_evaluator_; // remote function getter for modules. @@ -363,8 +365,9 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, - int number, int repeat, int min_repeat_ms, int cooldown_interval_ms, - int repeats_to_cooldown, std::string f_preproc_name) { + int number, int repeat, int min_repeat_ms, int max_repeat_num, + int cooldown_interval_ms, int repeats_to_cooldown, + std::string f_preproc_name) { Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; @@ -373,8 +376,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") std::string tkey = m->type_key(); if (tkey == "rpc") { return static_cast(m.operator->()) - ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, - repeats_to_cooldown, f_preproc_name); + ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, max_repeat_num, + cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); } else { PackedFunc f_preproc; if (!f_preproc_name.empty()) { @@ -386,7 +389,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") PackedFunc pf = m.GetFunction(name, true); CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry"; return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown, f_preproc); + max_repeat_num, cooldown_interval_ms, + repeats_to_cooldown, f_preproc); } } else { auto* pf = runtime::Registry::Get(name); @@ -398,7 +402,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") << "Cannot find " << f_preproc_name << " in the global function"; f_preproc = *pf_preproc; } - return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, + return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown, f_preproc); } }); diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 56f586d67046..05e49af5bba7 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -169,7 +169,7 @@ class AsyncLocalSession : public 
LocalSession { try { TVMArgs args(arg_values, arg_type_codes, num_args); PackedFunc retfunc = this->GetTimeEvaluator(args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8]); + args[5], args[6], args[7], args[8], args[9]); TVMRetValue rv; rv = retfunc; this->EncodeReturn(std::move(rv), [&](TVMArgs encoded_args) { @@ -252,7 +252,8 @@ class AsyncLocalSession : public LocalSession { // time evaluator PackedFunc GetTimeEvaluator(Optional opt_mod, std::string name, int device_type, int device_id, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown) { + int max_repeat_num, int cooldown_interval_ms, + int repeats_to_cooldown) { Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; @@ -261,20 +262,20 @@ class AsyncLocalSession : public LocalSession { Module m = opt_mod.value(); std::string tkey = m->type_key(); return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown); + max_repeat_num, cooldown_interval_ms, repeats_to_cooldown); } else { auto* pf = runtime::Registry::Get(name); CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; - return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, - repeats_to_cooldown); + return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, max_repeat_num, + cooldown_interval_ms, repeats_to_cooldown); } } // time evaluator PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, - int min_repeat_ms, int cooldown_interval_ms, + int min_repeat_ms, int max_repeat_num, int cooldown_interval_ms, int repeats_to_cooldown) { - auto ftimer = [pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown](TVMArgs args, TVMRetValue* rv) { // the function is a async function. 
PackedFunc on_complete = args[args.size() - 1]; @@ -293,7 +294,7 @@ class AsyncLocalSession : public LocalSession { auto* time_exec = runtime::Registry::Get("__async.wasm.TimeExecution"); CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function"; (*time_exec)(TypedPackedFunc(finvoke), dev, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown, on_complete); + max_repeat_num, cooldown_interval_ms, repeats_to_cooldown, on_complete); }; return PackedFunc(ftimer); } diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 8df26aff14c3..9f1b85cb8d8a 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -1058,6 +1058,7 @@ export class Instance implements Disposable { nstep: number, repeat: number, minRepeatMs: number, + maxRepeatNum: number, cooldownIntervalMs: number, repeatsToCooldown: number ): Promise => { @@ -1068,6 +1069,7 @@ export class Instance implements Disposable { for (let i = 0; i < repeat; ++i) { let durationMs = 0.0; + let absoluteZeroTimes = 0; do { if (durationMs > 0.0) { let golden_ratio = 1.618; @@ -1081,6 +1083,12 @@ export class Instance implements Disposable { const tend: number = perf.now(); durationMs = tend - tstart; + if (durationMS == 0) { + absoluteZeroTimes++; + } + if (absoluteZeroTimes >= maxRepeatNum) { + break; + } } while (durationMs < minRepeatMs); const speed = durationMs / setupNumber / 1000; result.push(speed); From 8067b1c65a44dea2d05a56c4dbb946200acb6a0c Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Thu, 11 Aug 2022 17:16:01 +0300 Subject: [PATCH 2/3] Fix lint --- python/tvm/contrib/debugger/debug_executor.py | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py index 712ced1f2f3c..9cfe6c30f24d 100644 --- a/python/tvm/contrib/debugger/debug_executor.py +++ b/python/tvm/contrib/debugger/debug_executor.py @@ -223,7 +223,15 @@ def _run_per_layer(self): output_tensors.append(self._get_node_output(i, j)) self.debug_datum.update_output_tensors(output_tensors) - def _run_debug(self, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown): + def _run_debug( + self, + number, + repeat, + min_repeat_ms, + max_repeat_num, + cooldown_interval_ms, + repeats_to_cooldown, + ): """Execute the node specified with index will be executed. Each debug output will be copied to the buffer Time consumed for each execution will be set as debug output. @@ -335,7 +343,13 @@ def run( self.debug_datum.display_debug_result() def run_individual( - self, number, repeat=1, min_repeat_ms=0, max_repeat_num=100, cooldown_interval_ms=0, repeats_to_cooldown=1 + self, + number, + repeat=1, + min_repeat_ms=0, + max_repeat_num=100, + cooldown_interval_ms=0, + repeats_to_cooldown=1, ): """Run each operation in the graph and get the time per op for all ops. 
@@ -444,7 +458,13 @@ def run_individual_node( """ # Results are returned as serialized strings which we deserialize res = self._run_individual_node( - index, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown + index, + number, + repeat, + min_repeat_ms, + max_repeat_num, + cooldown_interval_ms, + repeats_to_cooldown, ) fmt = "@" + ("d" * repeat) results = struct.unpack(fmt, res) From 140d554d9dc4f1414b898b44b16401468aaa5b5b Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Fri, 12 Aug 2022 10:07:46 +0300 Subject: [PATCH 3/3] Apply comments --- include/tvm/runtime/profiling.h | 8 ++-- python/tvm/contrib/debugger/debug_executor.py | 27 ++++++----- python/tvm/contrib/graph_executor.py | 8 ++-- python/tvm/runtime/module.py | 6 +-- python/tvm/runtime/vm.py | 8 ++-- src/runtime/crt/common/crt_runtime_api.c | 11 ++--- .../debug/graph_executor_debug.cc | 45 ++++++++++--------- src/runtime/profiling.cc | 12 ++--- src/runtime/rpc/rpc_module.cc | 27 ++++++----- web/emcc/tvmjs_support.cc | 21 +++++---- web/src/runtime.ts | 9 ++-- 11 files changed, 98 insertions(+), 84 deletions(-) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 4334858bfb67..3922ef76dfb7 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -573,8 +573,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i * minimum duration requirement of one `repeat`. * i.e., When the run time of one `repeat` falls below this time, * the `number` parameter will be automatically increased. - * \param max_repeat_ms The maximum number of repeats when measured time is equal to 0. - * It helps to avoid hanging during measurements. + * \param limit_zero_time_iterations The maximum number of repeats when + * measured time is equal to 0. It helps to avoid hanging during measurements. * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats * defined by `repeats_to_cooldown`. * \param repeats_to_cooldown The number of repeats before the @@ -584,8 +584,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i * \return f_timer A timer function. 
*/ PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms, - int max_repeat_num, int cooldown_interval_ms, int repeats_to_cooldown, - PackedFunc f_preproc = nullptr); + int limit_zero_time_iterations, int cooldown_interval_ms, + int repeats_to_cooldown, PackedFunc f_preproc = nullptr); } // namespace profiling } // namespace runtime diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py index 9cfe6c30f24d..8160fa96b8ee 100644 --- a/python/tvm/contrib/debugger/debug_executor.py +++ b/python/tvm/contrib/debugger/debug_executor.py @@ -228,7 +228,7 @@ def _run_debug( number, repeat, min_repeat_ms, - max_repeat_num, + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, ): @@ -241,7 +241,7 @@ def _run_debug( number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, - max_repeat_num=max_repeat_num, + limit_zero_time_iterations=limit_zero_time_iterations, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, ) @@ -281,7 +281,7 @@ def run( number=10, repeat=1, min_repeat_ms=1, - max_repeat_num=100, + limit_zero_time_iterations=100, cooldown_interval_ms=0, repeats_to_cooldown=1, **input_dict, @@ -309,7 +309,7 @@ def run( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. - max_repeat_num: int, optional + limit_zero_time_iterations: int, optional The maximum number of repeats when measured time is equal to 0. It helps to avoid hanging during measurements. @@ -331,7 +331,7 @@ def run( number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, - max_repeat_num=max_repeat_num, + limit_zero_time_iterations=limit_zero_time_iterations, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, ) @@ -347,7 +347,7 @@ def run_individual( number, repeat=1, min_repeat_ms=0, - max_repeat_num=100, + limit_zero_time_iterations=100, cooldown_interval_ms=0, repeats_to_cooldown=1, ): @@ -372,7 +372,7 @@ def run_individual( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. - max_repeat_num: int, optional + limit_zero_time_iterations: int, optional The maximum number of repeats when measured time is equal to 0. It helps to avoid hanging during measurements. @@ -389,7 +389,12 @@ def run_individual( the repeat of the measurement. """ res = self._run_individual( - number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown + number, + repeat, + min_repeat_ms, + limit_zero_time_iterations, + cooldown_interval_ms, + repeats_to_cooldown, ) results = [] offset = 0 @@ -409,7 +414,7 @@ def run_individual_node( number=10, repeat=1, min_repeat_ms=0, - max_repeat_num=100, + limit_zero_time_iterations=100, cooldown_interval_ms=0, repeats_to_cooldown=1, ): @@ -441,7 +446,7 @@ def run_individual_node( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. - max_repeat_num: int, optional + limit_zero_time_iterations: int, optional The maximum number of repeats when measured time is equal to 0. It helps to avoid hanging during measurements. 
@@ -462,7 +467,7 @@ def run_individual_node( number, repeat, min_repeat_ms, - max_repeat_num, + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, ) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index 3cc412954a47..08dae307a89e 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -355,7 +355,7 @@ def benchmark( repeat=5, number=5, min_repeat_ms=None, - max_repeat_num=100, + limit_zero_time_iterations=100, end_to_end=False, cooldown_interval_ms=0, repeats_to_cooldown=1, @@ -403,7 +403,7 @@ def benchmark( milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. - max_repeat_num : Optional[int] + limit_zero_time_iterations : Optional[int] The maximum number of repeats when measured time is equal to 0. It helps to avoid hanging during measurements. @@ -442,7 +442,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, - max_repeat_num=max_repeat_num, + limit_zero_time_iterations=limit_zero_time_iterations, )(device.device_type % rpc_base.RPC_SESS_MASK, device.device_id, *args) if kwargs: self.set_input(**kwargs) @@ -452,7 +452,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, - max_repeat_num=max_repeat_num, + limit_zero_time_iterations=limit_zero_time_iterations, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, )() diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index d67a7da59dbd..e85b99234100 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -277,7 +277,7 @@ def time_evaluator( number=10, repeat=1, min_repeat_ms=0, - max_repeat_num=100, + limit_zero_time_iterations=100, cooldown_interval_ms=0, repeats_to_cooldown=1, f_preproc="", @@ -311,7 +311,7 @@ def time_evaluator( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. - max_repeat_num: int, optional + limit_zero_time_iterations: int, optional The maximum number of repeats when measured time is equal to 0. It helps to avoid hanging during measurements. @@ -345,7 +345,7 @@ def time_evaluator( number, repeat, min_repeat_ms, - max_repeat_num, + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, f_preproc, diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index efec3ba83be9..c065d77a7c9f 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -583,7 +583,7 @@ def benchmark( repeat=5, number=5, min_repeat_ms=None, - max_repeat_num=100, + limit_zero_time_iterations=100, end_to_end=False, cooldown_interval_ms=0, repeats_to_cooldown=1, @@ -631,7 +631,7 @@ def benchmark( milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. - max_repeat_num : Optional[int] + limit_zero_time_iterations : Optional[int] The maximum number of repeats when measured time is equal to 0. It helps to avoid hanging during measurements. 
@@ -677,7 +677,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, - max_repeat_num=max_repeat_num, + limit_zero_time_iterations=limit_zero_time_iterations, )(func_name, device.device_type % RPC_SESS_MASK, device.device_id, *packed_args) if args or kwargs: self.set_input(func_name, *args, **kwargs) @@ -687,7 +687,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, - max_repeat_num=max_repeat_num, + limit_zero_time_iterations=limit_zero_time_iterations, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, )(func_name) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index baae12bcfa29..2151c23f8462 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -477,7 +478,7 @@ typedef struct { int number; int repeat; int min_repeat_ms; - int max_repeat_num; + int limit_zero_time_iterations; int cooldown_interval_ms; int repeats_to_cooldown; } time_evaluator_state_t; @@ -507,7 +508,7 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re g_time_evaluator_state.number = args[4].v_int64; g_time_evaluator_state.repeat = args[5].v_int64; g_time_evaluator_state.min_repeat_ms = args[6].v_int64; - g_time_evaluator_state.min_repeat_num = args[7].v_int64; + g_time_evaluator_state.limit_zero_time_iterations = args[7].v_int64; g_time_evaluator_state.cooldown_interval_ms = args[8].v_int64; g_time_evaluator_state.repeats_to_cooldown = args[9].v_int64; @@ -591,9 +592,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* if (err != kTvmErrorNoError) { goto release_and_return; } - if (std::fpclassify(curr_res_seconds) == FP_ZERO) absolute_zero_times++; - if (absolute_zero_times >= max_repeat_num) break; - } while (curr_res_seconds < min_repeat_seconds); + if (fpclassify(curr_res_seconds) == FP_ZERO) absolute_zero_times++; + } while (curr_res_seconds < min_repeat_seconds && + absolute_zero_times < g_time_evaluator_state.limit_zero_time_iterations); double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number; *iter = mean_exec_seconds; iter++; diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 8ff21c585481..ba546165c6a0 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -56,8 +56,9 @@ class GraphExecutorDebug : public GraphExecutor { * By default, one `repeat` contains `number` runs. If this parameter is set, * the parameters `number` will be dynamically adjusted to meet the * minimum duration requirement of one `repeat`. - * \param max_repeat_ms The maximum number of repeats when measured time is equal to 0. - * It helps to avoid hanging during measurements. + * \param limit_zero_time_iterations The maximum number of repeats when + * measured time is equal to 0. It helps to avoid hanging during + * measurements. * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats * defined by `repeats_to_cooldown`. * \param repeats_to_cooldown The number of repeats before the @@ -66,8 +67,9 @@ class GraphExecutorDebug : public GraphExecutor { * representing the number of layers. Next the encoded real numbers are float32_t in the number of * repeat multiplied by the number of layers. 
*/ - std::string RunIndividual(int number, int repeat, int min_repeat_ms, int max_repeat_num, - int cooldown_interval_ms, int repeats_to_cooldown) { + std::string RunIndividual(int number, int repeat, int min_repeat_ms, + int limit_zero_time_iterations, int cooldown_interval_ms, + int repeats_to_cooldown) { // warmup run GraphExecutor::Run(); std::string tkey = module_->type_key(); @@ -75,14 +77,15 @@ class GraphExecutorDebug : public GraphExecutor { if (tkey == "rpc") { // RPC modules rely on remote timing which implements the logic from the else branch. for (size_t index = 0; index < op_execs_.size(); ++index) { - time_sec_per_op[index] = RunOpRPC(index, number, repeat, min_repeat_ms, max_repeat_num, - cooldown_interval_ms, repeats_to_cooldown); + time_sec_per_op[index] = + RunOpRPC(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown); } } else { int op = 0; for (size_t index = 0; index < op_execs_.size(); ++index) { std::string result_str = - RunIndividualNode(index, number, repeat, min_repeat_ms, max_repeat_num, + RunIndividualNode(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown); const double* blob_ptr = reinterpret_cast(result_str.data()); for (int i = 0; i < repeat; ++i, ++blob_ptr) { @@ -113,7 +116,7 @@ class GraphExecutorDebug : public GraphExecutor { } std::string RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms, - int max_repeat_num, int cooldown_interval_ms, + int limit_zero_time_iterations, int cooldown_interval_ms, int repeats_to_cooldown) { std::string tkey = module_->type_key(); @@ -135,12 +138,13 @@ class GraphExecutorDebug : public GraphExecutor { Device& d = devices_[0]; PackedFunc time_evaluator = profiling::WrapTimeEvaluator( TypedPackedFunc([this, node_index]() { this->RunOpHost(node_index); }), d, number, - repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, repeats_to_cooldown); + repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown); return time_evaluator(); } std::vector RunOpRPC(int index, int number, int repeat, int min_repeat_ms, - int max_repeat_num, int cooldown_interval_ms, + int limit_zero_time_iterations, int cooldown_interval_ms, int repeats_to_cooldown) { std::vector results(repeat, 0); // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes @@ -167,7 +171,7 @@ class GraphExecutorDebug : public GraphExecutor { runtime::Registry::Get("runtime.RPCTimeEvaluator") -> operator()(module_, name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, + repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, ""); int num_flat_args = num_inputs + num_outputs; @@ -391,17 +395,18 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, int number = args[0]; int repeat = args[1]; int min_repeat_ms = args[2]; - int max_repeat_num = args[3]; + int limit_zero_time_iterations = args[3]; int cooldown_interval_ms = args[4]; int repeats_to_cooldown = args[5]; ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); - ICHECK_GE(max_repeat_num, 0); + ICHECK_GE(limit_zero_time_iterations, 0); ICHECK_GE(cooldown_interval_ms, 0); ICHECK_GT(repeats_to_cooldown, 0); - std::string blob = this->RunIndividual(number, repeat, min_repeat_ms, max_repeat_num, - cooldown_interval_ms, repeats_to_cooldown); + std::string blob = + 
this->RunIndividual(number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown); TVMByteArray arr; arr.size = blob.length(); arr.data = blob.data(); @@ -413,7 +418,7 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, int number = args[1]; int repeat = args[2]; int min_repeat_ms = args[3]; - int max_repeat_num = args[4]; + int limit_zero_time_iterations = args[4]; int cooldown_interval_ms = args[5]; int repeats_to_cooldown = args[6]; ICHECK_GE(node_index, 0); @@ -421,12 +426,12 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); - ICHECK_GE(max_repeat_num, 0); + ICHECK_GE(limit_zero_time_iterations, 0); ICHECK_GE(cooldown_interval_ms, 0); ICHECK_GT(repeats_to_cooldown, 0); - std::string blob = - this->RunIndividualNode(node_index, number, repeat, min_repeat_ms, max_repeat_num, - cooldown_interval_ms, repeats_to_cooldown); + std::string blob = this->RunIndividualNode(node_index, number, repeat, min_repeat_ms, + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown); TVMByteArray arr; arr.size = blob.length(); arr.data = blob.data(); diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index fb345a721053..2c92633c34fc 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -848,8 +848,8 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") }); PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms, - int max_repeat_num, int cooldown_interval_ms, int repeats_to_cooldown, - PackedFunc f_preproc) { + int limit_zero_time_iterations, int cooldown_interval_ms, + int repeats_to_cooldown, PackedFunc f_preproc) { ICHECK(pf != nullptr); if (static_cast(dev.device_type) == static_cast(kDLMicroDev)) { @@ -858,8 +858,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, return (*get_micro_time_evaluator)(pf, dev, number, repeat); } - auto ftimer = [pf, dev, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, - repeats_to_cooldown, f_preproc](TVMArgs args, TVMRetValue* rv) mutable { + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown, + f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. 
@@ -888,9 +889,8 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, t->Stop(); int64_t t_nanos = t->SyncAndGetElapsedNanos(); if (t_nanos == 0) absolute_zero_times++; - if (absolute_zero_times >= max_repeat_num) break; duration_ms = t_nanos / 1e6; - } while (duration_ms < min_repeat_ms); + } while (duration_ms < min_repeat_ms && absolute_zero_times < limit_zero_time_iterations); double speed = duration_ms / 1e3 / number; os.write(reinterpret_cast(&speed), sizeof(speed)); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 28c7e3654ddc..a3f41e063226 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -191,8 +191,9 @@ class RPCModuleNode final : public ModuleNode { } PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat, - int min_repeat_ms, int max_repeat_num, int cooldown_interval_ms, - int repeats_to_cooldown, const std::string& f_preproc_name) { + int min_repeat_ms, int limit_zero_time_iterations, + int cooldown_interval_ms, int repeats_to_cooldown, + const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass dev by parts. ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index()) @@ -202,13 +203,13 @@ class RPCModuleNode final : public ModuleNode { if (module_handle_ != nullptr) { return remote_get_time_evaluator_(GetRef(this), name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, - repeats_to_cooldown, f_preproc_name); + repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); } else { return remote_get_time_evaluator_(Optional(nullptr), name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, - repeats_to_cooldown, f_preproc_name); + repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); } } @@ -365,7 +366,7 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, - int number, int repeat, int min_repeat_ms, int max_repeat_num, + int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations, int cooldown_interval_ms, int repeats_to_cooldown, std::string f_preproc_name) { Device dev; @@ -376,8 +377,9 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") std::string tkey = m->type_key(); if (tkey == "rpc") { return static_cast(m.operator->()) - ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, max_repeat_num, - cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); + ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown, f_preproc_name); } else { PackedFunc f_preproc; if (!f_preproc_name.empty()) { @@ -389,7 +391,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") PackedFunc pf = m.GetFunction(name, true); CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry"; return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, - max_repeat_num, cooldown_interval_ms, + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, f_preproc); } } else { @@ -402,8 +404,9 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") << "Cannot find " << 
f_preproc_name << " in the global function"; f_preproc = *pf_preproc; } - return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, max_repeat_num, - cooldown_interval_ms, repeats_to_cooldown, f_preproc); + return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown, f_preproc); } }); diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 05e49af5bba7..aa9546f3b71a 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -252,7 +252,7 @@ class AsyncLocalSession : public LocalSession { // time evaluator PackedFunc GetTimeEvaluator(Optional opt_mod, std::string name, int device_type, int device_id, int number, int repeat, int min_repeat_ms, - int max_repeat_num, int cooldown_interval_ms, + int limit_zero_time_iterations, int cooldown_interval_ms, int repeats_to_cooldown) { Device dev; dev.device_type = static_cast(device_type); @@ -262,21 +262,23 @@ class AsyncLocalSession : public LocalSession { Module m = opt_mod.value(); std::string tkey = m->type_key(); return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms, - max_repeat_num, cooldown_interval_ms, repeats_to_cooldown); + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown); } else { auto* pf = runtime::Registry::Get(name); CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; - return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, max_repeat_num, - cooldown_interval_ms, repeats_to_cooldown); + return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown); } } // time evaluator PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, - int min_repeat_ms, int max_repeat_num, int cooldown_interval_ms, - int repeats_to_cooldown) { - auto ftimer = [pf, dev, number, repeat, min_repeat_ms, max_repeat_num, cooldown_interval_ms, - repeats_to_cooldown](TVMArgs args, TVMRetValue* rv) { + int min_repeat_ms, int limit_zero_time_iterations, + int cooldown_interval_ms, int repeats_to_cooldown) { + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown](TVMArgs args, TVMRetValue* rv) { // the function is a async function. 
PackedFunc on_complete = args[args.size() - 1]; // keep argument alive in finvoke so that they @@ -294,7 +296,8 @@ class AsyncLocalSession : public LocalSession { auto* time_exec = runtime::Registry::Get("__async.wasm.TimeExecution"); CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function"; (*time_exec)(TypedPackedFunc(finvoke), dev, number, repeat, min_repeat_ms, - max_repeat_num, cooldown_interval_ms, repeats_to_cooldown, on_complete); + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, + on_complete); }; return PackedFunc(ftimer); } diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 9f1b85cb8d8a..8df382dbc837 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -1058,7 +1058,7 @@ export class Instance implements Disposable { nstep: number, repeat: number, minRepeatMs: number, - maxRepeatNum: number, + limitZeroTimeIterations: number, cooldownIntervalMs: number, repeatsToCooldown: number ): Promise => { @@ -1083,13 +1083,10 @@ export class Instance implements Disposable { const tend: number = perf.now(); durationMs = tend - tstart; - if (durationMS == 0) { + if (durationMs == 0) { absoluteZeroTimes++; } - if (absoluteZeroTimes >= maxRepeatNum) { - break; - } - } while (durationMs < minRepeatMs); + } while (durationMs < minRepeatMs && absoluteZeroTimes < limitZeroTimeIterations); const speed = durationMs / setupNumber / 1000; result.push(speed); if (cooldownIntervalMs > 0.0 && (i % repeatsToCooldown) == 0 ) {
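For illustration, the timing-loop shape that this series converges on (in `WrapTimeEvaluator`, the CRT `RunTimeEvaluator`, and the TypeScript runtime) can be sketched in isolation as follows. This is a simplified, self-contained approximation rather than the TVM code: `TimeRepeats` and `run_once` are hypothetical names, `std::chrono` stands in for TVM's `Timer`, and the growth heuristic for `number` is reproduced only in spirit.

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <functional>
#include <vector>

std::vector<double> TimeRepeats(const std::function<void()>& run_once, int number, int repeat,
                                int min_repeat_ms, int limit_zero_time_iterations) {
  std::vector<double> seconds_per_call;
  for (int r = 0; r < repeat; ++r) {
    double duration_ms = 0.0;
    int absolute_zero_times = 0;
    do {
      if (duration_ms > 0.0) {
        // Grow `number` so that one repeat reaches `min_repeat_ms` (golden-ratio heuristic).
        const double golden_ratio = 1.618;
        number = static_cast<int>(
            std::max(min_repeat_ms / (duration_ms / number) + 1, number * golden_ratio));
      }
      auto start = std::chrono::high_resolution_clock::now();
      for (int i = 0; i < number; ++i) run_once();
      auto end = std::chrono::high_resolution_clock::now();
      int64_t t_nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
      // The guard added by this patch series: count measurements that are exactly zero
      // and stop insisting on `min_repeat_ms` once the limit is reached.
      if (t_nanos == 0) absolute_zero_times++;
      duration_ms = t_nanos / 1e6;
    } while (duration_ms < min_repeat_ms && absolute_zero_times < limit_zero_time_iterations);
    seconds_per_call.push_back(duration_ms / 1e3 / number);
  }
  return seconds_per_call;
}

Without the `absolute_zero_times` check, a node such as `__nop` whose measured time rounds to zero keeps the condition `duration_ms < min_repeat_ms` true forever; with the check, the loop gives up after `limit_zero_time_iterations` zero readings and reports a zero mean time instead of hanging.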