Skip to content

Commit

Permalink
[GRAPH EXECUTOR,VM] Add benchmarking function to graph executor and vm (
Browse files Browse the repository at this point in the history
#8807)

* [GRAPH EXECUTOR,VM] Add benchmarking function to graph executor and vm

This new benchmarking function is just a convenience function for
calling time_evaluator on the underlying module. Hopefully this should
make it easier for users to get good benchmarks of their code.

* formatting

* import order

* more test, more comments, more precision

* fix tests

* add seconds descriptions to doc
  • Loading branch information
Tristan Konolige authored Aug 26, 2021
1 parent 0648fff commit f1ca91d
Show file tree
Hide file tree
Showing 23 changed files with 283 additions and 92 deletions.
59 changes: 59 additions & 0 deletions python/tvm/contrib/graph_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,62 @@ def __getitem__(self, key):
The key to the module.
"""
return self.module[key]

def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=None, **kwargs):
"""Calculate runtime of a function by repeatedly calling it.
Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchonization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models.
The benchmarking loop looks approximately like so:
.. code-block:: python
for r in range(repeat):
time_start = now()
for n in range(number):
func_name()
time_end = now()
total_times.append((time_end - time_start)/number)
Parameters
----------
func_name : str
The function to benchmark
repeat : int
Number of times to run the outer loop of the timing code (see above). The output will
contain `repeat` number of datapoints.
number : int
Number of times to run the inner loop of the timing code. This inner loop is run in
between the timer starting and stopping. In order to amortize any timing overhead,
`number` should be increased when the runtime of the function is small (less than a 1/10
of a millisecond).
min_repeat_ms : Optional[float]
If set, the inner loop will be run until it takes longer than `min_repeat_ms`
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
kwargs : Dict[str, Object]
Named arguments to the function. These are cached before running timing code, so that
data transfer costs are not counted in the runtime.
Returns
-------
timing_results : BenchmarkResult
Runtimes of the function. Use `.mean` to access the mean runtime, use `.results` to
access the individual runtimes (in seconds).
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if kwargs:
self.set_input(**kwargs)
return self.module.time_evaluator(
func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
)()
27 changes: 7 additions & 20 deletions python/tvm/driver/tvmc/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,15 @@
import os
import tarfile
import json
from typing import Optional, Union, List, Dict, Callable, TextIO
from typing import Optional, Union, Dict, Callable, TextIO
import numpy as np

import tvm
import tvm.contrib.cc
from tvm import relay
from tvm.contrib import utils
from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule
from tvm.runtime.module import BenchmarkResult

try:
from tvm.micro import export_model_library_format
Expand Down Expand Up @@ -371,14 +372,14 @@ def import_package(self, package_path: str):
class TVMCResult(object):
"""A class that stores the results of tvmc.run and provides helper utilities."""

def __init__(self, outputs: Dict[str, np.ndarray], times: List[float]):
def __init__(self, outputs: Dict[str, np.ndarray], times: BenchmarkResult):
"""Create a convenience wrapper around the output of tvmc.run
Parameters
----------
outputs : dict
Outputs dictionary mapping the name of the output to its numpy value.
times : list of float
times : BenchmarkResult
The execution times measured by the time evaluator in seconds to produce outputs.
"""
self.outputs = outputs
Expand All @@ -390,29 +391,15 @@ def format_times(self):
This has the effect of producing a small table that looks like:
.. code-block::
Execution time summary:
mean (ms) max (ms) min (ms) std (ms)
0.14310 0.16161 0.12933 0.01004
mean (ms) median (ms) max (ms) min (ms) std (ms)
0.14310 0.14310 0.16161 0.12933 0.01004
Returns
-------
str
A formatted string containing the statistics.
"""

# timestamps
mean_ts = np.mean(self.times) * 1000
std_ts = np.std(self.times) * 1000
max_ts = np.max(self.times) * 1000
min_ts = np.min(self.times) * 1000

header = "Execution time summary:\n{0:^10} {1:^10} {2:^10} {3:^10}".format(
"mean (ms)", "max (ms)", "min (ms)", "std (ms)"
)
stats = "{0:^10.2f} {1:^10.2f} {2:^10.2f} {3:^10.2f}".format(
mean_ts, max_ts, min_ts, std_ts
)

return "%s\n%s\n" % (header, stats)
return str(self.times)

def get_output(self, name: str):
"""A helper function to grab one of the outputs by name.
Expand Down
8 changes: 2 additions & 6 deletions python/tvm/driver/tvmc/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,12 +421,8 @@ def run_module(
# This print is intentional
print(report)

# create the module time evaluator (returns a function)
timer = module.module.time_evaluator("run", dev, number=number, repeat=repeat)
# call the evaluator function to invoke the module and save execution times
prof_result = timer()
# collect a list of execution times from the profiling results
times = prof_result.results
# call the benchmarking function of the executor
times = module.benchmark(dev, number=number, repeat=repeat)

logger.debug("Collecting the output tensors.")
num_outputs = module.get_num_outputs()
Expand Down
75 changes: 68 additions & 7 deletions python/tvm/runtime/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
import os
import ctypes
import struct
from collections import namedtuple
from typing import Sequence
import numpy as np

import tvm._ffi
from tvm._ffi.base import _LIB, check_call, c_str, string_types, _RUNTIME_ONLY
Expand All @@ -30,8 +31,69 @@
from . import _ffi_api


# profile result of time evaluator
ProfileResult = namedtuple("ProfileResult", ["mean", "results"])
class BenchmarkResult:
"""Runtimes from benchmarking"""

def __init__(self, results: Sequence[float]):
"""Construct a new BenchmarkResult from a sequence of runtimes.
Parameters
----------
results : Sequence[float]
Raw times from benchmarking
Attributes
----------
min : float
Minimum runtime in seconds of all results.
mean : float
Mean runtime in seconds of all results. If py:meth:`Module.time_evaluator` or
`benchmark` is called with `number` > 0, then each result is already the mean of a
`number` of runtimes, so this becomes the mean of means.
median : float
Median runtime in seconds of all results. If py:meth:`Module.time_evaluator` is called
with `number` > 0, then each result is already the mean of a `number` of runtimes, so
this becomes the median of means.
max : float
Maximum runtime in seconds of all results. If py:meth:`Module.time_evaluator` is called
with `number` > 0, then each result is already the mean of a `number` of runtimes, so
this becomes the maximum of those means.
std : float
Standard deviation in seconds of runtimes. If py:meth:`Module.time_evaluator` is called
with `number` > 0, then each result is already the mean of a `number` of runtimes, so
this becomes the standard deviation of means.
results : Sequence[float]
The collected runtimes (in seconds). This may be a series of mean runtimes if
py:meth:`Module.time_evaluator` or `benchmark` was run with `number` > 1.
"""
self.results = results
self.mean = np.mean(self.results)
self.std = np.std(self.results)
self.median = np.median(self.results)
self.min = np.min(self.results)
self.max = np.max(self.results)

def __repr__(self):
return "BenchmarkResult(min={}, mean={}, median={}, max={}, std={}, results={})".format(
self.min, self.mean, self.median, self.max, self.std, self.results
)

def __str__(self):
return """Execution time summary:
{:^12} {:^12} {:^12} {:^12} {:^12}
{:^12.4f} {:^12.4f} {:^12.4f} {:^12.4f} {:^12.4f}
""".format(
"mean (ms)",
"median (ms)",
"max (ms)",
"min (ms)",
"std (ms)",
self.mean * 1000,
self.median * 1000,
self.max * 1000,
self.min * 1000,
self.std * 1000,
)


class Module(object):
Expand Down Expand Up @@ -209,7 +271,7 @@ def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f
Returns
-------
ftimer : function
The function that takes same argument as func and returns a ProfileResult.
The function that takes same argument as func and returns a BenchmarkResult.
The ProfileResult reports `repeat` time costs in seconds.
"""
try:
Expand All @@ -230,12 +292,11 @@ def evaluator(*args):
blob = feval(*args)
fmt = "@" + ("d" * repeat)
results = struct.unpack(fmt, blob)
mean = sum(results) / float(repeat)
return ProfileResult(mean=mean, results=results)
return BenchmarkResult(results)

return evaluator
except NameError:
raise NameError("time_evaluate is only supported when RPC is enabled")
raise NameError("time_evaluator is only supported when RPC is enabled")

def _collect_from_import_tree(self, filter_func):
"""Helper function to collect modules from the tree matching a filter_func, then return it.
Expand Down
64 changes: 64 additions & 0 deletions python/tvm/runtime/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,3 +507,67 @@ def get_input_index(self, input_name, func_name="main"):
The input index. -1 will be returned if the given input name is not found.
"""
return self._get_input_index(input_name, func_name)

def benchmark(
self, device, *args, func_name="main", repeat=5, number=5, min_repeat_ms=None, **kwargs
):
"""Calculate runtime of a function by repeatedly calling it.
Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchonization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models.
The benchmarking loop looks approximately like so:
.. code-block:: python
for r in range(repeat):
time_start = now()
for n in range(number):
func_name()
time_end = now()
total_times.append((time_end - time_start)/number)
Parameters
----------
func_name : str
The function to benchmark
repeat : int
Number of times to run the outer loop of the timing code (see above). The output will
contain `repeat` number of datapoints.
number : int
Number of times to run the inner loop of the timing code. This inner loop is run in
between the timer starting and stopping. In order to amortize any timing overhead,
`number` should be increased when the runtime of the function is small (less than a 1/10
of a millisecond).
min_repeat_ms : Optional[float]
If set, the inner loop will be run until it takes longer than `min_repeat_ms`
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
args : Sequence[Object]
Arguments to the function. These are cached before running timing code, so that data
transfer costs are not counted in the runtime.
kwargs : Dict[str, Object]
Named arguments to the function. These are cached like `args`.
Returns
-------
timing_results : BenchmarkResult
Runtimes of the function. Use `.mean` to access the mean runtime, use `.results` to
access the individual runtimes (in seconds).
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
return self.module.time_evaluator(
"invoke", device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
)(func_name)
5 changes: 3 additions & 2 deletions src/runtime/rpc/rpc_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,9 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
<< "Cannot find " << f_preproc_name << " in the global function";
f_preproc = *pf_preproc;
}
return WrapTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms,
f_preproc);
PackedFunc pf = m.GetFunction(name, false);
CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry";
return WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, f_preproc);
}
} else {
auto* pf = runtime::Registry::Get(name);
Expand Down
3 changes: 2 additions & 1 deletion tests/python/driver/tvmc/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from tvm.driver import tvmc
from tvm.driver.tvmc.model import TVMCModel, TVMCPackage, TVMCResult
from tvm.runtime.module import BenchmarkResult


def test_tvmc_workflow(keras_simple):
Expand All @@ -35,7 +36,7 @@ def test_tvmc_workflow(keras_simple):
assert type(result) is TVMCResult
assert path.exists(tuning_records)
assert type(result.outputs) is dict
assert type(result.times) is tuple
assert type(result.times) is BenchmarkResult
assert "output_0" in result.outputs.keys()


Expand Down
5 changes: 3 additions & 2 deletions tests/python/driver/tvmc/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from tvm.driver import tvmc
from tvm.driver.tvmc.model import TVMCResult
from tvm.driver.tvmc.result_utils import get_top_results
from tvm.runtime.module import BenchmarkResult


def test_generate_tensor_data_zeros():
Expand Down Expand Up @@ -52,7 +53,7 @@ def test_generate_tensor_data__type_unknown():


def test_format_times__contains_header():
fake_result = TVMCResult(outputs=None, times=[0.6, 1.2, 0.12, 0.42])
fake_result = TVMCResult(outputs=None, times=BenchmarkResult([0.6, 1.2, 0.12, 0.42]))
sut = fake_result.format_times()
assert "std (ms)" in sut

Expand Down Expand Up @@ -101,5 +102,5 @@ def test_run_tflite_module__with_profile__valid_input(
tiger_cat_mobilenet_id in top_5_ids
), "tiger cat is expected in the top-5 for mobilenet v1"
assert type(result.outputs) is dict
assert type(result.times) is tuple
assert type(result.times) is BenchmarkResult
assert "output_0" in result.outputs.keys()
Loading

0 comments on commit f1ca91d

Please sign in to comment.