Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GRAPH EXECUTOR,VM] Add benchmarking function to graph executor and vm #8807

Merged
merged 6 commits into from
Aug 26, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions python/tvm/contrib/graph_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,62 @@ def __getitem__(self, key):
The key to the module.
"""
return self.module[key]

def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=None, **kwargs):
    """Calculate runtime of a function by repeatedly calling it.

    Use this function to get an accurate measurement of the runtime of a function. The function
    is run multiple times in order to account for variability in measurements, processor speed
    or other external factors. Mean, median, standard deviation, min and max runtime are all
    reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
    synchronization and data transfer operations are not counted towards the runtime. This
    allows for fair comparison of runtimes across different functions and models.

    The benchmarking loop looks approximately like so:

    .. code-block:: python

        for r in range(repeat):
            time_start = now()
            for n in range(number):
                func_name()
            time_end = now()
            total_times.append((time_end - time_start)/number)


    Parameters
    ----------
    device : Device
        The device on which the benchmark (and its on-device timers) is run.

    func_name : str
        The function to benchmark.

    repeat : int
        Number of times to run the outer loop of the timing code (see above). The output will
        contain `repeat` number of datapoints.

    number : int
        Number of times to run the inner loop of the timing code. This inner loop is run in
        between the timer starting and stopping. In order to amortize any timing overhead,
        `number` should be increased when the runtime of the function is small (less than a
        1/10 of a millisecond).

    min_repeat_ms : Optional[float]
        If set, the inner loop will be run until it takes longer than `min_repeat_ms`
        milliseconds. This can be used to ensure that the function is run enough to get an
        accurate measurement.

    kwargs : Dict[str, Object]
        Named arguments to the function. These are cached before running timing code, so that
        data transfer costs are not counted in the runtime.

    Returns
    -------
    timing_results : BenchmarkResult
        Runtimes of the function. Use `.mean` to access the mean runtime, use `.results` to
        access the individual runtimes.
    """
    # time_evaluator treats min_repeat_ms=0 as "no minimum duration" — map None onto that.
    min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
    # Set inputs before the timed region so data-transfer cost is excluded from the runtime.
    if kwargs:
        self.set_input(**kwargs)
    return self.module.time_evaluator(
        func_name, device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
    )()
27 changes: 7 additions & 20 deletions python/tvm/driver/tvmc/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,15 @@
import os
import tarfile
import json
from typing import Optional, Union, List, Dict, Callable, TextIO
from typing import Optional, Union, Dict, Callable, TextIO
import numpy as np

import tvm
import tvm.contrib.cc
from tvm import relay
from tvm.contrib import utils
from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule
from tvm.runtime.module import BenchmarkResult

try:
from tvm.micro import export_model_library_format
Expand Down Expand Up @@ -371,14 +372,14 @@ def import_package(self, package_path: str):
class TVMCResult(object):
"""A class that stores the results of tvmc.run and provides helper utilities."""

def __init__(self, outputs: Dict[str, np.ndarray], times: List[float]):
def __init__(self, outputs: Dict[str, np.ndarray], times: BenchmarkResult):
"""Create a convenience wrapper around the output of tvmc.run

Parameters
----------
outputs : dict
Outputs dictionary mapping the name of the output to its numpy value.
times : list of float
times : BenchmarkResult
The execution times measured by the time evaluator in seconds to produce outputs.
"""
self.outputs = outputs
Expand All @@ -390,29 +391,15 @@ def format_times(self):
This has the effect of producing a small table that looks like:
.. code-block::
Execution time summary:
mean (ms) max (ms) min (ms) std (ms)
0.14310 0.16161 0.12933 0.01004
mean (ms) median (ms) max (ms) min (ms) std (ms)
0.14310 0.14310 0.16161 0.12933 0.01004

Returns
-------
str
A formatted string containing the statistics.
"""

# timestamps
mean_ts = np.mean(self.times) * 1000
std_ts = np.std(self.times) * 1000
max_ts = np.max(self.times) * 1000
min_ts = np.min(self.times) * 1000

header = "Execution time summary:\n{0:^10} {1:^10} {2:^10} {3:^10}".format(
"mean (ms)", "max (ms)", "min (ms)", "std (ms)"
)
stats = "{0:^10.2f} {1:^10.2f} {2:^10.2f} {3:^10.2f}".format(
mean_ts, max_ts, min_ts, std_ts
)

return "%s\n%s\n" % (header, stats)
return str(self.times)

def get_output(self, name: str):
"""A helper function to grab one of the outputs by name.
Expand Down
8 changes: 2 additions & 6 deletions python/tvm/driver/tvmc/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,12 +421,8 @@ def run_module(
# This print is intentional
print(report)

# create the module time evaluator (returns a function)
timer = module.module.time_evaluator("run", dev, number=number, repeat=repeat)
# call the evaluator function to invoke the module and save execution times
prof_result = timer()
# collect a list of execution times from the profiling results
times = prof_result.results
# call the benchmarking function of the executor
times = module.benchmark(dev, number=number, repeat=repeat)

logger.debug("Collecting the output tensors.")
num_outputs = module.get_num_outputs()
Expand Down
75 changes: 68 additions & 7 deletions python/tvm/runtime/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
import os
import ctypes
import struct
from collections import namedtuple
from typing import Sequence
import numpy as np

import tvm._ffi
from tvm._ffi.base import _LIB, check_call, c_str, string_types, _RUNTIME_ONLY
Expand All @@ -30,8 +31,69 @@
from . import _ffi_api


# profile result of time evaluator
ProfileResult = namedtuple("ProfileResult", ["mean", "results"])
class BenchmarkResult:
    """Runtimes from benchmarking."""

    def __init__(self, results: Sequence[float]):
        """Construct a new BenchmarkResult from a sequence of runtimes.

        Parameters
        ----------
        results : Sequence[float]
            Raw times from benchmarking

        Attributes
        ----------
        min : float
            Minimum runtime in seconds of all results.
        mean : float
            Mean runtime in seconds of all results. If py:meth:`Module.time_evaluator` or
            `benchmark` is called with `number` > 0, then each result is already the mean of a
            `number` of runtimes, so this becomes the mean of means.
        median : float
            Median runtime in seconds of all results. If py:meth:`Module.time_evaluator` is
            called with `number` > 0, then each result is already the mean of a `number` of
            runtimes, so this becomes the median of means.
        max : float
            Maximum runtime in seconds of all results. If py:meth:`Module.time_evaluator` is
            called with `number` > 0, then each result is already the mean of a `number` of
            runtimes, so this becomes the maximum of those means.
        std : float
            Standard deviation in seconds of runtimes. If py:meth:`Module.time_evaluator` is
            called with `number` > 0, then each result is already the mean of a `number` of
            runtimes, so this becomes the standard deviation of means.
        results : Sequence[float]
            The collected runtimes (in seconds). This may be a series of mean runtimes if
            py:meth:`Module.time_evaluator` or `benchmark` was run with `number` > 1.
        """
        self.results = results
        # Statistics are precomputed once so attribute access is cheap and repeatable.
        self.mean = np.mean(self.results)
        self.std = np.std(self.results)
        self.median = np.median(self.results)
        self.min = np.min(self.results)
        self.max = np.max(self.results)

    def __repr__(self):
        return "BenchmarkResult(min={}, mean={}, median={}, max={}, std={}, results={})".format(
            self.min, self.mean, self.median, self.max, self.std, self.results
        )

    def __str__(self):
        # Human-readable summary table; values are converted from seconds to milliseconds.
        return """Execution time summary:
{:^12} {:^12} {:^12} {:^12} {:^12}
{:^12.4f} {:^12.4f} {:^12.4f} {:^12.4f} {:^12.4f}
""".format(
            "mean (ms)",
            "median (ms)",
            "max (ms)",
            "min (ms)",
            "std (ms)",
            self.mean * 1000,
            self.median * 1000,
            self.max * 1000,
            self.min * 1000,
            self.std * 1000,
        )


class Module(object):
Expand Down Expand Up @@ -209,7 +271,7 @@ def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f
Returns
-------
ftimer : function
The function that takes same argument as func and returns a ProfileResult.
The function that takes same argument as func and returns a BenchmarkResult.
The ProfileResult reports `repeat` time costs in seconds.
"""
try:
Expand All @@ -230,12 +292,11 @@ def evaluator(*args):
blob = feval(*args)
fmt = "@" + ("d" * repeat)
results = struct.unpack(fmt, blob)
mean = sum(results) / float(repeat)
return ProfileResult(mean=mean, results=results)
return BenchmarkResult(results)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does time_evaluator itself have a unit test that needs to be updated? If not, would it make sense to add a quick one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

time_evaluator was not changed, so it makes no sense to create/modify a unit test for it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Um, unless I'm reading this diff incorrectly (sorry if I'm getting confused by github diffs!) this is a change to time_evaluator. Am I confused?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yes, this is the python binding for time_evaluator. Output is essentially the same, so there is no need to modify the tests.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, OK, but the existing tests for time_evaluator seem kind of weak. Here we're adding a new class (BenchmarkResult) which calculates a number of new statistics, but there's no tests for it ... meaning ... a regression in this behavior (either a change to time_evaluator or BenchmarkResult) would not be caught by CI.

At minimum I'd recommend a unit test for BenchmarkResult() itself, which is easy to write. Not trying to be too nitpicky but I do feel like we should be constantly improving our test coverage and adding a new class without corresponding tests tends to be a red flag for me :-)

WDYT?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To clarify BenchmarkResult is just a renaming + adding a constructor to ProfileResult (which already existed). It is tested in an ad hoc manner in a variety of places. I've added some tests for it constructor.


return evaluator
except NameError:
raise NameError("time_evaluate is only supported when RPC is enabled")
raise NameError("time_evaluator is only supported when RPC is enabled")

def _collect_from_import_tree(self, filter_func):
"""Helper function to collect modules from the tree matching a filter_func, then return it.
Expand Down
64 changes: 64 additions & 0 deletions python/tvm/runtime/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,3 +507,67 @@ def get_input_index(self, input_name, func_name="main"):
The input index. -1 will be returned if the given input name is not found.
"""
return self._get_input_index(input_name, func_name)

def benchmark(
    self, device, *args, func_name="main", repeat=5, number=5, min_repeat_ms=None, **kwargs
):
    """Calculate runtime of a function by repeatedly calling it.

    Use this function to get an accurate measurement of the runtime of a function. The function
    is run multiple times in order to account for variability in measurements, processor speed
    or other external factors. Mean, median, standard deviation, min and max runtime are all
    reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
    synchronization and data transfer operations are not counted towards the runtime. This
    allows for fair comparison of runtimes across different functions and models.

    The benchmarking loop looks approximately like so:

    .. code-block:: python

        for r in range(repeat):
            time_start = now()
            for n in range(number):
                func_name()
            time_end = now()
            total_times.append((time_end - time_start)/number)


    Parameters
    ----------
    device : Device
        The device on which the benchmark (and its on-device timers) is run.

    func_name : str
        The function to benchmark.

    repeat : int
        Number of times to run the outer loop of the timing code (see above). The output will
        contain `repeat` number of datapoints.

    number : int
        Number of times to run the inner loop of the timing code. This inner loop is run in
        between the timer starting and stopping. In order to amortize any timing overhead,
        `number` should be increased when the runtime of the function is small (less than a
        1/10 of a millisecond).

    min_repeat_ms : Optional[float]
        If set, the inner loop will be run until it takes longer than `min_repeat_ms`
        milliseconds. This can be used to ensure that the function is run enough to get an
        accurate measurement.

    args : Sequence[Object]
        Arguments to the function. These are cached before running timing code, so that data
        transfer costs are not counted in the runtime.

    kwargs : Dict[str, Object]
        Named arguments to the function. These are cached like `args`.

    Returns
    -------
    timing_results : BenchmarkResult
        Runtimes of the function. Use `.mean` to access the mean runtime, use `.results` to
        access the individual runtimes.
    """
    # time_evaluator treats min_repeat_ms=0 as "no minimum duration" — map None onto that.
    min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
    # Cache arguments before the timed region so data-transfer cost is excluded.
    if args or kwargs:
        self.set_input(func_name, *args, **kwargs)
    # The VM is timed through its generic "invoke" entry point, passing func_name through.
    return self.module.time_evaluator(
        "invoke", device, repeat=repeat, number=number, min_repeat_ms=min_repeat_ms
    )(func_name)
5 changes: 3 additions & 2 deletions src/runtime/rpc/rpc_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,9 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
<< "Cannot find " << f_preproc_name << " in the global function";
f_preproc = *pf_preproc;
}
return WrapTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms,
f_preproc);
PackedFunc pf = m.GetFunction(name, false);
CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry";
return WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, f_preproc);
}
} else {
auto* pf = runtime::Registry::Get(name);
Expand Down
3 changes: 2 additions & 1 deletion tests/python/driver/tvmc/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from tvm.driver import tvmc
from tvm.driver.tvmc.model import TVMCModel, TVMCPackage, TVMCResult
from tvm.runtime.module import BenchmarkResult


def test_tvmc_workflow(keras_simple):
Expand All @@ -35,7 +36,7 @@ def test_tvmc_workflow(keras_simple):
assert type(result) is TVMCResult
assert path.exists(tuning_records)
assert type(result.outputs) is dict
assert type(result.times) is tuple
assert type(result.times) is BenchmarkResult
assert "output_0" in result.outputs.keys()


Expand Down
5 changes: 3 additions & 2 deletions tests/python/driver/tvmc/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from tvm.driver import tvmc
from tvm.driver.tvmc.model import TVMCResult
from tvm.driver.tvmc.result_utils import get_top_results
from tvm.runtime.module import BenchmarkResult


def test_generate_tensor_data_zeros():
Expand Down Expand Up @@ -52,7 +53,7 @@ def test_generate_tensor_data__type_unknown():


def test_format_times__contains_header():
    # format_times() should render a summary table whose header names each statistic.
    timings = BenchmarkResult([0.6, 1.2, 0.12, 0.42])
    result = TVMCResult(outputs=None, times=timings)
    assert "std (ms)" in result.format_times()

Expand Down Expand Up @@ -101,5 +102,5 @@ def test_run_tflite_module__with_profile__valid_input(
tiger_cat_mobilenet_id in top_5_ids
), "tiger cat is expected in the top-5 for mobilenet v1"
assert type(result.outputs) is dict
assert type(result.times) is tuple
assert type(result.times) is BenchmarkResult
assert "output_0" in result.outputs.keys()
Loading