Add --warm-start-latency to benchmark harness (#125353)
Summary:
This change introduces a new flag to perform a "warm start" test from the benchmark harness. The idea is to test a model twice: first with a fresh inductor cache (i.e., a "cold start"), and then a second time in a fresh process with the cache available (i.e., a "warm start"). We can later add this mode to CI runs to collect compile times for warm start.
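As an illustration (not part of this commit), here is a minimal sketch of the orchestration the new flag performs, based on the diff below: the harness re-invokes itself twice in separate processes with the FX graph cache enabled, so the second run can reuse the inductor artifacts produced by the first. The function name and the simplified argument/timeout handling are assumptions for the sketch; the flag name and the TORCHINDUCTOR_FX_GRAPH_CACHE variable come from the diff.

```python
import os
import subprocess
import sys


def run_cold_then_warm(argv, timeout=None):
    """Sketch of the --warm-start-latency flow: re-invoke the harness twice
    with the same arguments (minus --warm-start-latency). The second run is a
    fresh process, but it reuses the on-disk inductor/FX-graph caches that the
    first run populated, so its compile times reflect a warm start."""
    env = os.environ.copy()
    # Persist compiled FX graphs across processes.
    env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"

    # Strip the orchestration flag so the child runs behave normally.
    cmd = [sys.executable] + [a for a in argv if a != "--warm-start-latency"]

    # First run: cold start (caches are empty or freshly created).
    subprocess.check_call(cmd, timeout=timeout, env=env)
    # Second run: warm start (same command, caches now populated).
    subprocess.check_call(cmd, timeout=timeout, env=env)
```

Running the second pass in a separate process, rather than compiling the model twice in-process, ensures that only the on-disk caches (not in-memory state) contribute to the warm-start numbers.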

X-link: pytorch/pytorch#125353
Approved by: https://github.com/eellison, https://github.com/desertfire

Reviewed By: izaitsevfb

Differential Revision: D57216102

Pulled By: masnesral

fbshipit-source-id: 2cb5751260e05844ad3324572ada4caca5109f23
masnesral authored and facebook-github-bot committed May 11, 2024
1 parent 73dd57e commit f188e24
Showing 1 changed file with 56 additions and 52 deletions.
userbenchmark/dynamo/dynamobench/common.py
@@ -2005,33 +2005,6 @@ def get_dynamo_stats():
)


def maybe_fresh_cache(fn, is_cold_start):
def inner(*args, **kwargs):
cache_minder = contextlib.nullcontext()
if is_cold_start:
cache_entries = {}
cache_minder = fresh_inductor_cache(cache_entries)

try:
with cache_minder:
return fn(*args, **kwargs)
finally:
dump_cache = False
if dump_cache and is_cold_start:
output_csv(
output_filename[:-4] + "_triton_cache.csv",
["dev", "name", "batch_size", "triton_cache"],
[
current_device,
current_name,
current_batch_size,
cache_entries,
],
)

return inner


@contextmanager
def maybe_init_distributed(should_init_distributed, rank, world_size, port="6789"):
try:
@@ -3297,12 +3270,6 @@ def get_example_inputs(self):
action="store_true",
help="print dataframe result used for calculating accuracy",
)
parser.add_argument(
"--cold-start-latency",
"--cold_start_latency",
action="store_true",
help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
)
parser.add_argument(
"--disable-cudagraphs",
action="store_true",
@@ -3415,6 +3382,19 @@ def get_example_inputs(self):
help="Enables Memory Snapshot tool for memory deep dives: https://pytorch.org/blog/understanding-gpu-memory-1/",
)

group_latency = parser.add_mutually_exclusive_group()
group_latency.add_argument(
"--cold-start-latency",
"--cold_start_latency",
action="store_true",
help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
)
group_latency.add_argument(
"--warm-start-latency",
action="store_true",
help="Run model(s) twice and preseve caches in between to enable a 'warm start' on the 2nd run",
)

group_fuser = parser.add_mutually_exclusive_group()
# --nvfuser is now the default, keep the option to not break scripts
group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
@@ -3571,9 +3551,17 @@ def process_entry(rank, runner, original_dir, args):
world_size=args.world_size,
port=args.distributed_master_port,
):
return maybe_fresh_cache(
run, (args.cold_start_latency and args.only) or args.ci
)(runner, args, original_dir)
return run(runner, args, original_dir)


def maybe_fresh_cache(args):
cache_dir_assigned = "TORCHINDUCTOR_CACHE_DIR" in os.environ
if not cache_dir_assigned and (
args.cold_start_latency or args.warm_start_latency or args.ci
):
return fresh_inductor_cache()
else:
return contextlib.nullcontext()


def main(runner, original_dir=None, args=None):
@@ -3598,23 +3586,39 @@ def main(runner, original_dir=None, args=None):
f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
)

args.init_distributed = args.only and args.multiprocess
if args.init_distributed:
# NB: Do NOT query device count before CUDA initialization; we're
# going to overwrite CUDA_VISIBLE_DEVICES and this will result in
# https://github.com/pytorch/pytorch/issues/107300
device_count = torch.cuda.device_count()
if device_count <= 1:
log.warning(
"The use multiprocess flag is set but there are <= 1 devices available."
with maybe_fresh_cache(args):
args.init_distributed = args.only and args.multiprocess
if args.init_distributed:
# NB: Do NOT query device count before CUDA initialization; we're
# going to overwrite CUDA_VISIBLE_DEVICES and this will result in
# https://github.com/pytorch/pytorch/issues/107300
device_count = torch.cuda.device_count()
if device_count <= 1:
log.warning(
"The use multiprocess flag is set but there are <= 1 devices available."
)
# multiprocess path
args.world_size = device_count
mp.spawn(
process_entry, args=(runner, original_dir, args), nprocs=device_count
)
# multiprocess path
args.world_size = device_count
mp.spawn(process_entry, args=(runner, original_dir, args), nprocs=device_count)
else:
# single process path just uses the main process
args.world_size = 1
process_entry(0, runner, original_dir, args)
elif args.only and args.warm_start_latency:
# Warm start mode. Enable FX graph caching and perform back-to-back runs in
# separate processes (but ensure the inductor cache is preserved across runs).
env = os.environ.copy()
env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
cmd = [sys.executable] + sys.argv
cmd.remove("--warm-start-latency")

print(f"Executing cold-start run for {args.only}")
subprocess.check_call(cmd, timeout=args.timeout, env=env)

print(f"Executing warm-start run for {args.only}")
subprocess.check_call(cmd, timeout=args.timeout, env=env)
else:
# single process path just uses the main process
args.world_size = 1
process_entry(0, runner, original_dir, args)


def write_csv_when_exception(args, name: str, status: str, device=None):