Add --warm-start-latency to benchmark harness (#125353)
Summary:
This change introduces a new flag to perform a "warm start" test from the benchmark harness. The idea is to test a model twice: first with a fresh inductor cache (i.e., a "cold start"), and then a second time in a fresh process with the cache available (i.e., a "warm start"). We can later add this mode to CI runs to collect compile times for warm start.
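As an illustration (not part of this commit), here is a minimal sketch of the orchestration the new flag performs, based on the diff below: the harness re-invokes itself twice in separate processes with the FX graph cache enabled, so the second run can reuse the inductor artifacts produced by the first. The function name and the simplified argument/timeout handling are assumptions for the sketch; the flag name and the TORCHINDUCTOR_FX_GRAPH_CACHE variable come from the diff.

```python
import os
import subprocess
import sys


def run_cold_then_warm(argv, timeout=None):
    """Sketch of the --warm-start-latency flow: re-invoke the harness twice
    with the same arguments (minus --warm-start-latency). The second run is a
    fresh process, but it reuses the on-disk inductor/FX-graph caches that the
    first run populated, so its compile times reflect a warm start."""
    env = os.environ.copy()
    # Persist compiled FX graphs across processes.
    env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"

    # Strip the orchestration flag so the child runs behave normally.
    cmd = [sys.executable] + [a for a in argv if a != "--warm-start-latency"]

    # First run: cold start (caches are empty or freshly created).
    subprocess.check_call(cmd, timeout=timeout, env=env)
    # Second run: warm start (same command, caches now populated).
    subprocess.check_call(cmd, timeout=timeout, env=env)
```

Running the second pass in a separate process, rather than compiling the model twice in-process, ensures that only the on-disk caches (not in-memory state) contribute to the warm-start numbers.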

X-link: pytorch/pytorch#125353
Approved by: https://github.com/eellison, https://github.com/desertfire

Reviewed By: izaitsevfb

Differential Revision: D57216102

Pulled By: masnesral

fbshipit-source-id: 2cb5751260e05844ad3324572ada4caca5109f23
masnesral authored and facebook-github-bot committed May 11, 2024
1 parent 73dd57e commit f188e24
Showing 1 changed file with 56 additions and 52 deletions.
userbenchmark/dynamo/dynamobench/common.py
@@ -2005,33 +2005,6 @@ def get_dynamo_stats():
)


def maybe_fresh_cache(fn, is_cold_start):
def inner(*args, **kwargs):
cache_minder = contextlib.nullcontext()
if is_cold_start:
cache_entries = {}
cache_minder = fresh_inductor_cache(cache_entries)

try:
with cache_minder:
return fn(*args, **kwargs)
finally:
dump_cache = False
if dump_cache and is_cold_start:
output_csv(
output_filename[:-4] + "_triton_cache.csv",
["dev", "name", "batch_size", "triton_cache"],
[
current_device,
current_name,
current_batch_size,
cache_entries,
],
)

return inner


@contextmanager
def maybe_init_distributed(should_init_distributed, rank, world_size, port="6789"):
try:
@@ -3297,12 +3270,6 @@ def get_example_inputs(self):
action="store_true",
help="print dataframe result used for calculating accuracy",
)
parser.add_argument(
"--cold-start-latency",
"--cold_start_latency",
action="store_true",
help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
)
parser.add_argument(
"--disable-cudagraphs",
action="store_true",
@@ -3415,6 +3382,19 @@ def get_example_inputs(self):
help="Enables Memory Snapshot tool for memory deep dives: https://pytorch.org/blog/understanding-gpu-memory-1/",
)

group_latency = parser.add_mutually_exclusive_group()
group_latency.add_argument(
"--cold-start-latency",
"--cold_start_latency",
action="store_true",
help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
)
group_latency.add_argument(
"--warm-start-latency",
action="store_true",
help="Run model(s) twice and preseve caches in between to enable a 'warm start' on the 2nd run",
)

group_fuser = parser.add_mutually_exclusive_group()
# --nvfuser is now the default, keep the option to not break scripts
group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
@@ -3571,9 +3551,17 @@ def process_entry(rank, runner, original_dir, args):
world_size=args.world_size,
port=args.distributed_master_port,
):
return maybe_fresh_cache(
run, (args.cold_start_latency and args.only) or args.ci
)(runner, args, original_dir)
return run(runner, args, original_dir)


def maybe_fresh_cache(args):
cache_dir_assigned = "TORCHINDUCTOR_CACHE_DIR" in os.environ
if not cache_dir_assigned and (
args.cold_start_latency or args.warm_start_latency or args.ci
):
return fresh_inductor_cache()
else:
return contextlib.nullcontext()


def main(runner, original_dir=None, args=None):
@@ -3598,23 +3586,39 @@ def main(runner, original_dir=None, args=None):
f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
)

args.init_distributed = args.only and args.multiprocess
if args.init_distributed:
# NB: Do NOT query device count before CUDA initialization; we're
# going to overwrite CUDA_VISIBLE_DEVICES and this will result in
# https://github.com/pytorch/pytorch/issues/107300
device_count = torch.cuda.device_count()
if device_count <= 1:
log.warning(
"The use multiprocess flag is set but there are <= 1 devices available."
with maybe_fresh_cache(args):
args.init_distributed = args.only and args.multiprocess
if args.init_distributed:
# NB: Do NOT query device count before CUDA initialization; we're
# going to overwrite CUDA_VISIBLE_DEVICES and this will result in
# https://github.com/pytorch/pytorch/issues/107300
device_count = torch.cuda.device_count()
if device_count <= 1:
log.warning(
"The use multiprocess flag is set but there are <= 1 devices available."
)
# multiprocess path
args.world_size = device_count
mp.spawn(
process_entry, args=(runner, original_dir, args), nprocs=device_count
)
# multiprocess path
args.world_size = device_count
mp.spawn(process_entry, args=(runner, original_dir, args), nprocs=device_count)
else:
# single process path just uses the main process
args.world_size = 1
process_entry(0, runner, original_dir, args)
elif args.only and args.warm_start_latency:
# Warm start mode. Enable FX graph caching and perform back-to-back runs in
# separate processes (but ensure the inductor cache is preserved across runs).
env = os.environ.copy()
env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
cmd = [sys.executable] + sys.argv
cmd.remove("--warm-start-latency")

print(f"Executing cold-start run for {args.only}")
subprocess.check_call(cmd, timeout=args.timeout, env=env)

print(f"Executing warm-start run for {args.only}")
subprocess.check_call(cmd, timeout=args.timeout, env=env)
else:
# single process path just uses the main process
args.world_size = 1
process_entry(0, runner, original_dir, args)


def write_csv_when_exception(args, name: str, status: str, device=None):