see README.md for more details
"""

+ from contextlib import nullcontext
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

@@ -54,6 +55,8 @@ class AllToAllSingleRunConfig(BenchFuncConfig):
    num_profiles: int = 2
    num_mul: int = 5
    num_concat: int = 100
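+     # forwarded via func_kwargs to the multi_stream_* benchmarks below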
+     multi_stream: bool = True
+     main_stream_allocation: bool = False


def _compute(
@@ -94,6 +97,7 @@ def a2a_sync_base(
    num_mul: int,
    num_concat: int,
    ctx: MultiProcessContext,
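+     # catch-all for config fields this benchmark does not use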
+     **_kwargs: Dict[str, Any],
) -> None:
    with record_function("## pre-comms compute ##"):
        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
@@ -186,6 +190,7 @@ def a2a_async_twice(
    num_mul: int,
    num_concat: int,
    ctx: MultiProcessContext,
+     **_kwargs: Dict[str, Any],
) -> None:
    with record_function("## pre-comms compute ##"):
        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
@@ -254,13 +259,14 @@ def a2a_async_twice(
        assert checks1 and checks2


- # all_to_all_single with sync and single stream
+ # LazyAwaitable
def lazyawaitable(
    _batch_inputs: List[Dict[str, Any]],
    dim: int,
    num_mul: int,
    num_concat: int,
    ctx: MultiProcessContext,
+     **_kwargs: Dict[str, Any],
) -> None:
    with record_function("## pre-comms compute ##"):
        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
@@ -294,6 +300,149 @@ def lazyawaitable(
        assert check_awaitable.item()


+ # multi-stream memory footprint
+ def multi_stream_memory(
+     _batch_inputs: List[Dict[str, Any]],
+     dim: int,
+     num_mul: int,
+     num_concat: int,
+     ctx: MultiProcessContext,
+     multi_stream: bool,
+     **_kwargs: Dict[str, Any],
+ ) -> None:
+     with record_function("## setup ##"):
+         main_stream = torch.cuda.current_stream()
+         data_copy_stream = torch.cuda.Stream() if multi_stream else nullcontext()
+         data_dist_stream = torch.cuda.Stream() if multi_stream else nullcontext()
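+         # nullcontext() keeps everything on the current stream when multi_stream=False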
+         irrelevant_data = torch.rand(dim, dim, device=ctx.device) - 0.5
+ 
+         # without pin_memory(), the non_blocking host-to-device copy degrades to a blocking transfer
+         host_data = (torch.rand(dim, dim) - 0.5).pin_memory()
+ 
+     with record_function("## irrelevant compute before h2d ##"):
+         pre_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=irrelevant_data
+         )
+ 
+     with record_function("## copy data to device ##"):
+         with data_copy_stream:
+             device_data = host_data.to(ctx.device, non_blocking=True)
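+             # .to() allocates device_data from data_copy_stream's caching-allocator pool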
+ 
+     with record_function("## irrelevant compute after h2d ##"):
+         pre_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=irrelevant_data
+         )
+ 
+     with record_function("## pre-comms compute ##"):
+         if isinstance(data_copy_stream, torch.cuda.Stream):
+             main_stream.wait_stream(data_copy_stream)
+             device_data.record_stream(main_stream)
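+             # record_stream prevents the allocator from reusing device_data before main_stream finishes with it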
+         pre_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=device_data
+         )
+ 
+     with data_dist_stream:
+         with record_function("## all_to_all_single ##"):
+             if isinstance(data_dist_stream, torch.cuda.Stream):
+                 data_dist_stream.wait_stream(main_stream)
+             post_comms = torch.zeros_like(pre_comms)
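+             # allocated while data_dist_stream is current; this side-stream pool is the extra footprint this benchmark exposes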
+             req = dist.all_to_all_single(
+                 output=post_comms,
+                 input=pre_comms,
+                 group=ctx.pg,
+                 async_op=True,
+             )
+         with record_function("## a2a comm validation ##"):
+             req.wait()
+             checks = DeviceToHostTensorAwaitable(_validate(post_comms, ctx))
+ 
+     with record_function("## irrelevant compute after a2a ##"):
+         pre_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=irrelevant_data
+         )
+ 
+     with record_function("## post-comms compute ##"):
+         req.wait()
+         post_comms.record_stream(main_stream)
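+         # mark post_comms as used on main_stream so its memory is not reclaimed too early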
+         post_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=post_comms[0]
+         )
+ 
+     with record_function("## assert ##"):
+         assert checks.item()
+ 
+ 
+ def multi_stream_optimized(
+     _batch_inputs: List[Dict[str, Any]],
+     dim: int,
+     num_mul: int,
+     num_concat: int,
+     ctx: MultiProcessContext,
+     **_kwargs: Dict[str, Any],
+ ) -> None:
+     with record_function("## setup ##"):
+         main_stream = torch.cuda.current_stream()
+         data_copy_stream = torch.cuda.Stream()
+         data_dist_stream = torch.cuda.Stream()
+         irrelevant_data = torch.rand(dim, dim, device=ctx.device) - 0.5
+ 
+         # without pin_memory(), the non_blocking copy below degrades to a blocking transfer
+         host_data = (torch.rand(dim, dim) - 0.5).pin_memory()
+         device_data = torch.empty_like(host_data, device=ctx.device)
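+         # pre-allocated on the main stream so the copy below does not allocate on a side stream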
+ 
+     with record_function("## irrelevant compute before h2d ##"):
+         pre_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=irrelevant_data
+         )
+ 
+     with record_function("## copy data to device ##"):
+         with data_copy_stream:
+             device_data.record_stream(data_copy_stream)
+             device_data.copy_(host_data, non_blocking=True)
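+             # in-place copy into the main-stream buffer; nothing is allocated on data_copy_stream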
+ 
+     with record_function("## irrelevant compute after h2d ##"):
+         pre_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=irrelevant_data
+         )
+ 
+     with record_function("## pre-comms compute ##"):
+         main_stream.wait_stream(data_copy_stream)
+         pre_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=device_data
+         )
+ 
+     with record_function("## pre-allocate memory for a2a on main stream ##"):
+         post_comms = torch.zeros_like(pre_comms)
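+         # unlike multi_stream_memory, the a2a output comes from the main stream's pool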
+ 
+     with data_dist_stream:
+         with record_function("## all_to_all_single ##"):
+             data_dist_stream.wait_stream(main_stream)
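+             # ensure the collective sees main_stream's writes to pre_comms and post_comms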
+             req = dist.all_to_all_single(
+                 output=post_comms,
+                 input=pre_comms,
+                 group=ctx.pg,
+                 async_op=True,
+             )
+ 
+     with record_function("## irrelevant compute after a2a ##"):
+         pre_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=irrelevant_data
+         )
+ 
+     with record_function("## a2a comm validation ##"):
+         req.wait()
+         checks = DeviceToHostTensorAwaitable(_validate(post_comms, ctx))
+ 
+     with record_function("## post-comms compute ##"):
+         post_comms = _compute(
+             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=post_comms[0]
+         )
+ 
+     with record_function("## assert ##"):
+         assert checks.item()
+ 
+ 
# single-rank runner
def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig) -> None:
    # Ensure GPUs are available and we have enough of them
@@ -317,6 +466,10 @@ def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig)
            func = a2a_async_twice
        elif arg.name.startswith("lazyawaitable"):
            func = lazyawaitable
+         elif arg.name.startswith("multi_stream_memory"):
+             func = multi_stream_memory
+         elif arg.name.startswith("multi_stream_optimized"):
+             func = multi_stream_optimized
        else:
            raise ValueError(f"Unknown benchmark name: {arg.name}")

@@ -328,6 +481,8 @@ def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig)
                "dim": arg.dim,
                "num_mul": arg.num_mul,
                "num_concat": arg.num_concat,
+                 "multi_stream": arg.multi_stream,
+                 "main_stream_allocation": arg.main_stream_allocation,
            },
            func_to_benchmark=func,
            rank=rank,