Commit a6dd593

Pass to split prefetch fsdp graph
stack-info: PR: #201, branch: IvanKobzarev/stack/9
1 parent f1887eb commit a6dd593

File tree

2 files changed: +60 -0 lines changed

autoparallel/passes.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+import dataclasses
+
+import torch
+import torch.utils._pytree as pytree
+from torch._functorch._aot_autograd.descriptors import AOTOutput
+from torch._functorch.partitioners import _extract_graph_with_inputs_outputs
+
+
+@dataclasses.dataclass(frozen=True)
+class PrefetchOutput(AOTOutput):
+    pass
+
+
+def split_fsdp_prefetch(g: torch.fx.Graph) -> tuple[torch.fx.Graph, torch.fx.Graph]:
+    g_ins = g.find_nodes(op="placeholder")
+    prefetch_g_outs_map = {}
+
+    for g_in in g_ins:
+        n = g_in
+        while True:
+            if len(n.users) != 1:
+                break
+            user = next(iter(n.users))
+            if len(user.all_input_nodes) > 1:
+                break
+            n = user
+        prefetch_g_outs_map[g_in] = n
+
+    prefetch_g_outs = list(prefetch_g_outs_map.values())
+    prefetch_g_outs_descs: list[AOTOutput] = [
+        PrefetchOutput() for _ in range(len(prefetch_g_outs))
+    ]
+
+    prefetch_g = _extract_graph_with_inputs_outputs(
+        g,
+        g_ins,
+        prefetch_g_outs,
+        prefetch_g_outs_descs,
+    )
+
+    g_outs = pytree.arg_tree_leaves(*(n.args for n in g.find_nodes(op="output")))
+    g_outs_descs = pytree.arg_tree_leaves(
+        next(iter(g.find_nodes(op="output"))).meta.get("desc", [None] * len(g_outs))
+    )
+    main_g = _extract_graph_with_inputs_outputs(
+        g,
+        prefetch_g_outs,
+        g_outs,
+        g_outs_descs,
+    )
+    return prefetch_g, main_g
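
For context on what the new pass produces: starting from every graph input, it follows the chain of nodes that each have exactly one user and a single graph-node input (in the FSDP graph this is the all-gather/wait/cast chain that materializes a parameter) and cuts there, so prefetch_g computes those chains and main_g consumes their results as its inputs. The snippet below is a minimal, hypothetical sketch of the split on a toy symbolically traced module rather than on an AutoParallel graph; the Toy module and the comments about which nodes land where are illustrative assumptions, not part of the commit.

# Hypothetical sketch (not part of this commit): apply the pass to a toy
# traced graph instead of the AutoParallel FSDP graph.
import torch
from autoparallel.passes import split_fsdp_prefetch


class Toy(torch.nn.Module):
    def forward(self, w, x):
        # Single-user chain hanging off the `w` placeholder; in the real graph
        # this stands in for the all_gather/wait/cast chain of a parameter.
        w = w.to(torch.bfloat16)
        return x @ w


gm = torch.fx.symbolic_trace(Toy())
prefetch_g, main_g = split_fsdp_prefetch(gm.graph)

# Expected split for this toy graph:
#   prefetch_g: (w, x) -> (w.to(bfloat16), x)   # chain end per input
#   main_g:     (w_cast, x) -> x @ w_cast       # consumes the chain ends
print(prefetch_g)
print(main_g)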

examples/example_llama3.py

Lines changed: 9 additions & 0 deletions
@@ -213,6 +213,7 @@ def add_tp_constraints(autop):
 with AutoParallel(
     model, input_fn, mesh, mp_policy, compile=True, repeated_subgraphs=True
 ) as autop:
+
     autop.add_parameter_memory_constraint(low=None, high=None)
 
     x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1)
@@ -253,6 +254,14 @@ def _pass(graph):
     print(f"Took {time.time() - t:.2f} s")
     parallel_mod = autop.apply_placement(sharding_placement)
 
+    test_split_fsdp_prefetch = True
+    if test_split_fsdp_prefetch:
+        gm = autop.parallel_gm
+        g = gm.graph
+        from autoparallel.passes import split_fsdp_prefetch
+
+        prefetch_g, main_g = split_fsdp_prefetch(g)
+
     # run weight init on our sharded DTensor params
     parallel_mod.to_empty(device="cuda")
     parallel_mod.init_weights()
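
The example only builds the two graphs; the commit does not execute them. Because main_g's placeholders are exactly prefetch_g's outputs, in the same order, chaining the two reproduces the original graph's outputs. Below is a hedged sketch of that property using plain fx GraphModules; run_split is a hypothetical helper, not an AutoParallel API.

# Hypothetical helper (not in this commit): run prefetch_g, then feed its
# outputs into main_g, whose placeholders are exactly those values.
import torch


def run_split(
    root: torch.nn.Module,
    prefetch_g: torch.fx.Graph,
    main_g: torch.fx.Graph,
    *inputs,
):
    prefetch_mod = torch.fx.GraphModule(root, prefetch_g)
    main_mod = torch.fx.GraphModule(root, main_g)
    prefetched = prefetch_mod(*inputs)  # ends of the per-input chains
    return main_mod(*prefetched)        # original graph outputs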

0 commit comments