minor refactor over EP (pytorch#1854)

tianyu-l · githubsgi · commit 0ebc9cf6bd3f · 2025-10-28T18:13:15.000-07:00
This PR:

- let `ExpertParallel` handle indices permute / unpermute when EP is
used
- move `to_local` to model code to be more explicit
- rename the `expert_parallel` wrapper, which does permute / unpermute, to
`indices_permutation_wrapper` to be more accurate
diff --git a/torchtitan/distributed/expert_parallel.py b/torchtitan/distributed/expert_parallel.py
@@ -223,6 +223,7 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
 class ReordererSequenceParallel(ParallelStyle):
     def __init__(self):
         super().__init__()
+        self.top_k = None
 
     def _prepare_inputput_fn(self, mod, inputs, device_mesh):
         # shape (batch_size*seq_len, top_k)