@@ -28,8 +28,6 @@
 from megatron.core.extensions import transformer_engine as megatron_te
 from megatron.core.parallel_state import get_data_parallel_group
 from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region
-from megatron.core.parallel_state import get_data_parallel_group
-from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region
 from megatron.core.transformer import MegatronModule
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
 from megatron.core.utils import get_tensor_model_parallel_group_if_none
@@ -63,23 +61,24 @@ def get_sequential_mlp_expert_names(name: str, module: torch.nn.Module):
             expert_name, local_expert_name = name.split(".local_experts.")
             # extract quantizer name by removing local_expert number from the name
             local_expert_name = ".".join(local_expert_name.split(".")[1:])
-            return expert_name, local_expert_name
-        return None, None
+            return f"{expert_name}.{local_expert_name}"
+        return None
 
     # gather amax values from SequentialMLP experts
     for name, module in model.named_modules():
-        expert_name, local_expert_name = get_sequential_mlp_expert_names(name, module)
-        if expert_name and local_expert_name:
-            amax_dict[local_expert_name] = amax_dict.get(local_expert_name, {})
-            amax_dict[local_expert_name][expert_name] = max(
-                amax_dict[local_expert_name].get(expert_name, 0), module.amax
+        expert_name = get_sequential_mlp_expert_names(name, module)
+        if expert_name and module.amax is not None:
+            stored_amax = amax_dict.get(expert_name)
+            amax_tensor = module.amax.detach().clone()
+            amax_dict[expert_name] = (
+                amax_tensor if stored_amax is None else torch.maximum(stored_amax, amax_tensor)
             )
 
     # sync amax values across experts in SequentialMLP
     for name, module in model.named_modules():
-        expert_name, local_expert_name = get_sequential_mlp_expert_names(name, module)
-        if expert_name and local_expert_name:
-            module.amax = amax_dict[local_expert_name][expert_name]
+        expert_name = get_sequential_mlp_expert_names(name, module)
+        if expert_name and module.amax is not None:
+            module.amax = amax_dict[expert_name].detach().clone().to(module.amax.device)
 
 
 CUSTOM_POST_CALIBRATION_PLUGINS.add(sync_amax_across_sequential_mlp)
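
For context, a minimal standalone sketch of the pattern the new code uses: normalize each local expert's module name to a shared key, reduce all experts' amax values with torch.maximum, then read the shared maximum back. This is not the plugin itself; expert_key, named_amax, and the module names below are invented for illustration, whereas the real code walks model.named_modules() and mutates quantizer modules' amax in place.

import torch

def expert_key(name: str) -> str | None:
    # Drop the local-expert index: "<prefix>.local_experts.<i>.<rest>" -> "<prefix>.<rest>"
    if ".local_experts." not in name:
        return None
    prefix, suffix = name.split(".local_experts.")
    return f"{prefix}.{'.'.join(suffix.split('.')[1:])}"

# Hypothetical per-expert amax values keyed by quantizer module name.
named_amax = {
    "mlp.experts.local_experts.0.linear_fc1.weight_quantizer": torch.tensor(0.5),
    "mlp.experts.local_experts.1.linear_fc1.weight_quantizer": torch.tensor(0.9),
}

# Gather: reduce all local experts' amax into one tensor per shared key.
amax_dict: dict[str, torch.Tensor] = {}
for name, amax in named_amax.items():
    key = expert_key(name)
    if key is not None:
        stored = amax_dict.get(key)
        amax_dict[key] = amax if stored is None else torch.maximum(stored, amax)

# Sync: every expert now sees the shared maximum.
print(amax_dict[expert_key("mlp.experts.local_experts.0.linear_fc1.weight_quantizer")])
# tensor(0.9000)

One upside of the torch.maximum reduction over the previous max(..., 0): it is an elementwise tensor op, so it also works when amax is a per-channel (multi-element) tensor, where Python's built-in max would raise an ambiguous-truth-value error.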