diff --git a/docs/en/get_started/usage.md b/docs/en/get_started/usage.md
index 507e387bb..276d6ceff 100644
--- a/docs/en/get_started/usage.md
+++ b/docs/en/get_started/usage.md
@@ -315,7 +315,7 @@ In some customized Megatron implementations, special operations need to be perfo
 
 slime also support FSDP2 as the training backend, docs [here](https://lmsys.org/blog/2025-12-03-miles-fsdp/).
 
-> FSDP automatically reads all architecture information via `AutoModelForCausalLM.from_pretrained()`, without manual specification. Megatron requires manual configuration of parameters to read model architecture information, or automatic inference via `--use-hf-config-for-megatron`. FSDP can read entirely from `config.json`, directly avoiding the weight format conversion step.
+> FSDP automatically reads all architecture information via `AutoModelForCausalLM.from_pretrained()`, without manual specification. Megatron requires manual configuration of parameters to read model architecture information. FSDP can read everything it needs from `config.json`, avoiding the weight format conversion step entirely.
 
 To run FSDP as the training backend, pass `--train-backend fsdp` to enable.
 
@@ -325,7 +325,7 @@ Parameters that FSDP used are shown as below in comparison to Megatron, more sup
 
 | Configuration Category | Megatron Parameter | FSDP Parameter | Description |
 | --- | --- | --- | --- |
-| **Model Loading** | `--load` (Megatron checkpoint) + architecture args (`--num-layers`, `--hidden-size` etc.) or `--use-hf-config-for-megatron` | `--hf-checkpoint` (Required) | **FSDP**: Directly uses HuggingFace format, no weight conversion needed, architecture inferred via `AutoConfig` |
+| **Model Loading** | `--load` (Megatron checkpoint) + architecture args (`--num-layers`, `--hidden-size` etc.) | `--hf-checkpoint` (Required) | **FSDP**: Directly uses HuggingFace format, no weight conversion needed, architecture inferred via `AutoConfig` |
 | **Tensor Parallel** | `--tensor-model-parallel-size` | Coming Soon | |
 | **Pipeline Parallel** | `--pipeline-model-parallel-size` | Coming Soon | |
 | **Expert Parallel** | `--expert-model-parallel-size` | Coming Soon | |
diff --git a/docs/zh/get_started/usage.md b/docs/zh/get_started/usage.md
index d9d12a731..a27d1009a 100644
--- a/docs/zh/get_started/usage.md
+++ b/docs/zh/get_started/usage.md
@@ -314,7 +314,7 @@ if __name__ == "__main__":
 
 slime 同样也支持FSDP2作为训练后端,可以参考[文档](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/slime/fsdp/readme.md)。
 
-> FSDP 通过 `AutoModelForCausalLM.from_pretrained()` 自动读取所有架构信息,无需手动指定。Megatron 需要手动配置参数读取 model 架构信息,或者通过 `--use-hf-config-for-megatron` 实现自动推断, FSDP可以全部从 `config.json` 自动读取,可以直接避免权重格式转换步骤。
+> FSDP 通过 `AutoModelForCausalLM.from_pretrained()` 自动读取所有架构信息,无需手动指定。Megatron 需要手动配置参数读取 model 架构信息;FSDP 可以全部从 `config.json` 自动读取,直接避免权重格式转换步骤。
 
 可以通过在命令行传递 `--train-backend fsdp` 来启动 FSDP 作为训练后端。
 
@@ -324,7 +324,7 @@ FSDP和Megatron后端支持的参数的对比如下表所示,接下来FSDP会
 
 | 配置类别 | Megatron 参数 | FSDP 参数 | 说明 |
 | --- | --- | --- | --- |
-| **模型加载** | `--load` (Megatron checkpoint) + 架构参数 (`--num-layers`, `--hidden-size` 等) 或 `--use-hf-config-for-megatron` | `--hf-checkpoint` (必需) | **FSDP**: 直接使用 HuggingFace 格式,无需转换权重,通过 `AutoConfig` 自动推断架构 |
+| **模型加载** | `--load` (Megatron checkpoint) + 架构参数 (`--num-layers`, `--hidden-size` 等) | `--hf-checkpoint` (必需) | **FSDP**: 直接使用 HuggingFace 格式,无需转换权重,通过 `AutoConfig` 自动推断架构 |
 | **张量并行** | `--tensor-model-parallel-size` | Coming Soon | |
 | **流水线并行** | `--pipeline-model-parallel-size` | Coming Soon | |
 | **专家并行** | `--expert-model-parallel-size` | Coming Soon | |
diff --git a/slime/backends/megatron_utils/config_mapping/__init__.py b/slime/backends/megatron_utils/config_mapping/__init__.py
deleted file mode 100644
index cc8ebc132..000000000
--- a/slime/backends/megatron_utils/config_mapping/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from .registry import mapper_registry, register_mapper
-
-
-def get_mapper(name: str):
-    return mapper_registry.get_mapper(name)
-
-
-__all__ = [
-    "register_mapper",
-    "mapper_registry",
-    "get_mapper",
-]
diff --git a/slime/backends/megatron_utils/config_mapping/predefined_config_mappers.py b/slime/backends/megatron_utils/config_mapping/predefined_config_mappers.py
deleted file mode 100644
index 8f092d9ae..000000000
--- a/slime/backends/megatron_utils/config_mapping/predefined_config_mappers.py
+++ /dev/null
@@ -1,128 +0,0 @@
-from collections import namedtuple
-import torch.nn.functional as F
-from transformers import PretrainedConfig
-from .registry import register_mapper
-
-
-MegatronModelConfig = namedtuple("MegatronModelConfig", ["transformer_config", "gpt_model_args"])
-
-
-def _get_activation_func(name: str):
-    if name == "silu":
-        return F.silu
-    elif name == "gelu":
-        return F.gelu
-    else:
-        raise ValueError(f"Unsupported activation function: {name}")
-
-
-def _to_n_args(value):
-    if isinstance(value, list):
-        return value
-    return [value]
-
-
-def _map_common_configs(hf_config: PretrainedConfig) -> MegatronModelConfig:
-    rope_scaling_args = {}
-    if "rope_scaling" in hf_config and hf_config.rope_scaling is not None:
-        rope_scaling_args["seq_len_interpolation_factor"] = hf_config.rope_scaling["factor"]
-    return MegatronModelConfig(
-        transformer_config={
-            # Model architecture parameters
-            "num_layers": hf_config.num_hidden_layers,
"hidden_size": hf_config.hidden_size, - "num_attention_heads": hf_config.num_attention_heads, - "num_query_groups": hf_config.num_key_value_heads, - "ffn_hidden_size": hf_config.intermediate_size, - "kv_channels": getattr(hf_config, "head_dim", None), - "layernorm_epsilon": hf_config.rms_norm_eps, - # Activation and normalization - "activation_func": _get_activation_func(hf_config.hidden_act), - "normalization": "RMSNorm", - "gated_linear_unit": True, - }, - gpt_model_args={ - "vocab_size": hf_config.vocab_size, - "rotary_base": hf_config.rope_theta, - "position_embedding_type": "rope", - "untie_embeddings_and_output_weights": not hf_config.tie_word_embeddings, - }, - ) - - -@register_mapper("qwen2") -def qwen2_config_mapper(hf_config: PretrainedConfig) -> MegatronModelConfig: - mapped_config = _map_common_configs(hf_config) - mapped_config.transformer_config.update( - { - "add_bias_linear": False, - "add_qkv_bias": hf_config.attention_bias, - } - ) - - return mapped_config - - -@register_mapper("qwen3") -def qwen3_config_mapper(hf_config: PretrainedConfig) -> MegatronModelConfig: - mapped_config = _map_common_configs(hf_config) - mapped_config.transformer_config.update( - { - "add_bias_linear": False, - "add_qkv_bias": hf_config.attention_bias, - "qk_layernorm": True, - } - ) - - return mapped_config - - -@register_mapper("qwen3_moe") -def qwen3_moe_config_mapper(hf_config: PretrainedConfig) -> MegatronModelConfig: - mapped_config = _map_common_configs(hf_config) - mapped_config.transformer_config.update( - { - "add_bias_linear": False, - "add_qkv_bias": hf_config.attention_bias, - "moe_ffn_hidden_size": hf_config.moe_intermediate_size, - "moe_router_topk": hf_config.num_experts_per_tok, - "num_moe_experts": hf_config.num_experts, - "moe_aux_loss_coeff": _to_n_args(hf_config.router_aux_loss_coef), - "moe_router_load_balancing_type": _to_n_args("none"), # turn off aux_loss as it hurts perf in RL - "moe_router_score_function": "softmax", - "moe_router_pre_softmax": False, - "qk_layernorm": True, - } - ) - - return mapped_config - - -@register_mapper("glm4_moe") -def glm4_moe_config_mapper(hf_config: PretrainedConfig) -> MegatronModelConfig: - moe_layer_freq = [1] * hf_config.num_hidden_layers - for i in range(min(hf_config.first_k_dense_replace, hf_config.num_hidden_layers)): - moe_layer_freq[i] = 0 - - mapped_config = _map_common_configs(hf_config) - mapped_config.transformer_config.update( - { - "add_bias_linear": False, - "qk_layernorm": hf_config.use_qk_norm, - "add_qkv_bias": hf_config.attention_bias, - "moe_ffn_hidden_size": hf_config.moe_intermediate_size, - "moe_router_topk": hf_config.num_experts_per_tok, - "moe_router_topk_scaling_factor": hf_config.routed_scaling_factor, - "moe_router_dtype": "fp32", - "num_moe_experts": hf_config.num_experts, - "moe_router_enable_expert_bias": True, - "moe_layer_freq": moe_layer_freq, - "moe_router_bias_update_rate": 0.0, - "moe_aux_loss_coeff": _to_n_args(hf_config.router_aux_loss_coef), - "moe_router_load_balancing_type": _to_n_args("seq_aux_loss"), - "moe_router_score_function": "sigmoid", - "rotary_percent": hf_config.partial_rotary_factor, - } - ) - - return mapped_config diff --git a/slime/backends/megatron_utils/config_mapping/registry.py b/slime/backends/megatron_utils/config_mapping/registry.py deleted file mode 100644 index ebc2677f2..000000000 --- a/slime/backends/megatron_utils/config_mapping/registry.py +++ /dev/null @@ -1,55 +0,0 @@ -import logging -from collections.abc import Callable - -logger = logging.getLogger(__name__) - - 
-class MapperRegistry:
-    """
-    Registry for config mappers.
-    """
-
-    def __init__(self):
-        self._mappers: dict[str, Callable] = {}
-
-    def register(self, model_types: list[str], mapper_func: Callable):
-        if not callable(mapper_func):
-            raise TypeError(f"Mapper for {model_types} must be callable")
-
-        for name in model_types:
-            if name in self._mappers:
-                logger.warning(f"Mapper for {name} is being overridden")
-            self._mappers[name] = mapper_func
-            logger.info(f"Registered config mapper for model type: {name}")
-
-    def get_mapper(self, name: str) -> Callable:
-        """
-        Get the mapper by model_type.
-        """
-        if name not in self._mappers:
-            raise ValueError(f"Mapper for {name} is not registered.")
-        return self._mappers[name]
-
-    def list_registered_mappers(self) -> list[str]:
-        return list(self._mappers.keys())
-
-
-# Global registry instance
-mapper_registry = MapperRegistry()
-
-
-def register_mapper(*args):
-    """
-    Decorator: register config mapper.
-
-    Args: suppotred model_types.
-    """
-
-    def decorator(func: Callable):
-        mapper_registry.register(
-            model_types=list(args),
-            mapper_func=func,
-        )
-        return func
-
-    return decorator
diff --git a/slime/utils/arguments.py b/slime/utils/arguments.py
index 0d696c402..fa4efe465 100644
--- a/slime/utils/arguments.py
+++ b/slime/utils/arguments.py
@@ -170,11 +170,6 @@ def add_rollout_arguments(parser):
             "It doesn't necessary need to contain the most up-to-date parameters."
         ),
     )
-    parser.add_argument(
-        "--use-hf-config-for-megatron",
-        action="store_true",
-        help="Whether to use HF config for Megatron core to define the model architecture.",
-    )
     parser.add_argument(
         "--model-name",
         type=str,
@@ -1295,12 +1290,6 @@ def parse_args(add_custom_arguments=None):
     args = megatron_parse_args(extra_args_provider=add_slime_arguments)
     if args.hf_checkpoint:
         hf_config = AutoConfig.from_pretrained(args.hf_checkpoint, trust_remote_code=True)
-        if args.use_hf_config_for_megatron:
-            from slime.backends.megatron_utils.config_mapping import get_mapper
-
-            megatron_config_from_hf = get_mapper(hf_config.model_type)(hf_config)
-            _validate_and_update_megatron_args_from_hf(args, megatron_config_from_hf.transformer_config)
-            _validate_and_update_megatron_args_from_hf(args, megatron_config_from_hf.gpt_model_args)
         hf_validate_args(args, hf_config)
 
     args.rank = 0
@@ -1614,12 +1603,3 @@ def equal(x, y):
 
     if len(errors) > 0:
         raise AssertionError("hf_validate_args failed: " + "; ".join(errors))
-
-
-def _validate_and_update_megatron_args_from_hf(args, args_from_hf_config: dict[str, Any]):
-    for key, value in args_from_hf_config.items():
-        if hasattr(args, key) and getattr(args, key) != value:
-            raise ValueError(
-                f"Argument {key} is not consistent. {key} in args is {getattr(args, key)}, but from HF config is {value}."
-            )
-        setattr(args, key, value)
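
Usage sketch for the FSDP path described in the docs above: a minimal Python snippet showing how the model architecture is read straight from a HuggingFace checkpoint's config.json via AutoConfig / AutoModelForCausalLM, with no Megatron-style architecture flags and no weight conversion. The checkpoint path is hypothetical; the attribute names are the ones the removed mapper code read from the HF config.

# Minimal sketch (hypothetical checkpoint path): FSDP-style model loading that
# derives the architecture from config.json instead of --num-layers/--hidden-size flags.
from transformers import AutoConfig, AutoModelForCausalLM

hf_checkpoint = "/path/to/hf_checkpoint"  # hypothetical, e.g. a Qwen3 HF checkpoint

# Architecture comes entirely from config.json -- no manual specification needed.
hf_config = AutoConfig.from_pretrained(hf_checkpoint, trust_remote_code=True)
print(hf_config.num_hidden_layers, hf_config.hidden_size, hf_config.num_key_value_heads)

# Weights load directly in HuggingFace format; no Megatron checkpoint conversion step.
model = AutoModelForCausalLM.from_pretrained(hf_checkpoint, trust_remote_code=True)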