diff --git a/docs/en/get_started/usage.md b/docs/en/get_started/usage.md
index 507e387bb..276d6ceff 100644
--- a/docs/en/get_started/usage.md
+++ b/docs/en/get_started/usage.md
@@ -315,7 +315,7 @@ In some customized Megatron implementations, special operations need to be perfo
 
 slime also support FSDP2 as the training backend, docs [here](https://lmsys.org/blog/2025-12-03-miles-fsdp/).
 
-> FSDP automatically reads all architecture information via `AutoModelForCausalLM.from_pretrained()`, without manual specification. Megatron requires manual configuration of parameters to read model architecture information, or automatic inference via `--use-hf-config-for-megatron`. FSDP can read entirely from `config.json`, directly avoiding the weight format conversion step.
+> FSDP automatically reads all architecture information via `AutoModelForCausalLM.from_pretrained()`, without manual specification. Megatron requires manual configuration of parameters to read model architecture information. FSDP can read everything it needs from `config.json`, avoiding the weight format conversion step entirely.
 
 To run FSDP as the training backend, pass `--train-backend fsdp` to enable.
 
@@ -325,7 +325,7 @@ Parameters that FSDP used are shown as below in comparison to Megatron, more sup
 
 | Configuration Category | Megatron Parameter | FSDP Parameter | Description |
 | --- | --- | --- | --- |
-| **Model Loading** | `--load` (Megatron checkpoint) + architecture args (`--num-layers`, `--hidden-size` etc.) or `--use-hf-config-for-megatron` | `--hf-checkpoint` (Required) | **FSDP**: Directly uses HuggingFace format, no weight conversion needed, architecture inferred via `AutoConfig` |
+| **Model Loading** | `--load` (Megatron checkpoint) + architecture args (`--num-layers`, `--hidden-size` etc.) | `--hf-checkpoint` (Required) | **FSDP**: Directly uses HuggingFace format, no weight conversion needed, architecture inferred via `AutoConfig` |
 | **Tensor Parallel** | `--tensor-model-parallel-size` | Coming Soon | |
 | **Pipeline Parallel** | `--pipeline-model-parallel-size` | Coming Soon | |
 | **Expert Parallel** | `--expert-model-parallel-size` | Coming Soon | |
diff --git a/docs/zh/get_started/usage.md b/docs/zh/get_started/usage.md
index d9d12a731..a27d1009a 100644
--- a/docs/zh/get_started/usage.md
+++ b/docs/zh/get_started/usage.md
@@ -314,7 +314,7 @@ if __name__ == "__main__":
 
 slime 同样也支持FSDP2作为训练后端,可以参考[文档](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/slime/fsdp/readme.md)。
 
-> FSDP 通过 `AutoModelForCausalLM.from_pretrained()` 自动读取所有架构信息,无需手动指定。Megatron 需要手动配置参数读取 model 架构信息,或者通过 `--use-hf-config-for-megatron` 实现自动推断, FSDP可以全部从 `config.json` 自动读取,可以直接避免权重格式转换步骤。
+> FSDP 通过 `AutoModelForCausalLM.from_pretrained()` 自动读取所有架构信息,无需手动指定。Megatron 需要手动配置参数读取 model 架构信息;FSDP 可以全部从 `config.json` 自动读取,直接避免权重格式转换步骤。
 
 可以通过在命令行传递 `--train-backend fsdp` 来启动 FSDP 作为训练后端。
 
@@ -324,7 +324,7 @@ FSDP和Megatron后端支持的参数的对比如下表所示,接下来FSDP会
 
 | 配置类别 | Megatron 参数 | FSDP 参数 | 说明 |
 | --- | --- | --- | --- |
-| **模型加载** | `--load` (Megatron checkpoint) + 架构参数 (`--num-layers`, `--hidden-size` 等) 或 `--use-hf-config-for-megatron` | `--hf-checkpoint` (必需) | **FSDP**: 直接使用 HuggingFace 格式,无需转换权重,通过 `AutoConfig` 自动推断架构 |
+| **模型加载** | `--load` (Megatron checkpoint) + 架构参数 (`--num-layers`, `--hidden-size` 等) | `--hf-checkpoint` (必需) | **FSDP**: 直接使用 HuggingFace 格式,无需转换权重,通过 `AutoConfig` 自动推断架构 |
 | **张量并行** | `--tensor-model-parallel-size` | Coming Soon | |
 | **流水线并行** | `--pipeline-model-parallel-size` | Coming Soon | |
 | **专家并行** | `--expert-model-parallel-size` | Coming Soon | |
diff --git a/slime/backends/megatron_utils/config_mapping/__init__.py b/slime/backends/megatron_utils/config_mapping/__init__.py
deleted file mode 100644
index cc8ebc132..000000000
--- a/slime/backends/megatron_utils/config_mapping/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from .registry import mapper_registry, register_mapper
-
-
-def get_mapper(name: str):
-    return mapper_registry.get_mapper(name)
-
-
-__all__ = [
-    "register_mapper",
-    "mapper_registry",
-    "get_mapper",
-]
diff --git a/slime/backends/megatron_utils/config_mapping/predefined_config_mappers.py b/slime/backends/megatron_utils/config_mapping/predefined_config_mappers.py
deleted file mode 100644
index 8f092d9ae..000000000
--- a/slime/backends/megatron_utils/config_mapping/predefined_config_mappers.py
+++ /dev/null
@@ -1,128 +0,0 @@
-from collections import namedtuple
-import torch.nn.functional as F
-from transformers import PretrainedConfig
-from .registry import register_mapper
-
-
-MegatronModelConfig = namedtuple("MegatronModelConfig", ["transformer_config", "gpt_model_args"])
-
-
-def _get_activation_func(name: str):
-    if name == "silu":
-        return F.silu
-    elif name == "gelu":
-        return F.gelu
-    else:
-        raise ValueError(f"Unsupported activation function: {name}")
-
-
-def _to_n_args(value):
-    if isinstance(value, list):
-        return value
-    return [value]
-
-
-def _map_common_configs(hf_config: PretrainedConfig) -> MegatronModelConfig:
-    rope_scaling_args = {}
-    if "rope_scaling" in hf_config and hf_config.rope_scaling is not None:
-        rope_scaling_args["seq_len_interpolation_factor"] = hf_config.rope_scaling["factor"]
-    return MegatronModelConfig(
-        transformer_config={
-            # Model architecture parameters
-            "num_layers": hf_config.num_hidden_layers,
"hidden_size": hf_config.hidden_size, - "num_attention_heads": hf_config.num_attention_heads, - "num_query_groups": hf_config.num_key_value_heads, - "ffn_hidden_size": hf_config.intermediate_size, - "kv_channels": getattr(hf_config, "head_dim", None), - "layernorm_epsilon": hf_config.rms_norm_eps, - # Activation and normalization - "activation_func": _get_activation_func(hf_config.hidden_act), - "normalization": "RMSNorm", - "gated_linear_unit": True, - }, - gpt_model_args={ - "vocab_size": hf_config.vocab_size, - "rotary_base": hf_config.rope_theta, - "position_embedding_type": "rope", - "untie_embeddings_and_output_weights": not hf_config.tie_word_embeddings, - }, - ) - - -@register_mapper("qwen2") -def qwen2_config_mapper(hf_config: PretrainedConfig) -> MegatronModelConfig: - mapped_config = _map_common_configs(hf_config) - mapped_config.transformer_config.update( - { - "add_bias_linear": False, - "add_qkv_bias": hf_config.attention_bias, - } - ) - - return mapped_config - - -@register_mapper("qwen3") -def qwen3_config_mapper(hf_config: PretrainedConfig) -> MegatronModelConfig: - mapped_config = _map_common_configs(hf_config) - mapped_config.transformer_config.update( - { - "add_bias_linear": False, - "add_qkv_bias": hf_config.attention_bias, - "qk_layernorm": True, - } - ) - - return mapped_config - - -@register_mapper("qwen3_moe") -def qwen3_moe_config_mapper(hf_config: PretrainedConfig) -> MegatronModelConfig: - mapped_config = _map_common_configs(hf_config) - mapped_config.transformer_config.update( - { - "add_bias_linear": False, - "add_qkv_bias": hf_config.attention_bias, - "moe_ffn_hidden_size": hf_config.moe_intermediate_size, - "moe_router_topk": hf_config.num_experts_per_tok, - "num_moe_experts": hf_config.num_experts, - "moe_aux_loss_coeff": _to_n_args(hf_config.router_aux_loss_coef), - "moe_router_load_balancing_type": _to_n_args("none"), # turn off aux_loss as it hurts perf in RL - "moe_router_score_function": "softmax", - "moe_router_pre_softmax": False, - "qk_layernorm": True, - } - ) - - return mapped_config - - -@register_mapper("glm4_moe") -def glm4_moe_config_mapper(hf_config: PretrainedConfig) -> MegatronModelConfig: - moe_layer_freq = [1] * hf_config.num_hidden_layers - for i in range(min(hf_config.first_k_dense_replace, hf_config.num_hidden_layers)): - moe_layer_freq[i] = 0 - - mapped_config = _map_common_configs(hf_config) - mapped_config.transformer_config.update( - { - "add_bias_linear": False, - "qk_layernorm": hf_config.use_qk_norm, - "add_qkv_bias": hf_config.attention_bias, - "moe_ffn_hidden_size": hf_config.moe_intermediate_size, - "moe_router_topk": hf_config.num_experts_per_tok, - "moe_router_topk_scaling_factor": hf_config.routed_scaling_factor, - "moe_router_dtype": "fp32", - "num_moe_experts": hf_config.num_experts, - "moe_router_enable_expert_bias": True, - "moe_layer_freq": moe_layer_freq, - "moe_router_bias_update_rate": 0.0, - "moe_aux_loss_coeff": _to_n_args(hf_config.router_aux_loss_coef), - "moe_router_load_balancing_type": _to_n_args("seq_aux_loss"), - "moe_router_score_function": "sigmoid", - "rotary_percent": hf_config.partial_rotary_factor, - } - ) - - return mapped_config diff --git a/slime/backends/megatron_utils/config_mapping/registry.py b/slime/backends/megatron_utils/config_mapping/registry.py deleted file mode 100644 index ebc2677f2..000000000 --- a/slime/backends/megatron_utils/config_mapping/registry.py +++ /dev/null @@ -1,55 +0,0 @@ -import logging -from collections.abc import Callable - -logger = logging.getLogger(__name__) - - 
-class MapperRegistry:
-    """
-    Registry for config mappers.
-    """
-
-    def __init__(self):
-        self._mappers: dict[str, Callable] = {}
-
-    def register(self, model_types: list[str], mapper_func: Callable):
-        if not callable(mapper_func):
-            raise TypeError(f"Mapper for {model_types} must be callable")
-
-        for name in model_types:
-            if name in self._mappers:
-                logger.warning(f"Mapper for {name} is being overridden")
-            self._mappers[name] = mapper_func
-            logger.info(f"Registered config mapper for model type: {name}")
-
-    def get_mapper(self, name: str) -> Callable:
-        """
-        Get the mapper by model_type.
-        """
-        if name not in self._mappers:
-            raise ValueError(f"Mapper for {name} is not registered.")
-        return self._mappers[name]
-
-    def list_registered_mappers(self) -> list[str]:
-        return list(self._mappers.keys())
-
-
-# Global registry instance
-mapper_registry = MapperRegistry()
-
-
-def register_mapper(*args):
-    """
-    Decorator: register config mapper.
-
-    Args: suppotred model_types.
-    """
-
-    def decorator(func: Callable):
-        mapper_registry.register(
-            model_types=list(args),
-            mapper_func=func,
-        )
-        return func
-
-    return decorator
diff --git a/slime/utils/arguments.py b/slime/utils/arguments.py
index 0d696c402..fa4efe465 100644
--- a/slime/utils/arguments.py
+++ b/slime/utils/arguments.py
@@ -170,11 +170,6 @@ def add_rollout_arguments(parser):
             "It doesn't necessary need to contain the most up-to-date parameters."
         ),
     )
-    parser.add_argument(
-        "--use-hf-config-for-megatron",
-        action="store_true",
-        help="Whether to use HF config for Megatron core to define the model architecture.",
-    )
     parser.add_argument(
         "--model-name",
         type=str,
@@ -1295,12 +1290,6 @@ def parse_args(add_custom_arguments=None):
     args = megatron_parse_args(extra_args_provider=add_slime_arguments)
     if args.hf_checkpoint:
         hf_config = AutoConfig.from_pretrained(args.hf_checkpoint, trust_remote_code=True)
-        if args.use_hf_config_for_megatron:
-            from slime.backends.megatron_utils.config_mapping import get_mapper
-
-            megatron_config_from_hf = get_mapper(hf_config.model_type)(hf_config)
-            _validate_and_update_megatron_args_from_hf(args, megatron_config_from_hf.transformer_config)
-            _validate_and_update_megatron_args_from_hf(args, megatron_config_from_hf.gpt_model_args)
         hf_validate_args(args, hf_config)
 
     args.rank = 0
@@ -1614,12 +1603,3 @@ def equal(x, y):
 
     if len(errors) > 0:
         raise AssertionError("hf_validate_args failed: " + "; ".join(errors))
-
-
-def _validate_and_update_megatron_args_from_hf(args, args_from_hf_config: dict[str, Any]):
-    for key, value in args_from_hf_config.items():
-        if hasattr(args, key) and getattr(args, key) != value:
-            raise ValueError(
-                f"Argument {key} is not consistent. {key} in args is {getattr(args, key)}, but from HF config is {value}."
-            )
-        setattr(args, key, value)
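
Usage sketch for the FSDP path described in the docs above: a minimal Python snippet showing how the model architecture is read straight from a HuggingFace checkpoint's config.json via AutoConfig / AutoModelForCausalLM, with no Megatron-style architecture flags and no weight conversion. The checkpoint path is hypothetical; the attribute names are the ones the removed mapper code read from the HF config.

# Minimal sketch (hypothetical checkpoint path): FSDP-style model loading that
# derives the architecture from config.json instead of --num-layers/--hidden-size flags.
from transformers import AutoConfig, AutoModelForCausalLM

hf_checkpoint = "/path/to/hf_checkpoint"  # hypothetical, e.g. a Qwen3 HF checkpoint

# Architecture comes entirely from config.json -- no manual specification needed.
hf_config = AutoConfig.from_pretrained(hf_checkpoint, trust_remote_code=True)
print(hf_config.num_hidden_layers, hf_config.hidden_size, hf_config.num_key_value_heads)

# Weights load directly in HuggingFace format; no Megatron checkpoint conversion step.
model = AutoModelForCausalLM.from_pretrained(hf_checkpoint, trust_remote_code=True)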