@@ -667,16 +667,24 @@ def __init__(
             eps=config.rms_norm_eps)
         if config.attention_type == 0:
             self.layernorm_attention_alpha = getattr(
-                config, 'layernorm_linear_attention_alpha', 1)
+                config, 'layernorm_linear_attention_alpha',
+                getattr(config, 'linear_attn_alpha_factor', 1))
             self.layernorm_attention_beta = getattr(
-                config, 'layernorm_linear_attention_beta', 1)
+                config, 'layernorm_linear_attention_beta',
+                getattr(config, 'linear_attn_beta_factor', 1))
         else:
             self.layernorm_attention_alpha = getattr(
-                config, 'layernorm_full_attention_alpha', 1)
+                config, 'layernorm_full_attention_alpha',
+                getattr(config, 'full_attn_alpha_factor', 1))
             self.layernorm_attention_beta = getattr(
-                config, 'layernorm_full_attention_beta', 1)
-        self.layernorm_mlp_alpha = getattr(config, 'layernorm_mlp_alpha', 1)
-        self.layernorm_mlp_beta = getattr(config, 'layernorm_mlp_beta', 1)
+                config, 'layernorm_full_attention_beta',
+                getattr(config, 'full_attn_beta_factor', 1))
+        self.layernorm_mlp_alpha = getattr(
+            config, 'layernorm_mlp_alpha',
+            getattr(config, 'mlp_alpha_factor', 1))
+        self.layernorm_mlp_beta = getattr(
+            config, 'layernorm_mlp_beta', getattr(config, 'mlp_beta_factor',
+                                                  1))
         self.postnorm = getattr(config, 'postnorm', False)
         self.shared_moe = False
 
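The nested `getattr` calls above give a three-step fallback: the legacy MiniMax config name is preferred, then the newer HF-style `*_factor` name, then a default of 1. A minimal standalone sketch (the `SimpleNamespace` configs are hypothetical, not part of this change) showing how the chain resolves:

```python
from types import SimpleNamespace

def resolve_alpha(config):
    # Same fallback chain as the patch: legacy name, then HF factor name, then 1.
    return getattr(config, 'layernorm_full_attention_alpha',
                   getattr(config, 'full_attn_alpha_factor', 1))

legacy_cfg = SimpleNamespace(layernorm_full_attention_alpha=3.5)
hf_cfg = SimpleNamespace(full_attn_alpha_factor=3.5)
bare_cfg = SimpleNamespace()

print(resolve_alpha(legacy_cfg))  # 3.5 -> legacy attribute wins when present
print(resolve_alpha(hf_cfg))      # 3.5 -> falls back to the HF factor name
print(resolve_alpha(bare_cfg))    # 1   -> neither attribute set, use the default
```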
@@ -794,6 +802,18 @@ def __init__(
         self.decoder_attention_types = getattr(
             config, "attn_type_list", False) or getattr(
                 config, "decoder_attention_types", False)
+        # The HF format uses "layer_types" instead of "attn_type_list"
+        # where "linear_attention" is 0 and "full_attention" is 1
+        if not self.decoder_attention_types and hasattr(config, "layer_types"):
+            self.decoder_attention_types = []
+            for layer_type in config.layer_types:
+                if layer_type == "linear_attention":
+                    self.decoder_attention_types.append(0)
+                elif layer_type == "full_attention":
+                    self.decoder_attention_types.append(1)
+                else:
+                    raise ValueError(f"Unsupported layer type: {layer_type}")
+        # Default to full attention
         if not self.decoder_attention_types:
             self.decoder_attention_types = [1] * config.num_hidden_layers
         self.num_layers = config.num_hidden_layers
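The new branch normalizes HF-style `layer_types` strings into the 0/1 codes the rest of the file expects. A small sketch of the same mapping (the `layer_types` list here is hypothetical), using a lookup table instead of the if/elif chain; an unknown type would surface as a `KeyError` rather than the patch's `ValueError`:

```python
# Hypothetical HF-style config value; 0 = linear attention, 1 = full attention.
layer_types = ["linear_attention", "full_attention", "linear_attention"]

_LAYER_TYPE_TO_CODE = {"linear_attention": 0, "full_attention": 1}
decoder_attention_types = [_LAYER_TYPE_TO_CODE[t] for t in layer_types]

assert decoder_attention_types == [0, 1, 0]
```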
@@ -1022,8 +1042,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         else:
             self.lm_head = PPMissingLayer()
         self.lm_head.float()
-        flash_layer_count = sum(1 for attn_type in self.config.attn_type_list
-                                if attn_type == 1)
+        flash_layer_count = sum(
+            1 for attn_type in self.model.decoder_attention_types
+            if attn_type == 1)
         self.kv_cache = [torch.tensor([]) for _ in range(flash_layer_count)]
         return
 
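`flash_layer_count` now comes from the normalized `decoder_attention_types` on the model rather than `config.attn_type_list`, so it also works for checkpoints that only provide `layer_types`. A sketch with a hypothetical layer layout, showing that the KV-cache list ends up with one slot per full-attention layer:

```python
import torch

# Hypothetical normalized layout: 0 = linear attention, 1 = full attention.
decoder_attention_types = [0, 0, 1, 0, 1]

flash_layer_count = sum(1 for attn_type in decoder_attention_types
                        if attn_type == 1)
kv_cache = [torch.tensor([]) for _ in range(flash_layer_count)]

assert flash_layer_count == 2 and len(kv_cache) == 2
```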
@@ -1085,9 +1106,10 @@ def which_layer(name: str) -> int:
             return None
 
         def is_linear_attn_layer(layer_idx: int) -> bool:
-            if layer_idx is None or not hasattr(self.config, "attn_type_list"):
+            if layer_idx is None or layer_idx >= len(
+                    self.model.decoder_attention_types):
                 return False
-            return self.config.attn_type_list[layer_idx] == 0
+            return self.model.decoder_attention_types[layer_idx] == 0
 
         def is_moe_weight(name: str) -> bool:
             return "block_sparse_moe" in name and not name.endswith(".bias")
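The length guard replaces the old `hasattr(self.config, "attn_type_list")` check, so an out-of-range index is treated as "not a linear-attention layer" instead of raising. A standalone sketch with hypothetical inputs:

```python
decoder_attention_types = [0, 1, 0]  # hypothetical: 0 = linear, 1 = full

def is_linear_attn_layer(layer_idx):
    if layer_idx is None or layer_idx >= len(decoder_attention_types):
        return False
    return decoder_attention_types[layer_idx] == 0

assert is_linear_attn_layer(0) is True
assert is_linear_attn_layer(1) is False
assert is_linear_attn_layer(7) is False    # out of range -> safely False
assert is_linear_attn_layer(None) is False
```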
@@ -1275,7 +1297,7 @@ def load_basic_weight(name: str, loaded_weight: torch.Tensor,
         for name, loaded_weight in weights:
             weight_at_layer = which_layer(name)
             if weight_at_layer and weight_at_layer >= len(
-                    self.config.attn_type_list):
+                    self.model.decoder_attention_types):
                 continue
 
             if is_layer_norm_weight(name):
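During weight loading, any tensor whose layer index falls beyond the configured layer count is skipped, and the bound now comes from `decoder_attention_types` as well. A simplified sketch (the `which_layer` parser and weight names below are hypothetical stand-ins for the real helpers):

```python
import re

decoder_attention_types = [0, 1]  # hypothetical: only two decoder layers configured

def which_layer(name):
    # Hypothetical stand-in: pull the index out of "model.layers.<idx>." names.
    match = re.search(r"layers\.(\d+)\.", name)
    return int(match.group(1)) if match else None

weights = ["model.layers.0.attn.weight", "model.layers.3.attn.weight",
           "lm_head.weight"]
kept = []
for name in weights:
    weight_at_layer = which_layer(name)
    if weight_at_layer and weight_at_layer >= len(decoder_attention_types):
        continue  # layer index beyond the configured stack -> skip this weight
    kept.append(name)

assert kept == ["model.layers.0.attn.weight", "lm_head.weight"]
```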