
Commit 189a103

liuzijing2014, pcuenca, and luccafong committed
* 128 experts * Use default rope * Unfuse mlp * Address feedback * Use None "default" for rope_scaling. Add eot. * Meta/llama quant compat (#7) * add quant compatible model & conversion code for llama4 * fix a few issues * fix a few issues * minor type mapping fix --------- Co-authored-by: Lu Fang <fanglu@fb.com> * use a new config parameter to determine which model definition to use for MoE --------- Co-authored-by: Pedro Cuenca <pedro@huggingface.co> Co-authored-by: Lu Fang <fanglu@fb.com>
1 parent fb748af commit 189a103

4 files changed, +48 −14 lines changed


src/transformers/modeling_utils.py

Lines changed: 2 additions & 1 deletion
@@ -529,6 +529,7 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
     "F32": torch.float32,
     "F64": torch.float64,
     "I64": torch.int64,
+    "F8_E4M3": torch.float8_e4m3fn
 }
 
 if is_torch_greater_or_equal("2.3.0"):

@@ -4061,7 +4062,7 @@ def from_pretrained(
         if not torch.distributed.is_initialized():
             try:
                 rank = int(os.environ["LOCAL_RANK"])
-                world_size = int(os.environ["ROLE_WORLD_SIZE"])
+                world_size = int(os.environ["ROLE_WORLD_SIZE"])
                 logger.warning(
                     "Tensor Parallel requires torch.distributed to be initialized first."
                     f"Initializing with world size {world_size} on rank {rank}"

src/transformers/models/llama4/configuration_llama4.py

Lines changed: 6 additions & 0 deletions
@@ -177,6 +177,7 @@ def __init__(
         router_aux_loss_coef=0.001,
         router_jitter_noise=0.0,
         rope_scaling=None,
+        for_llm_compressor=False,
         **kwargs,
     ):
         super().__init__(

@@ -217,6 +218,8 @@ def __init__(
         self.router_aux_loss_coef = router_aux_loss_coef
         self.router_jitter_noise = router_jitter_noise
 
+        self.for_llm_compressor = for_llm_compressor
+
 
 class Llama4Config(PretrainedConfig):
     r"""

@@ -290,6 +293,9 @@ class Llama4Config(PretrainedConfig):
             The aux loss factor for the total loss.
         router_jitter_noise (`float`, *optional*, defaults to 0.0):
             Amount of noise to add to the router.
+        for_llm_compressor (`bool`, *optional*, defaults to `False`):
+            Whether this config is for a checkpoint that will be quantized to fp8 with LLM Compressor.
+            If `True`, the MoE part of the model uses `Linear` layers instead of the fused MoE implementation.
 
     ```python
     >>> from transformers import Llama4Model, Llama4Config
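For context, a hedged usage sketch of the new flag. The text-config class name `Llama4TextConfig` is assumed from the surrounding file, and the flag only exists on builds that include this commit:

```python
from transformers.models.llama4.configuration_llama4 import Llama4TextConfig

# Assumed class name; `for_llm_compressor` is only accepted on builds with this commit.
config = Llama4TextConfig(for_llm_compressor=True)

# Downstream, the MoE module checks this flag to decide between per-expert
# Linear-based MLPs (LLM Compressor friendly) and the fused expert module.
print(config.for_llm_compressor)  # True
```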

src/transformers/models/llama4/convert_llama4_weights_to_hf.py

Lines changed: 24 additions & 8 deletions
@@ -21,6 +21,8 @@
 from transformers.integrations.tiktoken import TikTokenConverter
 
 
+_OFFLINE_QUANT_COMPATIBLE = os.environ.get("OFFLINE_QUANT_COMPATIBLE", "0") == "1"
+
 torch.serialization.add_safe_globals([io.BytesIO])
 # fmt: off
 

@@ -29,6 +31,8 @@
 # Still not sure what to do with those!
 # `None` means we drop the key
 
+
+weight_postfix = ".weight" if _OFFLINE_QUANT_COMPATIBLE else ""
 ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
     # CausalLM keys
     r"output.weight": r"language_model.lm_head.weight",

@@ -44,9 +48,9 @@
     r"layers.(\d+).attention.wqkv.weight": r"language_model.model.layers.\1.self_attn.qkv_proj.weight",
 
     # MoE keys: no simple MLP model.
-    r"layers.(\d+).feed_forward.experts.moe_w_in_eD_F": r"language_model.model.layers.\1.feed_forward.experts.gate_proj",  # will be fused with up
-    r"layers.(\d+).feed_forward.experts.moe_w_out_eF_D": r"language_model.model.layers.\1.feed_forward.experts.down_proj",  # expert win
-    r"layers.(\d+).feed_forward.experts.moe_w_swiglu_eD_F": r"language_model.model.layers.\1.feed_forward.experts.up_proj",  # fused with up
+    r"layers.(\d+).feed_forward.experts.moe_w_in_eD_F": r"language_model.model.layers.\1.feed_forward.experts.gate_proj" + weight_postfix,  # will be fused with up
+    r"layers.(\d+).feed_forward.experts.moe_w_out_eF_D": r"language_model.model.layers.\1.feed_forward.experts.down_proj" + weight_postfix,  # expert win
+    r"layers.(\d+).feed_forward.experts.moe_w_swiglu_eD_F": r"language_model.model.layers.\1.feed_forward.experts.up_proj" + weight_postfix,  # fused with up
     r"layers.(\d+).feed_forward.router_DE": r"language_model.model.layers.\1.feed_forward.router.weight",  # used for top
     r"layers.(\d+).feed_forward.w_in_shared_FD": r"language_model.model.layers.\1.feed_forward.shared_expert.gate_proj",  # might need to be fused for efficiency?
     r"layers.(\d+).feed_forward.w_out_shared_DF": r"language_model.model.layers.\1.feed_forward.shared_expert.down_proj",  # might need to be fused for efficiency?

@@ -262,6 +266,7 @@ def write_model(
         pad_token_id=pad_token_id,
         tie_word_embeddings=False,  # Constant set to False
         torch_dtype=torch_dtype,
+        for_llm_compressor=_OFFLINE_QUANT_COMPATIBLE,
         **config_kwargs,
     )
     # default vision config from params

@@ -380,6 +385,16 @@ def write_model(
                 v = new_key.replace("qkv", "v")
                 tqdm.write(f"Processing: {key.ljust(50)} ->\t {v}, {values.shape}")
                 state_dict[v] = values
+            elif _OFFLINE_QUANT_COMPATIBLE and "feed_forward.experts." in new_key:
+                # for experts, split the fused tensor per expert for offline quantization; no fusing is needed
+                expert_lists = []
+                for k in current_parameter:
+                    expert_lists.append(list(k.reshape(num_experts, -1, k.shape[-1]).unbind(0)))  # [#experts * IN, OUT] -> #experts * [IN, OUT]
+                for i in range(num_experts):
+                    expert = torch.cat([expert_list[i] for expert_list in expert_lists], dim=concat_dim)
+                    expert_key = new_key.replace("experts.", f"experts.{i}.")
+                    state_dict[expert_key] = expert.transpose(0, 1).contiguous()  # [OUT, IN]
+                    tqdm.write(f"Processing: {key.ljust(50)} ->\t {expert_key}, {state_dict[expert_key].shape}")
             elif re.search(r"(gate|up)_proj", new_key):
                 path = new_key.split(".")
                 gate_key = re.sub(r"(gate|up)_proj", lambda m: "gate_proj", new_key)

@@ -408,6 +423,7 @@ def write_model(
                 gate_up_proj = torch.cat((gate_proj, up_proj), dim=-1)
                 new_key = new_key.replace("up_proj", "gate_up_proj")
                 state_dict[new_key] = gate_up_proj.contiguous()
+
                 tqdm.write(f"Processing: {key.ljust(50)} ->\t {new_key}, {state_dict[new_key].shape}")
             elif "down_proj" in new_key:
                 current_parameter = torch.cat(current_parameter, dim=concat_dim)

@@ -710,11 +726,11 @@ def write_tokenizer(tokenizer_path: str, save_dir: str, instruct: bool = False):
     )
 
     args = parser.parse_args()
-    write_tokenizer(
-        tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"),
-        save_dir=args.output_dir,
-        instruct=args.instruct,
-    )
+    # write_tokenizer(
+    #     tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"),
+    #     save_dir=args.output_dir,
+    #     instruct=args.instruct,
+    # )
 
     write_model(
         model_path=args.output_dir,
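With `OFFLINE_QUANT_COMPATIBLE=1` exported before running the converter, expert weights are kept unfused and written one expert at a time, transposed to the `[OUT, IN]` layout that `nn.Linear` (and hence LLM Compressor) expects. A standalone shape sketch of that split, using made-up dimensions rather than the converter's real variables:

```python
import torch

# Illustrative sizes only; the real converter reads num_experts and shapes from the checkpoint.
num_experts, d_in, d_out = 4, 8, 6
fused = torch.randn(num_experts * d_in, d_out)  # checkpoint layout: [#experts * IN, OUT]

# Split into per-expert [IN, OUT] slices, then transpose to the [OUT, IN] layout nn.Linear expects.
per_expert = [
    w.transpose(0, 1).contiguous()
    for w in fused.reshape(num_experts, d_in, d_out).unbind(0)
]

assert len(per_expert) == num_experts
assert per_expert[0].shape == (d_out, d_in)  # [OUT, IN]
```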

src/transformers/models/llama4/modeling_llama4.py

Lines changed: 16 additions & 5 deletions
@@ -23,6 +23,7 @@
 from dataclasses import dataclass
 from typing import Callable, List, Optional, Tuple, Union
 
+import os
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

@@ -61,7 +62,6 @@
 _CHECKPOINT_FOR_DOC = "meta-ai/Llama-4-17B"
 _CONFIG_FOR_DOC = "Llama4Config"
 
-
 class Llama4TextExperts(nn.Module):
     def __init__(self, config: Llama4Config):
         super().__init__()

@@ -153,7 +153,12 @@ def __init__(self, config)
         super().__init__()
         self.top_k = config.num_experts_per_tok
         self.hidden_dim = config.hidden_size
-        self.experts = Llama4TextExperts(config)
+        self.num_experts = config.num_local_experts
+        self.for_llm_compressor = config.for_llm_compressor
+        if self.for_llm_compressor:
+            self.experts = nn.ModuleList([Llama4TextMLP(config) for _ in range(self.num_experts)])
+        else:
+            self.experts = Llama4TextExperts(config)
         self.router = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
         self.shared_expert = Llama4TextMLP(config)
 

@@ -184,8 +189,14 @@ def forward(self, hidden_states)
         )
         # we gather inputs corresponding to each expert based on the router indices
         routed_in = routed_in * router_scores.reshape(-1, 1)
-        routed_out = self.experts(routed_in)  # routed in is "sorted" / ready for EP
-
+        expert_routed_out_list = []
+        if self.for_llm_compressor:
+            routed_in = routed_in.reshape(self.num_experts, -1, routed_in.shape[-1])
+            for expert_idx in range(self.num_experts):
+                expert_routed_out_list.append(self.experts[expert_idx](routed_in[expert_idx]))
+            routed_out = torch.cat(expert_routed_out_list, dim=0)
+        else:
+            routed_out = self.experts(routed_in)
         out = self.shared_expert(hidden_states)
         # now that we finished expert computation -> we scatter add because we gathered previously
         # we have to do this because we used all experts on all tokens. This is faster than the for loop, tho you are compute bound

@@ -1706,7 +1717,7 @@ def forward(
         projected_vision_flat = self.multi_modal_projector(vision_flat)
 
         special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
-        final_mask = special_image_mask.to(inputs_embeds.device)
+        final_mask = special_image_mask.to(inputs_embeds.device)
         inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1))
 
         final_mask_1d = final_mask[..., 0].reshape(-1)
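When `for_llm_compressor` is set, the gathered tokens are reshaped to `[num_experts, tokens_per_expert, hidden]`, each expert runs on its own slice, and the outputs are concatenated back into the layout the fused path produces. A standalone sketch with plain `nn.Linear` standing in for `Llama4TextMLP` and made-up sizes:

```python
import torch
import torch.nn as nn

# Made-up sizes; nn.Linear stands in for the per-expert Llama4TextMLP modules.
num_experts, tokens_per_expert, hidden = 4, 3, 16
experts = nn.ModuleList([nn.Linear(hidden, hidden) for _ in range(num_experts)])

routed_in = torch.randn(num_experts * tokens_per_expert, hidden)  # gathered tokens, expert-major order
routed_in = routed_in.reshape(num_experts, -1, hidden)            # [E, tokens_per_expert, hidden]

# Run each expert on its slice, then concatenate back to [E * tokens_per_expert, hidden].
routed_out = torch.cat(
    [experts[i](routed_in[i]) for i in range(num_experts)], dim=0
)
assert routed_out.shape == (num_experts * tokens_per_expert, hidden)
```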
