 # limitations under the License.
 
 from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging
-
+from ..activations import ACT2FN
 
 if is_torch_available():
     import torch
 logger = logging.get_logger(__name__)
 
 
-class FbgemmFp8Linear(torch.nn.Module):
+class FbgemmFp8Linear(torch.nn.Linear):
     def __init__(self, in_features, out_features, bias, weight_dtype=torch.float32):
-        super().__init__()
+        super().__init__(in_features, out_features, bias)
         self.in_features = in_features
         self.out_features = out_features
 
-        self.register_buffer("weight", torch.zeros((out_features, in_features), dtype=torch.float8_e4m3fn))
-        self.register_buffer("weight_scale", torch.zeros((out_features, 1), dtype=weight_dtype))
+        self.weight = torch.nn.Parameter(torch.zeros((out_features, in_features), dtype=torch.float8_e4m3fn))
+        self.weight_scale = torch.nn.Parameter(torch.zeros((out_features, 1), dtype=weight_dtype))
         self.register_buffer("input_scale_ub", torch.zeros([1], dtype=torch.float), persistent=False)
 
         if bias:
-            self.register_buffer("bias", torch.zeros((self.out_features), dtype=weight_dtype))
+            self.bias = torch.nn.Parameter(torch.zeros((self.out_features), dtype=weight_dtype))
         else:
             self.bias = None
 
@@ -50,15 +50,16 @@ def forward(self, x):
         # x_quantized and x_scale are not necessarily on the same device as x, this is an issue.
         # https://github.com/pytorch/FBGEMM/blob/e08af8539c391437f447173863df0f3f6f6f1855/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu#L1237C3-L1237C45
         x_quantized, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
-            x.view(-1, x.shape[-1]), num_tokens, self.input_scale_ub
+            x.view(-1, x.shape[-1]), scale_ub=self.input_scale_ub
         )
         # moving x_quantized, x_scale here creates gibberish output ... However, if we move the output, it works
         # x_quantized, x_scale = x_quantized.to(x.device), x_scale.to(x.device)
 
         # The computation still happens on the device where self.weight is, even if x_quantized is not on the same device as self.weight
+        weight_scale_float32 = self.weight_scale.to(torch.float32)
         output = torch.ops.fbgemm.f8f8bf16_rowwise(
-            x_quantized, self.weight, x_scale, self.weight_scale, use_fast_accum=True
-        )
+            x_quantized, self.weight, x_scale, weight_scale_float32, use_fast_accum=True
+        )
         output = output + self.bias if self.bias is not None else output
         # Hacky for now, we move the output to the device of x
         output = output.to(x.device)
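
As a side note, here is a minimal sketch of how the rewritten layer might be exercised, assuming a CUDA device with fbgemm-gpu installed and that the class is importable from transformers.integrations.fbgemm_fp8; the sizes and scale bound below are illustrative only:

```python
import torch

from transformers.integrations.fbgemm_fp8 import FbgemmFp8Linear

# Hypothetical sizes; the FP8 weights keep their zero init, so this only exercises shapes/dtypes.
layer = FbgemmFp8Linear(in_features=128, out_features=64, bias=False, weight_dtype=torch.bfloat16).to("cuda")
layer.input_scale_ub = torch.tensor([1200.0], dtype=torch.float, device="cuda")  # illustrative bound

x = torch.randn(16, 128, dtype=torch.bfloat16, device="cuda")
y = layer(x)  # quantize_fp8_per_row -> f8f8bf16_rowwise, result moved back to x's device
print(y.shape, y.dtype)  # expected: torch.Size([16, 64]) torch.bfloat16
```
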
@@ -67,19 +68,104 @@ def forward(self, x):
         return output
 
 
+class FbgemmFp8Llama4TextExperts(nn.Module):
+    def __init__(self, config, dtype=torch.float32):
+        super().__init__()
+        self.num_experts = config.num_local_experts
+        self.intermediate_size = config.intermediate_size
+        self.hidden_size = config.hidden_size
+        self.expert_dim = self.intermediate_size
+        self.act_fn = ACT2FN[config.hidden_act]
+        # FP8 parameters for gate_up_proj
+        self.gate_up_proj = torch.nn.Parameter(torch.zeros((self.num_experts, self.hidden_size, 2 * self.expert_dim), dtype=torch.float8_e4m3fn))
+        self.gate_up_proj_scale = torch.nn.Parameter(torch.zeros((self.num_experts, 1, self.expert_dim * 2), dtype=torch.float32))
+        # FP8 parameters for down_proj
+        self.down_proj = torch.nn.Parameter(torch.zeros((self.num_experts, self.expert_dim, self.hidden_size), dtype=torch.float8_e4m3fn))
+        self.down_proj_scale = torch.nn.Parameter(torch.zeros((self.num_experts, self.hidden_size, 1), dtype=torch.float32))
+        # Register input scale upper bound
+        self.register_buffer("input_scale_ub", torch.zeros([1], dtype=torch.float), persistent=False)
+
+    def forward(self, hidden_states):
+        """
+        Args:
+            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
+        Returns:
+            torch.Tensor: (batch_size * token_num, hidden_size)
+        """
+        # Reshape hidden states for expert computation
+        hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
+        num_tokens = None
+
+        # Pre-allocate a tensor for all expert outputs with the same shape as hidden_states
+        next_states = torch.empty_like(hidden_states)
+
+        for i in range(self.num_experts):
+            # Extract this expert's hidden states
+            expert_hidden = hidden_states[i]
+            expert_hidden_reshaped = expert_hidden.reshape(-1, self.hidden_size)
+            # Quantize the activations for this expert
+            expert_quantized, expert_scale = torch.ops.fbgemm.quantize_fp8_per_row(
+                expert_hidden_reshaped, num_tokens, self.input_scale_ub
+            )
+            sharded_expert_dim = self.gate_up_proj.shape[-1] // 2
+            gate_up_proj_scale_float32 = self.gate_up_proj_scale.to(torch.float32)
+
+            gate = torch.ops.fbgemm.f8f8bf16_rowwise(
+                expert_quantized,
+                self.gate_up_proj[i].transpose(0, 1)[:sharded_expert_dim].contiguous(),
+                expert_scale,
+                gate_up_proj_scale_float32[i][0][:sharded_expert_dim].view(-1, 1).contiguous(),
+                use_fast_accum=True
+            )
+
+            up = torch.ops.fbgemm.f8f8bf16_rowwise(
+                expert_quantized,
+                self.gate_up_proj[i].transpose(0, 1)[sharded_expert_dim:].contiguous(),
+                expert_scale,
+                gate_up_proj_scale_float32[i][0][sharded_expert_dim:].view(-1, 1).contiguous(),
+                use_fast_accum=True
+            )
+
+            activated = up * self.act_fn(gate)
+
+            activated_quantized, activated_scale = torch.ops.fbgemm.quantize_fp8_per_row(
+                activated, num_tokens, self.input_scale_ub
+            )
+
+            down_proj_scale_float32 = self.down_proj_scale.to(torch.float32)
+            expert_output = torch.ops.fbgemm.f8f8bf16_rowwise(
+                activated_quantized,
+                self.down_proj[i].transpose(0, 1).contiguous(),
+                activated_scale,
+                down_proj_scale_float32[i].view(-1, 1).contiguous(),
+                use_fast_accum=True
+            )
+
+            next_states[i] = expert_output
+        next_states = next_states.to(hidden_states.device)
+        return next_states.view(-1, self.hidden_size)
+
+
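
For orientation, a small fbgemm-free sketch of the shape bookkeeping the expert loop above relies on; the sizes stand in for config.num_local_experts, hidden_size and intermediate_size and are made up:

```python
import torch

# Hypothetical sizes standing in for the Llama4 text config values.
num_experts, hidden_size, expert_dim, tokens_per_expert = 4, 128, 256, 8

hidden_states = torch.randn(num_experts * tokens_per_expert, hidden_size)
hidden_states = hidden_states.view(num_experts, -1, hidden_size)      # (4, 8, 128): one slice per expert

gate_up_proj = torch.zeros(num_experts, hidden_size, 2 * expert_dim)  # fp8 Parameter in the real module
sharded_expert_dim = gate_up_proj.shape[-1] // 2                      # 256

# For expert i, the fused gate/up projection is split by slicing the transposed weight,
# mirroring the two f8f8bf16_rowwise calls in the forward pass above.
i = 0
w = gate_up_proj[i].transpose(0, 1)                                   # (512, 128)
gate_w, up_w = w[:sharded_expert_dim], w[sharded_expert_dim:]         # each (256, 128)
print(hidden_states[i].shape, gate_w.shape, up_w.shape)
```
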
 def _replace_with_fbgemm_fp8_linear(
     model,
     modules_to_not_convert=None,
     current_key_name=None,
     quantization_config=None,
     has_been_replaced=False,
     pre_quantized=False,
+    config=None,
+    tp_plan=None
 ):
     """
     Private method that wraps the recursion for module replacement.
 
     Returns the converted model and a boolean that indicates if the conversion has been successful or not.
     """
+
+    from transformers.models.llama4.modeling_llama4 import Llama4TextExperts
+    import re
+
     if current_key_name is None:
         current_key_name = []
 
@@ -105,9 +191,24 @@ def _replace_with_fbgemm_fp8_linear(
                     # Force requires grad to False to avoid unexpected errors
                     model._modules[name].requires_grad_(False)
                 # set non persistent buffer outside of init_empty_weights
+                model._modules[name].input_scale_ub = torch.tensor(
+                    [quantization_config.activation_scale_ub], dtype=torch.float,
+                )
+        if module.__class__.__name__ == "Llama4TextExperts" and name not in modules_to_not_convert:
+            current_key_name_str = ".".join(current_key_name)
+            if not any(
+                (key + "." in current_key_name_str) or (key == current_key_name_str) for key in modules_to_not_convert
+            ):
+                with init_empty_weights(include_buffers=True):
+                    tp_plan[re.sub(r"\d+", "*", current_key_name_str + ".gate_up_proj_scale")] = tp_plan[re.sub(r"\d+", "*", current_key_name_str + ".gate_up_proj")]
+                    tp_plan[re.sub(r"\d+", "*", current_key_name_str + ".down_proj_scale")] = None
+                    model._modules[name] = FbgemmFp8Llama4TextExperts(
+                        config.text_config,
+                    )
                 model._modules[name].input_scale_ub = torch.tensor(
                     [quantization_config.activation_scale_ub], dtype=torch.float
                 )
+
         if len(list(module.children())) > 0:
             _, has_been_replaced = _replace_with_fbgemm_fp8_linear(
                 module,
@@ -116,14 +217,16 @@ def _replace_with_fbgemm_fp8_linear(
                 quantization_config,
                 has_been_replaced=has_been_replaced,
                 pre_quantized=pre_quantized,
+                config=config,
+                tp_plan=tp_plan
             )
         # Remove the last key for recursion
         current_key_name.pop(-1)
     return model, has_been_replaced
 
 
 def replace_with_fbgemm_fp8_linear(
-    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, pre_quantized=False
+    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, pre_quantized=False, config=None, tp_plan=None
 ):
     """
     A helper function to replace all `torch.nn.Linear` modules by `FbgemmFp8Linear` modules.
@@ -151,9 +254,8 @@ def replace_with_fbgemm_fp8_linear(
         modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
     modules_to_not_convert = list(set(modules_to_not_convert))
     model, has_been_replaced = _replace_with_fbgemm_fp8_linear(
-        model, modules_to_not_convert, current_key_name, quantization_config, pre_quantized=pre_quantized
+        model, modules_to_not_convert, current_key_name, quantization_config, pre_quantized=pre_quantized, config=config, tp_plan=tp_plan
     )
-
     if not has_been_replaced:
         logger.warning(
             "You are loading your model using FP8 quantization but no linear modules were found in your model."
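
Finally, a sketch of how the updated helper might be driven by hand on a toy module; normally the FbgemmFp8 quantizer invokes it during from_pretrained. This assumes accelerate and fbgemm-gpu are installed, leaves config and tp_plan at their None defaults (no Llama4TextExperts in the toy model), and relies on FbgemmFp8Config's default activation_scale_ub:

```python
import torch

from transformers import FbgemmFp8Config
from transformers.integrations.fbgemm_fp8 import replace_with_fbgemm_fp8_linear

# Toy model with two plain Linear layers; their names ("0", "1") are not in modules_to_not_convert.
toy = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 8))
qcfg = FbgemmFp8Config()  # its activation_scale_ub feeds each layer's input_scale_ub

toy = replace_with_fbgemm_fp8_linear(toy, quantization_config=qcfg)
print(toy)  # both children should now be FbgemmFp8Linear (weights stay on meta until real ones are loaded)
```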