Commit a2d483b

yugong333 and jeejeelee authored and committed
Load tuned fused_moe_lora shrink and expand kernel configs separately (vllm-project#27435)
Signed-off-by: Yu Gong <yu3.gong@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
1 parent ea33c0d commit a2d483b

File tree

9 files changed: +910 / -124 lines

benchmarks/kernels/benchmark_lora.py

Lines changed: 446 additions & 30 deletions
Large diffs are not rendered by default.

tests/lora/test_fused_moe_lora_kernel.py

Lines changed: 11 additions & 0 deletions
@@ -158,6 +158,8 @@ def use_fused_moe_lora_kernel(
         "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 64,
         "GROUP_SIZE_M": 1,
+        "NUM_WARPS": 4,
+        "NUM_STAGES": 3,
         "SPLIT_K": 1,
     }

@@ -182,6 +184,15 @@ def use_fused_moe_lora_kernel(
         config["BLOCK_SIZE_N"],
         config["BLOCK_SIZE_K"],
         config["GROUP_SIZE_M"],
+        config["NUM_WARPS"],
+        config["NUM_STAGES"],
+        config["SPLIT_K"],
+        config["BLOCK_SIZE_M"],
+        config["BLOCK_SIZE_N"],
+        config["BLOCK_SIZE_K"],
+        config["GROUP_SIZE_M"],
+        config["NUM_WARPS"],
+        config["NUM_STAGES"],
         config["SPLIT_K"],
         mul_routed_weight,
     )

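The kernel entry point now takes the shrink-stage and expand-stage tuning parameters as two separate groups, so the test passes the values from its single config dict twice. A minimal sketch of how that flattened argument order could be built from two dicts (the helper, the leading BLOCK_SIZE_M position, and the value 64 are illustrative assumptions, not taken from the diff):

# Hypothetical helper: flatten a shrink config and an expand config into the
# positional order used in the test above (shrink values first, then expand).
KEYS = ("BLOCK_SIZE_M", "BLOCK_SIZE_N", "BLOCK_SIZE_K",
        "GROUP_SIZE_M", "NUM_WARPS", "NUM_STAGES", "SPLIT_K")

def flatten_configs(shrink: dict[str, int], expand: dict[str, int]) -> list[int]:
    return [shrink[k] for k in KEYS] + [expand[k] for k in KEYS]

config = {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64,
          "GROUP_SIZE_M": 1, "NUM_WARPS": 4, "NUM_STAGES": 3, "SPLIT_K": 1}
print(flatten_configs(config, config))  # the test reuses one dict for both stages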
vllm/lora/layers/fused_moe.py

Lines changed: 83 additions & 20 deletions
@@ -13,6 +13,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.lora.layers.base import BaseLayerWithLoRA
+from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import (
     _get_config_dtype_str,
@@ -39,6 +40,64 @@ def __init__(self, base_layer: FusedMoE) -> None:
         self.device = base_layer.w2_weight.device
         self._inject_lora_into_fused_moe()

+    def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]:
+        normalized_config = {}
+        for key, value in config.items():
+            if key.islower():
+                if key.startswith("block_"):
+                    normalized_key = "BLOCK_SIZE_" + key.split("_")[-1].upper()
+                else:
+                    normalized_key = key.upper()
+            else:
+                normalized_key = key
+            normalized_config[normalized_key] = value
+        return normalized_config
+
+    def _get_lora_moe_configs(
+        self,
+        op_prefix: str,
+        lora_a_stacked: torch.Tensor,
+        lora_b_stacked: torch.Tensor,
+        num_slices: int,
+        M: int,
+        layer: FusedMoE,
+        top_k: int,
+        config_dtype: str,
+    ):
+        if envs.VLLM_TUNED_CONFIG_FOLDER:
+            shrink_config = get_lora_op_configs(
+                op_type=f"fused_moe_lora_{op_prefix}_shrink",
+                max_loras=lora_a_stacked.shape[0],
+                batch=M,
+                hidden_size=lora_a_stacked.shape[-1],
+                rank=lora_a_stacked.shape[-2],
+                num_slices=num_slices,
+                moe_intermediate_size=lora_b_stacked.shape[-2],
+            )
+            expand_config = get_lora_op_configs(
+                op_type=f"fused_moe_lora_{op_prefix}_expand",
+                max_loras=lora_a_stacked.shape[0],
+                batch=M,
+                hidden_size=lora_a_stacked.shape[-1],
+                rank=lora_a_stacked.shape[-2],
+                num_slices=num_slices,
+                moe_intermediate_size=lora_b_stacked.shape[-2],
+            )
+        else:  # fall back to the default config
+            get_config_func = functools.partial(
+                try_get_optimal_moe_config,
+                layer.w13_weight.size(),
+                layer.w2_weight.size(),
+                top_k,
+                config_dtype,
+                block_shape=layer.quant_method.moe_quant_config.block_shape,
+            )
+            shrink_config = get_config_func(M)
+            expand_config = get_config_func(M)
+        shrink_config = self._normalize_keys(shrink_config)
+        expand_config = self._normalize_keys(expand_config)
+        return shrink_config, expand_config
+
     def _inject_lora_into_fused_moe(self):
         moe_state_dict = {}
         top_k = self.base_layer.top_k
@@ -90,25 +149,27 @@ def wrapper(*args, **kwargs):
             num_tokens = hidden_states.size(0)
             M = min(num_tokens, CHUNK_SIZE)

-            get_config_func = functools.partial(
-                try_get_optimal_moe_config,
-                layer.w13_weight.size(),
-                layer.w2_weight.size(),
-                top_k,
-                config_dtype,
-                block_shape=layer.quant_method.moe_quant_config.block_shape,
+            shrink_config, expand_config = self._get_lora_moe_configs(
+                op_prefix="w13",
+                lora_a_stacked=self.w1_lora_a_stacked,
+                lora_b_stacked=self.w1_lora_b_stacked,
+                num_slices=2,
+                M=M,
+                layer=layer,
+                top_k=top_k,
+                config_dtype=config_dtype,
             )

+            # get the block size of m from customized config or default config
             max_loras = self.w1_lora_a_stacked.shape[0]
-            config = get_config_func(M)
             (
                 sorted_token_ids_lora,
                 expert_ids_lora,
                 num_tokens_post_padded_lora,
             ) = self.punica_wrapper.moe_lora_align_block_size(
                 curr_topk_ids,
                 num_tokens,
-                config["BLOCK_SIZE_M"],
+                shrink_config["BLOCK_SIZE_M"],
                 self.base_layer.local_num_experts,
                 max_loras,
                 self.adapter_enabled,
@@ -138,7 +199,8 @@ def wrapper(*args, **kwargs):
                 num_tokens_post_padded_lora,
                 max_lora_rank,
                 top_k,
-                config,
+                shrink_config,  ## pass the shrink config
+                expand_config,  ## pass the expand config
                 self.adapter_enabled,
             )

@@ -164,17 +226,17 @@ def wrapper(*args, **kwargs):
             num_tokens = hidden_states.size(0)
             M = min(num_tokens, CHUNK_SIZE)

-            get_config_func = functools.partial(
-                try_get_optimal_moe_config,
-                layer.w13_weight.size(),
-                layer.w2_weight.size(),
-                top_k,
-                config_dtype,
-                block_shape=layer.quant_method.moe_quant_config.block_shape,
+            shrink_config, expand_config = self._get_lora_moe_configs(
+                op_prefix="w2",
+                lora_a_stacked=self.w2_lora_a_stacked,
+                lora_b_stacked=self.w2_lora_b_stacked,
+                num_slices=1,
+                M=M,
+                layer=layer,
+                top_k=top_k,
+                config_dtype=config_dtype,
             )

-            config = get_config_func(M)
-
             sorted_token_ids_lora = moe_state_dict["sorted_token_ids_lora"]
             expert_ids_lora = moe_state_dict["expert_ids_lora"]
             num_tokens_post_padded_lora = moe_state_dict[
@@ -197,7 +259,8 @@ def wrapper(*args, **kwargs):
                 num_tokens_post_padded_lora,
                 max_lora_rank,
                 top_k,
-                config,
+                shrink_config,  ## pass the shrink config
+                expand_config,  ## pass the expand config
                 self.adapter_enabled,
                 True,
             )

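The helper pair added above merges two config sources: per-op tuned files looked up via get_lora_op_configs when VLLM_TUNED_CONFIG_FOLDER is set, and the generic try_get_optimal_moe_config fallback otherwise. Tuned configs may carry lowercase keys (e.g. block_m or block_size_m; the exact spelling is an assumption here), so _normalize_keys maps everything onto the uppercase names the kernel call expects. A standalone sketch of that mapping:

def normalize_keys(config: dict[str, int | None]) -> dict[str, int | None]:
    # Mirrors _normalize_keys above: lowercase keys are rewritten to the
    # uppercase BLOCK_SIZE_*/NUM_* names; already-uppercase keys pass through.
    normalized = {}
    for key, value in config.items():
        if key.islower():
            if key.startswith("block_"):
                key = "BLOCK_SIZE_" + key.split("_")[-1].upper()
            else:
                key = key.upper()
        normalized[key] = value
    return normalized

# Assumed lowercase tuned entry (illustrative values):
print(normalize_keys({"block_m": 64, "block_n": 32, "num_warps": 4}))
# -> {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'NUM_WARPS': 4}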
vllm/lora/ops/triton_ops/README_TUNING.md

Lines changed: 10 additions & 1 deletion
@@ -44,8 +44,17 @@ For `shrink`, the config file is named as `{gpu_name}_SHRINK.json`, e.g. `NVIDIA

 For `expand`, the config file is named as `{gpu_name}_EXPAND_{add_input}.json`, e.g. `NVIDIA_H200_EXPAND_TRUE.json`.

+For `fused_moe_lora_w13_shrink`, the config file is named as `{gpu_name}_FUSED_MOE_LORA_W13_SHRINK.json`, e.g. `NVIDIA_H200_FUSED_MOE_LORA_W13_SHRINK.json`.
+
+For `fused_moe_lora_w13_expand`, the config file is named as `{gpu_name}_FUSED_MOE_LORA_W13_EXPAND.json`, e.g. `NVIDIA_H200_FUSED_MOE_LORA_W13_EXPAND.json`.
+
+For `fused_moe_lora_w2_shrink`, the config file is named as `{gpu_name}_FUSED_MOE_LORA_W2_SHRINK.json`, e.g. `NVIDIA_H200_FUSED_MOE_LORA_W2_SHRINK.json`.
+
+For `fused_moe_lora_w2_expand`, the config file is named as `{gpu_name}_FUSED_MOE_LORA_W2_EXPAND.json`, e.g. `NVIDIA_H200_FUSED_MOE_LORA_W2_EXPAND.json`.
+
 The `gpu_name` can be automatically detected by calling `torch.cuda.get_device_name()`

 ### Json Structure

-Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n]`
+Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n][i]`
+where `i` is an optional dimension in the `fused_moe_lora` configuration, representing the intermediate size of the MoE layer.

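When VLLM_TUNED_CONFIG_FOLDER points at a directory containing these files, each op type gets its own JSON keyed as described above; the dimension labels follow the get_lora_op_configs arguments (max_loras, num_slices, batch, hidden size, rank, and the optional MoE intermediate size). A hedged sketch of indexing one nested entry, where every numeric key and the leaf field names are illustrative placeholders rather than values from a real tuned file:

import json

# Illustrative layout only: config_data[max_loras][num_slices][m][k][n][i]
sample = {
    "1": {                          # max_loras
        "2": {                      # num_slices
            "16": {                 # m (batch size)
                "4096": {           # k (hidden size)
                    "16": {         # n (LoRA rank)
                        "1408": {   # i (MoE intermediate size, optional)
                            "block_m": 64, "block_n": 32, "block_k": 64,
                            "group_size_m": 1, "num_warps": 4,
                            "num_stages": 3, "split_k": 1,
                        }
                    }
                }
            }
        }
    }
}

entry = sample["1"]["2"]["16"]["4096"]["16"]["1408"]
print(json.dumps(entry, indent=2))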
vllm/lora/ops/triton_ops/__init__.py

Lines changed: 8 additions & 1 deletion
@@ -1,7 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from vllm.lora.ops.triton_ops.fused_moe_lora_op import fused_moe_lora
+
+from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
+    fused_moe_lora,
+    fused_moe_lora_expand,
+    fused_moe_lora_shrink,
+)
 from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
 from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
 from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink
@@ -11,4 +16,6 @@
     "lora_shrink",
     "LoRAKernelMeta",
     "fused_moe_lora",
+    "fused_moe_lora_shrink",
+    "fused_moe_lora_expand",
 ]

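With the split entry points exported, callers can import them alongside the combined op; their signatures live in fused_moe_lora_op.py and are not shown in this diff, so only the imports are illustrated:

# Names exported by vllm.lora.ops.triton_ops after this change.
from vllm.lora.ops.triton_ops import (
    fused_moe_lora,
    fused_moe_lora_expand,
    fused_moe_lora_shrink,
)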