Commit 75b5425

NickLucche committed: address review
Signed-off-by: NickLucche <nlucches@redhat.com>
1 parent 58e7308 commit 75b5425

File tree

4 files changed: +103 −104 lines

  vllm/model_executor/layers/linear.py
  vllm/model_executor/models/bart.py
  vllm/model_executor/models/mllama.py
  vllm/model_executor/models/utils.py

vllm/model_executor/layers/linear.py

Lines changed: 95 additions & 0 deletions
@@ -1178,3 +1178,98 @@ def extra_repr(self) -> str:
         s += f", tp_size={self.tp_size}"
         s += f", reduce_results={self.reduce_results}"
         return s
+
+
+class QKVCrossParallelLinear(torch.nn.Module):
+
+    def __init__(self,
+                 hidden_size: int,
+                 head_size: int,
+                 total_num_heads: int,
+                 total_num_kv_heads: Optional[int] = None,
+                 bias: bool = True,
+                 skip_bias_add: bool = False,
+                 params_dtype: Optional[torch.dtype] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__()
+        # Empty placeholders for loading as a single module.
+        self.weight = torch.nn.Parameter()
+        set_weight_attrs(self.weight, {
+            "weight_loader": self.weight_loader_weight,
+        })
+        # Use a dictionary to avoid submodules parameters auto-registration:
+        # drop-in replacement for a `QKVParallelLinear` module.
+        self.proj = dict()
+        self.proj["q_proj_decoder"] = ColumnParallelLinear(
+            input_size=hidden_size,
+            output_size=total_num_heads * head_size,
+            bias=bias,
+            quant_config=quant_config,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            prefix=f"{prefix}.q_proj_decoder")
+
+        self.proj["kv_proj_encoder"] = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=head_size,
+            total_num_heads=0,
+            total_num_kv_heads=total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            prefix=f"{prefix}.kv_proj_encoder")
+
+        # `kv_proj_encoder.num_kv_heads` accounts for sharding with tp>1.
+        self.kv_size = self.kv_proj_encoder.num_kv_heads * head_size
+
+        if bias:
+            self.bias = torch.nn.Parameter()
+            set_weight_attrs(self.bias, {
+                "weight_loader": self.weight_loader_bias,
+            })
+
+    @property
+    def q_proj_decoder(self):
+        return self.proj["q_proj_decoder"]
+
+    @property
+    def kv_proj_encoder(self):
+        return self.proj["kv_proj_encoder"]
+
+    def forward(self, decoder_hidden_states, encoder_hidden_states):
+        q, _ = self.q_proj_decoder(decoder_hidden_states)
+        if encoder_hidden_states is None:
+            # Encoder KV already cached.
+            k = None
+            v = None
+        else:
+            # Prefill phase, encoder KV cached here.
+            kv_enc, _ = self.kv_proj_encoder(encoder_hidden_states)
+            # Split kv in half
+            k, v = kv_enc.split(self.kv_size, dim=-1)
+        return q, k, v
+
+    def weight_loader_weight(self,
+                             param: torch.nn.Parameter,
+                             loaded_weight: torch.Tensor,
+                             loaded_shard_id: Optional[str] = None):
+        # NOTE Use QKV/ColumnParallel weight_loader, ignore placeholder param.
+        param = self.q_proj_decoder.weight if loaded_shard_id == "q" \
+            else self.kv_proj_encoder.weight
+        param.weight_loader(
+            param,
+            loaded_weight) if loaded_shard_id == "q" else param.weight_loader(
+                param, loaded_weight, loaded_shard_id)
+
+    def weight_loader_bias(self,
+                           param: torch.nn.Parameter,
+                           loaded_weight: torch.Tensor,
+                           loaded_shard_id: Optional[str] = None):
+        param = self.q_proj_decoder.bias if loaded_shard_id == "q" \
+            else self.kv_proj_encoder.bias
+        param.weight_loader(
+            param,
+            loaded_weight) if loaded_shard_id == "q" else param.weight_loader(
+                param, loaded_weight, loaded_shard_id)
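
For reference, here is a minimal usage sketch of the new layer; it is not part of the commit, and the sizes, tensor shapes, and prefix string are assumed purely for illustration. During prefill the caller passes the encoder hidden states so K/V are projected (and cached by the attention backend); on later decode steps it passes None and only Q is recomputed, mirroring forward() above.

import torch

from vllm.model_executor.layers.linear import QKVCrossParallelLinear

# Assumes vLLM's distributed/TP state is already initialized, as it is when a
# model's __init__ runs; all sizes below are illustrative, not from a real config.
hidden_size, head_size, num_heads = 1024, 64, 16
qkv_proj = QKVCrossParallelLinear(hidden_size=hidden_size,
                                  head_size=head_size,
                                  total_num_heads=num_heads,
                                  total_num_kv_heads=num_heads,
                                  prefix="decoder.layers.0.encoder_attn.qkv_proj")

decoder_hidden_states = torch.randn(4, hidden_size)    # 4 decoder tokens
encoder_hidden_states = torch.randn(10, hidden_size)   # 10 encoder tokens

# Prefill: encoder output is available, so K/V are projected here
# (per TP rank: q -> [4, num_heads * head_size], k/v -> [10, kv_size]).
q, k, v = qkv_proj(decoder_hidden_states, encoder_hidden_states)

# Decode: encoder K/V are already cached by the attention backend,
# so only Q is recomputed and k and v come back as None.
q, k, v = qkv_proj(decoder_hidden_states, None)

Because the Q and KV projections live in separate submodules while the model maps the checkpoint's q/k/v shards onto this single module, the placeholder weight and bias parameters route each loaded shard to the right submodule via weight_loader_weight and weight_loader_bias.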

vllm/model_executor/models/bart.py

Lines changed: 4 additions & 3 deletions
@@ -31,6 +31,7 @@
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVCrossParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -44,7 +45,7 @@
 from vllm.sequence import IntermediateTensors

 from .interfaces import SupportsV0Only
-from .utils import QKVCrossParallelLinear, maybe_prefix
+from .utils import maybe_prefix

 logger = logging.get_logger(__name__)

@@ -169,7 +170,7 @@ def __init__(
             # Number of KV heads is less than TP size, so we replicate
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_world_size % self.total_num_kv_heads == 0
-        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_world_size)
+        self.num_kv_heads = self.num_heads
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim

@@ -248,7 +249,7 @@ def __init__(
             # Number of KV heads is less than TP size, so we replicate
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_world_size % self.total_num_kv_heads == 0
-        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_world_size)
+        self.num_kv_heads = self.num_heads
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
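
For context on the self.num_kv_heads = self.num_heads change above: BART uses equal query and KV head counts (no GQA), and QKVCrossParallelLinear now handles KV-head sharding and replication internally, so the attention module's per-rank bookkeeping reduces to the sketch below. The head count, head dim, and TP size are assumed example values, not taken from the diff.

# Illustrative per-rank sizes; the concrete numbers are assumptions.
total_num_heads = 16        # BART-style MHA: KV heads == query heads
head_dim = 64
tp_world_size = 2

num_heads = total_num_heads // tp_world_size   # 8 query heads on this rank
num_kv_heads = num_heads                       # what the new code assigns
q_size = num_heads * head_dim                  # 512
kv_size = num_kv_heads * head_dim              # 512, which should match the kv_size
                                               # that QKVCrossParallelLinear splits on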

vllm/model_executor/models/mllama.py

Lines changed: 3 additions & 1 deletion
@@ -44,6 +44,7 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
+                                               QKVCrossParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -66,7 +67,7 @@
 from .clip import CLIPMLP
 from .interfaces import SupportsMultiModal, SupportsV0Only
 from .llama import LlamaDecoderLayer, LlamaMLP
-from .utils import QKVCrossParallelLinear, maybe_prefix
+from .utils import maybe_prefix

 logger = init_logger(__name__)

@@ -806,6 +807,7 @@ def __init__(
             self.num_key_value_heads // self.tensor_parallel_size
         self.hidden_size = config.hidden_size
         self.head_dim = config.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads

         self.layer_idx = layer_idx
         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
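
The added self.num_key_value_heads = config.num_key_value_heads keeps the GQA grouping computed from the unsharded config values (see the num_key_value_groups line above). A tiny sketch with assumed Llama-3.2-Vision-like head counts, not taken from the diff:

# Assumed example values; in the model they come from the HF config.
num_heads = 32             # e.g. config.num_attention_heads
num_key_value_heads = 8    # config.num_key_value_heads, as assigned above

# GQA grouping uses the unsharded head counts:
num_key_value_groups = num_heads // num_key_value_heads   # 4 query heads per KV head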

vllm/model_executor/models/utils.py

Lines changed: 1 addition & 100 deletions
@@ -12,12 +12,7 @@

 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               QKVParallelLinear)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_pin_memory_available
@@ -655,98 +650,4 @@ def cast_overflow_tensors(
     if tensors.isinf().any() or tensors.isnan().any():
         clamp_value = torch.finfo(tensors.dtype).max - offset
         tensors = torch.clamp(tensors, min=-clamp_value, max=clamp_value)
-    return tensors
-
-
-class QKVCrossParallelLinear(torch.nn.Module):
-
-    def __init__(self,
-                 hidden_size: int,
-                 head_size: int,
-                 total_num_heads: int,
-                 total_num_kv_heads: Optional[int] = None,
-                 bias: bool = True,
-                 skip_bias_add: bool = False,
-                 params_dtype: Optional[torch.dtype] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
-        super().__init__()
-        # Empty placeholders for loading as a single module.
-        self.weight = torch.nn.Parameter()
-        set_weight_attrs(self.weight, {
-            "weight_loader": self.weight_loader_weight,
-        })
-        # Use a dictionary to avoid submodules parameters auto-registration:
-        # drop-in replacement for a `QKVParallelLinear` module.
-        self.proj = dict()
-        self.proj["q_proj_decoder"] = ColumnParallelLinear(
-            input_size=hidden_size,
-            output_size=total_num_heads * head_size,
-            bias=bias,
-            quant_config=quant_config,
-            skip_bias_add=skip_bias_add,
-            params_dtype=params_dtype,
-            prefix=f"{prefix}.q_proj_decoder")
-
-        self.proj["kv_proj_encoder"] = QKVParallelLinear(
-            hidden_size=hidden_size,
-            head_size=head_size,
-            total_num_heads=0,
-            total_num_kv_heads=total_num_kv_heads,
-            bias=bias,
-            quant_config=quant_config,
-            skip_bias_add=skip_bias_add,
-            params_dtype=params_dtype,
-            prefix=f"{prefix}.kv_proj_encoder")
-
-        # `kv_proj_encoder.num_kv_heads` accounts for sharding with tp>1.
-        self.kv_size = self.kv_proj_encoder.num_kv_heads * head_size
-
-        if bias:
-            self.bias = torch.nn.Parameter()
-            set_weight_attrs(self.bias, {
-                "weight_loader": self.weight_loader_bias,
-            })
-
-    @property
-    def q_proj_decoder(self):
-        return self.proj["q_proj_decoder"]
-
-    @property
-    def kv_proj_encoder(self):
-        return self.proj["kv_proj_encoder"]
-
-    def forward(self, decoder_hidden_states, encoder_hidden_states):
-        q, _ = self.q_proj_decoder(decoder_hidden_states)
-        if encoder_hidden_states is None:
-            # Encoder KV already cached.
-            k = None
-            v = None
-        else:
-            # Prefill phase, encoder KV cached here.
-            kv_enc, _ = self.kv_proj_encoder(encoder_hidden_states)
-            # Split kv in half
-            k, v = kv_enc.split(self.kv_size, dim=-1)
-        return q, k, v
-
-    def weight_loader_weight(self,
-                             param: torch.nn.Parameter,
-                             loaded_weight: torch.Tensor,
-                             loaded_shard_id: Optional[str] = None):
-        # NOTE Use QKV/ColumnParallel weight_loader, ignore placeholder param.
-        param = self.q_proj_decoder.weight if loaded_shard_id == "q" \
-            else self.kv_proj_encoder.weight
-        param.weight_loader(
-            param,
-            loaded_weight) if loaded_shard_id == "q" else param.weight_loader(
-                param, loaded_weight, loaded_shard_id)
-
-    def weight_loader_bias(self,
-                           param: torch.nn.Parameter,
-                           loaded_weight: torch.Tensor,
-                           loaded_shard_id: Optional[str] = None):
-        param = self.q_proj_decoder.bias if loaded_shard_id == "q" \
-            else self.kv_proj_encoder.bias
-        param.weight_loader(
-            param,
-            loaded_weight) if loaded_shard_id == "q" else param.weight_loader(
-                param, loaded_weight, loaded_shard_id)
+    return tensors

0 commit comments
