
Commit 01b47f9

Isotr0py authored and lulmer committed
[LoRA] Remove linear hack outside transformers backend (vllm-project#14177)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Louis Ulmer <ulmerlouis@gmail.com>
1 parent 69adf3b commit 01b47f9

File tree

4 files changed: +142 -105 lines changed

vllm/lora/layers.py

Lines changed: 30 additions & 21 deletions
@@ -395,17 +395,20 @@ def apply(self,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
+
+        # In transformers backend, x and output have extra batch dimension like
+        # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
+        # therefore we need to flatten the batch dimensions.
+        if x.ndim == 3 and output.ndim == 3:
+            output = output.flatten(0, 1)
+            x = x.flatten(0, 1)
+
         self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked,
                                             self.lora_b_stacked,
                                             self.lora_bias_stacked, 1.0,
                                             self.output_slices)
         return output
 
-    @classmethod
-    def get_source_layer(cls, source_layer: nn.Module) -> type:
-        # Check parent_cls in case source_layer is a HFCompatibleLinear.
-        return getattr(source_layer, "parent_cls", type(source_layer))
-
 
 class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
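
The shape normalization added to `apply` can be tried in isolation. Below is a minimal sketch of just the flattening step, with plain tensors standing in for the real activations; `normalize_for_punica` is a hypothetical helper name, not part of vLLM.

```python
import torch

# Minimal sketch of the shape handling added to apply(): the transformers
# backend passes (1, seq_len, hidden_dim) activations, while the punica
# kernels expect a flat (seq_len, hidden_dim) layout.
def normalize_for_punica(
        x: torch.Tensor,
        output: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    if x.ndim == 3 and output.ndim == 3:
        # flatten(0, 1) merges the leading batch dim into the sequence
        # dim; for a contiguous tensor this is a view, not a copy.
        output = output.flatten(0, 1)
        x = x.flatten(0, 1)
    return x, output

x = torch.randn(1, 8, 16)    # transformers-backend layout
out = torch.randn(1, 8, 16)
x_flat, out_flat = normalize_for_punica(x, out)
assert x_flat.shape == (8, 16) and out_flat.shape == (8, 16)
```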

@@ -418,7 +421,7 @@ def __init__(self, base_layer: ReplicatedLinear) -> None:
 
     def forward(
         self, input_: torch.Tensor
-    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
         """Forward of ReplicatedLinearWithLoRA
 
         Args:
@@ -436,6 +439,10 @@ def forward(
 
         output_bias = (self.base_layer.bias
                        if self.base_layer.skip_bias_add else None)
+
+        if not self.base_layer.return_bias:
+            return output
+
         return output, output_bias
 
     # ReplicatedLinear should always be replaced, regardless of the fully
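
With the early return, `forward` now yields a bare tensor when the base layer was constructed with `return_bias=False`, and the `(output, bias)` tuple otherwise, which is what the widened `Union` annotation captures. A toy sketch of the two call shapes; `ToyLinear` is a stand-in, not the real vLLM layer:

```python
import torch
from typing import Optional, Union

class ToyLinear:
    """Stand-in mimicking the new forward contract of the LoRA layers."""

    def __init__(self, return_bias: bool) -> None:
        self.return_bias = return_bias
        self.bias: Optional[torch.Tensor] = None
        self.skip_bias_add = False

    def forward(
        self, input_: torch.Tensor
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
        output = input_ * 2.0  # placeholder for the matmul + LoRA path
        if not self.return_bias:
            return output      # bare tensor, as the transformers backend expects
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias

x = torch.ones(4)
assert isinstance(ToyLinear(return_bias=False).forward(x), torch.Tensor)
out, bias = ToyLinear(return_bias=True).forward(x)  # tuple, as before
```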
@@ -448,8 +455,7 @@ def can_replace_layer(
         packed_modules_list: List,
         model_config: Optional[PretrainedConfig],
     ) -> bool:
-        source_layer = cls.get_source_layer(source_layer)
-        return source_layer is ReplicatedLinear
+        return type(source_layer) is ReplicatedLinear
 
 
 class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
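
Replacing `cls.get_source_layer(...)` with a direct `type(source_layer) is ...` check works because the only reason for the indirection was the `HFCompatibleLinear` wrapper, which carried its wrapped class in a `parent_cls` attribute. A toy illustration of the difference (both classes here are sketched stand-ins, not the real vLLM types):

```python
class ReplicatedLinear:                      # stand-in for the vLLM layer
    pass

class HFCompatibleLinear(ReplicatedLinear):  # sketch of the removed wrapper
    parent_cls = ReplicatedLinear

plain, wrapped = ReplicatedLinear(), HFCompatibleLinear()

# The removed helper looked through parent_cls, so the wrapper matched too:
assert getattr(wrapped, "parent_cls", type(wrapped)) is ReplicatedLinear

# The exact type check matches only the plain layer, which is all that is
# left now that the wrapper lives inside the transformers backend:
assert type(plain) is ReplicatedLinear
assert type(wrapped) is not ReplicatedLinear
```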
@@ -512,7 +518,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
 
     def forward(
         self, input_: torch.Tensor
-    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
         """Forward of ColumnParallelLinear
 
         Args:
@@ -532,6 +538,10 @@ def forward(
             output = tensor_model_parallel_all_gather(output_parallel)
         else:
             output = output_parallel
+
+        if not self.base_layer.return_bias:
+            return output
+
         output_bias = (self.base_layer.bias
                        if self.base_layer.skip_bias_add else None)
         return output, output_bias
@@ -545,9 +555,8 @@ def can_replace_layer(
         packed_modules_list: List,
         model_config: Optional[PretrainedConfig],
     ) -> bool:
-        source_layer = cls.get_source_layer(source_layer)
-        return source_layer is ColumnParallelLinear or (
-            source_layer is MergedColumnParallelLinear
+        return type(source_layer) is ColumnParallelLinear or (
+            type(source_layer) is MergedColumnParallelLinear
             and len(packed_modules_list) == 1)
 
 
@@ -689,8 +698,7 @@ def can_replace_layer(
         packed_modules_list: List,
         model_config: Optional[PretrainedConfig],
     ) -> bool:
-        source_layer = cls.get_source_layer(source_layer)
-        return (source_layer is MergedColumnParallelLinear
+        return (type(source_layer) is MergedColumnParallelLinear
                 and len(packed_modules_list) == 2)
 
 
@@ -758,8 +766,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
     def can_replace_layer(cls, source_layer: nn.Module,
                           lora_config: LoRAConfig, packed_modules_list: List,
                           model_config: Optional[PretrainedConfig]) -> bool:
-        source_layer = cls.get_source_layer(source_layer)
-        return source_layer is QKVParallelLinear and len(
+        return type(source_layer) is QKVParallelLinear and len(
             packed_modules_list) == 1
 
 
@@ -820,8 +827,7 @@ def can_replace_layer(
         packed_modules_list: List,
         model_config: Optional[PretrainedConfig],
     ) -> bool:
-        source_layer = cls.get_source_layer(source_layer)
-        return (source_layer is QKVParallelLinear
+        return (type(source_layer) is QKVParallelLinear
                 and len(packed_modules_list) == 3)
 
 
@@ -855,7 +861,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
 
     def forward(
         self, input_: torch.Tensor
-    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
         """Forward of RowParallelLinear
 
         Args:
@@ -890,6 +896,10 @@ def forward(
         else:
             output = output_
         output_bias = self.base_layer.bias
+
+        if not self.base_layer.return_bias:
+            return output
+
         return output, output_bias
 
     @property
@@ -906,8 +916,7 @@ def can_replace_layer(
         packed_modules_list: List,
         model_config: Optional[PretrainedConfig],
     ) -> bool:
-        source_layer = cls.get_source_layer(source_layer)
-        return source_layer is RowParallelLinear
+        return type(source_layer) is RowParallelLinear
 
 
 class LogitsProcessorWithLoRA(BaseLayerWithLoRA):

vllm/lora/utils.py

Lines changed: 0 additions & 10 deletions
@@ -67,16 +67,6 @@ def from_layer(layer: nn.Module,
                                       packed_modules_list=packed_modules_list,
                                       model_config=model_config):
             instance_layer = lora_cls(layer)
-            if layer.__class__.__name__ == "HFCompatibleLinear":
-                # HACK: Make the forward method compatible with the original
-                # forward method of the instance_layer.
-                original_forward = instance_layer.forward
-
-                def new_forward(input):
-                    input = input.squeeze(0)
-                    return original_forward(input)[0]  # noqa: B023
-
-                instance_layer.forward = new_forward
             instance_layer.create_lora_weights(max_loras, lora_config,
                                                model_config)
             return instance_layer
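
The deleted block monkey-patched each freshly created LoRA layer: it squeezed the leading batch dimension off the input and indexed `[0]` to discard the bias element of the returned tuple. Both behaviours are now native, via the `flatten(0, 1)` in `apply` and the `return_bias` early return in `forward`, so no wrapper is needed. A runnable sketch of the equivalence; `ToyLoRALinear` is a stand-in, not the real class:

```python
import torch

class ToyLoRALinear:
    """Mimics the post-commit behaviour of the LoRA linear layers."""
    return_bias = False  # the transformers backend builds layers this way

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.ndim == 3:          # flatten the batch dim, as apply() now does
            x = x.flatten(0, 1)
        return x * 2.0           # bare tensor: the return_bias early return

layer = ToyLoRALinear()
x = torch.randn(1, 8, 16)        # (1, seq_len, hidden_dim) layout

# Old: from_layer wrapped forward as `original_forward(x.squeeze(0))[0]`.
# New: the layer accepts the 3-D input and returns a tensor directly.
assert layer.forward(x).shape == (8, 16)
```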
