Merge pull request vllm-project#26 from intel-sandbox/jianan/enable_linear_fusion_and_prepack

jianan-gu · web-flow · commit dddd40ffe15e · 2024-06-21T17:35:51.000+08:00
Enable linear fusion/prepack and MOE AWQ fusion
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
@@ -14,7 +14,11 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
-
+import intel_extension_for_pytorch as ipex
+from intel_extension_for_pytorch.cpu._auto_kernel_selection import (
+    _enable_tpp,
+    _disable_tpp,
+)
 logger = init_logger(__name__)
 
 
@@ -103,6 +107,20 @@ def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        if not hasattr(layer, "ipex_linear"):
+            linear = torch.nn.Linear(layer.weight.shape[1], layer.weight.shape[0], bias=True if bias is not None else False)
+            linear.weight = layer.weight
+            if bias is not None:
+                linear.bias = bias
+            _disable_tpp()
+            if layer.weight.dtype is torch.bfloat16:
+                _enable_tpp()
+            layer.ipex_linear = ipex.llm.optimize(linear.eval(), dtype=layer.weight.dtype, inplace=True)
+
+        if hasattr(layer, "ipex_linear"):
+            res = layer.ipex_linear(x)
+            return res
+
         weight = layer.weight
         if self.separate_bias_add:
             if bias is not None:
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
@@ -2,7 +2,7 @@
 
 import torch
 from torch.nn.parameter import Parameter
-import intel_extension_for_pytorch
+import intel_extension_for_pytorch as ipex
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
@@ -150,78 +150,6 @@ def create_weights(self, layer: torch.nn.Module,
         layer.register_parameter("scales", scales)
         set_weight_attrs(scales, extra_weight_attrs)
 
-    def awq_reverse_reorder_int_tensor(self,int_tensor, bits: int):
-        assert bits == 4
-
-        int_tensor = int_tensor.T.contiguous()
-        compress_ratio = (32 // bits)
-        assert int_tensor.shape[-1] % compress_ratio == 0
-
-        order_map = [0, 2, 4, 6, 1, 3, 5, 7]
-        order_tensor = torch.tensor(
-            order_map, dtype=torch.int32, device=int_tensor.device).reshape(1, -1)
-        order_tensor = order_tensor.repeat(
-            int_tensor.shape[1]//compress_ratio, 1)
-        order_tensor = order_tensor + torch.arange(0, int_tensor.shape[1],
-                                                    compress_ratio, dtype=torch.int32, device=int_tensor.device).reshape(-1, 1)
-        order_tensor = order_tensor.reshape(-1)
-
-        reverse_order_tensor = torch.arange(order_tensor.shape[0])[order_tensor]
-        reverse_order_tensor = reverse_order_tensor[order_tensor]
-        int_tensor = int_tensor[:, reverse_order_tensor]
-        return int_tensor
-    def unpack_awq(self, awq_qweight: torch.Tensor, awq_qzeros: torch.Tensor, awq_scales: torch.Tensor, bits: int, group_size: int):
-        """
-        Args:
-            awq_qweight (`torch.LongTensor`):
-                Expected shape: (in_features, out_features // (32 // bits))
-            awq_qzeros (`torch.LongTensor`):
-                Expected shape: (in_features // group_size, out_features // (32 // bits))
-            awq_scales (`torch.LongTensor`):
-                Expected shape: (in_features // group_size, out_features)
-
-        Returns:
-            fp16_weight (`torch.LongTensor`):
-                With shape (in_features, out_features).
-            zeros (`torch.LongTensor`):
-                With shape (in_features // group_size, out_features).
-        """
-        assert bits == 4
-
-        qzeros = awq_qzeros
-        qweight = awq_qweight
-        qweight = qweight.T.contiguous()
-
-        scales = awq_scales
-        scales = scales.reshape(-1, 1, scales.shape[-1])
-
-        infeatures = awq_qweight.shape[0]
-
-        wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32, device=qzeros.device).unsqueeze(0)
-        zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2), wf.unsqueeze(0)).to(
-            torch.int16 if bits == 8 else torch.int8)
-
-        #zeros = zeros + 1
-
-        torch.bitwise_and(zeros, (2 ** bits) - 1, out=zeros)
-
-        zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
-
-        weight = torch.bitwise_right_shift(torch.unsqueeze(
-            qweight, 1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8)
-        torch.bitwise_and(weight, (2 ** bits) - 1, out=weight)
-        weight = weight.reshape(-1, group_size, weight.shape[2])
-
-        weight = weight.view(-1, weight.shape[-1])
-        zeros = zeros.view(-1, zeros.shape[-1])
-
-        zeros = zeros.T.contiguous()
-        zeros = self.awq_reverse_reorder_int_tensor(zeros, bits)
-        weight = self.awq_reverse_reorder_int_tensor(weight, bits)
-
-        return weight.contiguous(), zeros.contiguous()
-
-
     def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
@@ -232,27 +160,8 @@ def apply(self,
         pack_factor = self.quant_config.pack_factor
         out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
         reshaped_x = x.reshape(-1, x.shape[-1])
+        if not hasattr(layer,"ipex_qlinear") :
+            layer.ipex_qlinear = ipex.nn.modules.weight_only_quantization.WeightOnlyQuantizedLinear.from_int4_weight(qweight, scales, qzeros, x.shape[-1], out_shape[-1], bias=bias, group_size=self.quant_config.group_size)
+        out = layer.ipex_qlinear(reshaped_x)
 
-        if not hasattr(self,"_op_context") :
-            t, zp_x = self.unpack_awq(qweight, qzeros, scales, 4, 128)
-            # # transpose -> [N, K]
-            t = t.T.contiguous()
-            qweight_ = t[:, 1::2].bitwise_left_shift(4).bitwise_or_(t[:, ::2]).to(torch.uint8)
-            scales_ = scales.t().contiguous()
-            self._op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack_int4(
-                qweight_,
-                scales_,
-                zp_x.t_().contiguous(),
-                bias,
-                None,
-                None,
-                128,
-                2, # 2 for bf16 compute, 3 for int8 compute
-                1,
-            )
-
-        out = torch.ops.torch_ipex.ipex_woq_linear(reshaped_x, self._op_context.get_data_handle())
-
-        if bias is not None:
-            out.add_(bias)
         return out.reshape(out_shape)
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
@@ -20,6 +20,7 @@
 
 import torch
 from torch import nn
+import intel_extension_for_pytorch as ipex
 from transformers import GPTJConfig
 
 from vllm.attention import Attention, AttentionMetadata
@@ -130,9 +131,18 @@ def __init__(
                               intermediate_size)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states, _ = self.fc_in(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states, _ = self.fc_out(hidden_states)
+        if not hasattr(self, "ipex_fusion"):
+            if hasattr(self.fc_in, "ipex_linear"):
+                self.ipex_fusion = ipex.llm.modules.LinearNewGelu(self.fc_in.ipex_linear)
+            elif hasattr(self.fc_in, "ipex_qlinear"):
+                self.ipex_fusion = ipex.llm.modules.LinearNewGelu(self.fc_in.ipex_qlinear)
+        if hasattr(self, "ipex_fusion"):
+            hidden_states = self.ipex_fusion(hidden_states)
+        else:
+            hidden_states, _ = self.fc_in(hidden_states)
+            hidden_states = self.act(hidden_states)
+        # self.fc_out is applied in GPTJBlock instead, to enable linear+add+add fusion when tp_size <= 1
+        # hidden_states, _ = self.fc_out(hidden_states)
         return hidden_states
 
 
@@ -167,7 +177,20 @@ def forward(
             attn_metadata=attn_metadata,
         )
         mlp_output = self.mlp(hidden_states)
-        hidden_states = attn_output + mlp_output + residual
+        if self.mlp.fc_out.tp_size <=1 and not hasattr(self, "ipex_fusion"):
+            if hasattr(self.mlp.fc_out, "ipex_linear"):
+                self.ipex_fusion = ipex.llm.modules.LinearAddAdd(self.mlp.fc_out.ipex_linear)
+            elif hasattr(self.mlp.fc_out, "ipex_qlinear"):
+                self.ipex_fusion = ipex.llm.modules.LinearAddAdd(self.mlp.fc_out.ipex_qlinear)
+        if hasattr(self, "ipex_fusion"):
+            hidden_states = self.ipex_fusion(
+                mlp_output, attn_output,  residual
+            )
+            if not self.mlp.fc_out.skip_bias_add and self.mlp.fc_out.bias is not None:
+                hidden_states = hidden_states + self.mlp.fc_out.bias
+        else:
+            mlp_output, _ = self.mlp.fc_out(mlp_output)
+            hidden_states = attn_output + mlp_output + residual
         return hidden_states
 
 
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
@@ -56,88 +56,7 @@
     _enable_tpp,
     _disable_tpp,
 )
-class _IPEXlinearMOECPU(nn.Module):
-    def __init__(self, W13, W2, W3=None, tpp=False, woq=False):
-        super().__init__()
-        self.tpp = tpp
-        self.woq = woq
-        self.num_experts = W2.shape[0]
-        self.hidden_size = W2.shape[1]
-        self.intermediate_size = W2.shape[2]
-
-        linear_list = []
-        for i in range(W2.shape[0]):
-            if W3 is not None:
-                _W1 = W13[i]
-            else:
-                _W1 = W13[i][0 : self.intermediate_size, :]
-                _W3 = W13[i][self.intermediate_size : 2 * self.intermediate_size, :]
-            linear1 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-            linear1.weight = nn.Parameter(_W1)
-            linear2 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-            linear2.weight = nn.Parameter(W2[i])
-            linear3 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-            linear3.weight = nn.Parameter(_W3)
-            linear_per_expert = nn.ModuleList([linear1, linear2, linear3])
-            linear_list.append(linear_per_expert)
-        self.linear_module_list = nn.ModuleList([linear_list[i] for i in range(W2.shape[0])])
-
-    def forward(self, hidden_states, score, topk):
-        batch_size, head_dim = hidden_states.shape
-        routing_weights = torch.nn.functional.softmax(score, dim=1, dtype=torch.float32)
-        routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1)
-        routing_weights = routing_weights.to(hidden_states.dtype)
-        final_hidden_states = torch.zeros(
-            (batch_size, head_dim),
-            dtype=hidden_states.dtype,
-            device=hidden_states.device,
-        )
-        expert_mask = torch.nn.functional.one_hot(
-            selected_experts, num_classes=self.num_experts
-        ).permute(2, 1, 0)
-        for expert_idx in range(self.num_experts):
-            idx, top_x = torch.where(expert_mask[expert_idx])
-            if (
-                hasattr(self.linear_module_list[expert_idx][0], "use_dnnl")
-                and self.linear_module_list[expert_idx][0].use_dnnl
-            ):
-                final_hidden_states = torch.ops.torch_ipex.mixtral_moe(
-                    hidden_states,
-                    top_x,
-                    idx,
-                    self.linear_module_list[expert_idx][0]._get_forward_weight(),
-                    self.linear_module_list[expert_idx][0].ctx.get_data_handle(),
-                    self.linear_module_list[expert_idx][2]._get_forward_weight(),
-                    self.linear_module_list[expert_idx][2].ctx.get_data_handle(),
-                    self.linear_module_list[expert_idx][1]._get_forward_weight(),
-                    self.linear_module_list[expert_idx][1].ctx.get_data_handle(),
-                    hasattr(self.linear_module_list[expert_idx][0], "use_dnnl")
-                    and self.linear_module_list[expert_idx][0].use_dnnl,
-                    routing_weights,
-                    final_hidden_states,
-                    False,
-                )
-            else:
-                final_hidden_states = torch.ops.torch_ipex.mixtral_moe_tpp(
-                    hidden_states,
-                    top_x,
-                    idx,
-                    self.linear_module_list[expert_idx][0].weight.detach(),
-                    self.linear_module_list[expert_idx][2].weight.detach(),
-                    self.linear_module_list[expert_idx][1].weight.detach(),
-                    (
-                        self.linear_module_list[expert_idx][0].tpp_fallback
-                        if hasattr(
-                            self.linear_module_list[expert_idx][0], "tpp_fallback"
-                        )
-                        else True
-                    ),
-                    routing_weights,
-                    final_hidden_states,
-                    False,
-                )
-
-        return final_hidden_states.view(-1, head_dim)
+
 
 class MixtralMoE(nn.Module):
     """A tensor-parallel MoE implementation for Mixtral that shards each expert
@@ -310,11 +229,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         if not hasattr(self, "ipex_moe"):
-            self.ipex_moe = _IPEXlinearMOECPU(self.w13_weight, self.w2_weight)
+            self.ipex_moe = ipex.llm.modules.LinearMOE(W13=self.w13_weight, W2=self.w2_weight)
             _disable_tpp()
             if hidden_states.dtype is torch.bfloat16:
                 _enable_tpp()
-            self.ipex_moe = ipex.optimize(self.ipex_moe.eval(), dtype=hidden_states.dtype, inplace=True)
+            self.ipex_moe = ipex.llm.optimize(self.ipex_moe.eval(), dtype=hidden_states.dtype, inplace=True)
         final_hidden_states = self.ipex_moe(hidden_states, router_logits, self.top_k)
         if self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
@@ -396,8 +315,9 @@ def forward(
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
-        output, _ = self.o_proj(attn_output)
-        return output
+        # self.o_proj is applied in MixtralDecoderLayer instead, to enable linear+add fusion when tp_size <= 1
+        # output, _ = self.o_proj(attn_output)
+        return attn_output
 
 
 class MixtralDecoderLayer(nn.Module):
@@ -452,10 +372,19 @@ def forward(
             kv_cache=kv_cache,
             attn_metadata=attn_metadata,
         )
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        if self.self_attn.o_proj.tp_size <=1 and not hasattr(self, "ipex_fusion") and hasattr(self.self_attn.o_proj, "ipex_linear"):
+                self.ipex_fusion = ipex.llm.modules.LinearAdd(self.self_attn.o_proj.ipex_linear)
+        if hasattr(self, "ipex_fusion"):
+            hidden_states = self.ipex_fusion(hidden_states, residual)
+            if not self.self_attn.o_proj.skip_bias_add and self.self_attn.o_proj.bias is not None:
+                hidden_states = hidden_states + self.self_attn.o_proj.bias
+            residual = hidden_states
+            hidden_states = self.post_attention_layernorm(
+                hidden_states)
+        else:
+            hidden_states, _ = self.self_attn.o_proj(hidden_states)
+            hidden_states, residual = self.post_attention_layernorm(
+                hidden_states, residual)
         hidden_states = self.block_sparse_moe(hidden_states)
         return hidden_states, residual
 
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py