@@ -97,12 +97,20 @@ def forward(
         position_ids: torch.Tensor,
         hidden_states: torch.Tensor,
         kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
+        is_prompt,
+        block_tables,
+        num_prefills,
+        num_prefill_tokens,
+        num_decode_tokens,
+        slot_mapping,
+        seq_lens,
+        seq_lens_tensor=None,
+        max_decode_seq_len=None,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.chunk(chunks=3, dim=-1)
         q, k = self.rotary_emb(position_ids, q, k)
-        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        attn_output = self.attn(q, k, v, kv_cache, is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens, seq_lens_tensor, max_decode_seq_len)
         attn_output, _ = self.out_proj(attn_output)
         return attn_output

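The hunk above replaces the single `attn_metadata: AttentionMetadata` argument with the individual metadata fields, passed as plain tensors. A plausible motivation (not stated in the commit itself) is that `torch.jit.trace` only keeps tensor arguments as true graph inputs; anything else is recorded as a trace-time constant, so a per-batch metadata dataclass could not be fed to a traced module. A minimal sketch of that constraint, using a hypothetical `Scale` module:

import torch
import torch.nn as nn

class Scale(nn.Module):
    def forward(self, x: torch.Tensor, n: torch.Tensor) -> torch.Tensor:
        return x * n

m = Scale()
example = (torch.ones(2), torch.tensor(3))
traced = torch.jit.trace(m, example, strict=False)
print(traced(torch.ones(2), torch.tensor(5)))  # tensor([5., 5.]): n stays a runtime input

# Had n been a plain Python int (or a field of a dataclass), the tracer would
# have recorded the value 3 as a graph constant rather than a runtime input.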
@@ -166,15 +174,31 @@ def forward(
         position_ids: torch.Tensor,
         hidden_states: torch.Tensor,
         kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
+        is_prompt,
+        block_tables,
+        num_prefills,
+        num_prefill_tokens,
+        num_decode_tokens,
+        slot_mapping,
+        seq_lens,
+        seq_lens_tensor=None,
+        max_decode_seq_len=None,
     ) -> torch.Tensor:
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
         attn_output = self.attn(
             position_ids=position_ids,
             hidden_states=hidden_states,
             kv_cache=kv_cache,
-            attn_metadata=attn_metadata,
+            is_prompt=is_prompt,
+            block_tables=block_tables,
+            num_prefills=num_prefills,
+            num_prefill_tokens=num_prefill_tokens,
+            num_decode_tokens=num_decode_tokens,
+            slot_mapping=slot_mapping,
+            seq_lens=seq_lens,
+            seq_lens_tensor=seq_lens_tensor,
+            max_decode_seq_len=max_decode_seq_len,
         )
         mlp_output = self.mlp(hidden_states)
         if self.mlp.fc_out.tp_size <= 1 and not hasattr(self, "ipex_fusion"):
@@ -220,7 +244,15 @@ def forward(
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
         kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
+        is_prompt,
+        block_tables,
+        num_prefills,
+        num_prefill_tokens,
+        num_decode_tokens,
+        slot_mapping,
+        seq_lens,
+        seq_lens_tensor=None,
+        max_decode_seq_len=None,
     ) -> torch.Tensor:
         hidden_states = self.wte(input_ids)
         for i in range(len(self.h)):
@@ -229,7 +261,15 @@ def forward(
                 position_ids,
                 hidden_states,
                 kv_caches[i],
-                attn_metadata,
+                is_prompt,
+                block_tables,
+                num_prefills,
+                num_prefill_tokens,
+                num_decode_tokens,
+                slot_mapping,
+                seq_lens,
+                seq_lens_tensor,
+                max_decode_seq_len,
             )
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
@@ -255,6 +295,52 @@ def __init__(
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
+        self.trace_first = None
+        self.trace_next = None
+
+    @torch.no_grad
+    def enable_jit(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        is_prompt,
+        block_tables,
+        num_prefills,
+        num_prefill_tokens,
+        num_decode_tokens,
+        slot_mapping,
+        seq_lens,
+        seq_lens_tensor=None,
+        max_decode_seq_len=None,
+    ) -> torch.Tensor:
+
+        if is_prompt:
+            self.transformer(input_ids, positions, kv_caches, is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens, seq_lens_tensor, max_decode_seq_len)
+            example_input = (
+                input_ids,
+                positions,
+                kv_caches,
+                is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens
+            )
+            self.trace_first = torch.jit.trace(self.transformer, example_input, check_trace=False, strict=False)
+            self.trace_first = torch.jit.freeze(self.trace_first)
+            self.trace_first(*example_input)
+            self.trace_first(*example_input)
+        else:
+            example_input = (
+                input_ids,
+                positions,
+                kv_caches,
+                is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens, seq_lens_tensor, max_decode_seq_len
+            )
+            self.trace_next = torch.jit.trace(
+                self.transformer, example_input, check_trace=False, strict=False
+            )
+            self.trace_next = torch.jit.freeze(self.trace_next)
+            self.trace_next(*example_input)
+            self.trace_next(*example_input)
+

     def forward(
         self,
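The new `enable_jit` method traces the transformer separately for the prefill path and the decode path, freezes each traced graph, and then runs it twice. Since `torch.jit.trace` specializes on the Python control flow it observes, one trace cannot cover both phases. Below is a self-contained sketch of the same trace / freeze / warm-up pattern on a toy module (the `ToyMLP` name is illustrative, not from the commit):

import torch
import torch.nn as nn

class ToyMLP(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(8, 8)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.fc(x))

model = ToyMLP().eval()          # torch.jit.freeze requires eval mode
example = (torch.randn(2, 8),)

with torch.no_grad():
    traced = torch.jit.trace(model, example, check_trace=False, strict=False)
    traced = torch.jit.freeze(traced)
    traced(*example)             # warm-up run 1: profiling pass
    traced(*example)             # warm-up run 2: optimized graph is in place

The two post-freeze calls matter because PyTorch's profiling graph executor only produces its optimized execution plan after the first few invocations; warming up here keeps that cost off the first real request.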
@@ -263,8 +349,42 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
-        hidden_states = self.transformer(input_ids, positions, kv_caches,
-                                         attn_metadata)
+
+        is_prompt = torch.tensor(attn_metadata.is_prompt)
+        block_tables = attn_metadata.block_tables
+        num_prefills = torch.tensor(attn_metadata.num_prefills)
+        num_prefill_tokens = torch.tensor(attn_metadata.num_prefill_tokens)
+        num_decode_tokens = torch.tensor(attn_metadata.num_decode_tokens)
+        slot_mapping = attn_metadata.slot_mapping
+        seq_lens = torch.tensor(attn_metadata.seq_lens)
+        seq_lens_tensor = attn_metadata.seq_lens_tensor if attn_metadata.seq_lens_tensor is not None else None
+        max_decode_seq_len = torch.tensor(attn_metadata.max_decode_seq_len) if attn_metadata.max_decode_seq_len is not None else None
+        attn_bias = attn_metadata.attn_bias
+
+        if kv_caches[0] is not None:
+            if attn_metadata.is_prompt:
+                if self.trace_first is None:
+                    self.enable_jit(input_ids, positions, kv_caches, is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens)
+                hidden_states = self.trace_first(
+                    input_ids,
+                    positions,
+                    kv_caches,
+                    is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens
+                )
+            else:
+                if self.trace_next is None:
+                    self.enable_jit(input_ids, positions, kv_caches, is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens, seq_lens_tensor, max_decode_seq_len)
+                hidden_states = self.trace_next(
+                    input_ids,
+                    positions,
+                    kv_caches,
+                    is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens, seq_lens_tensor, max_decode_seq_len
+                )
+        else:
+            # TorchSDPAMetadata(seq_lens_tensor=None, max_decode_seq_len=None, block_tables=tensor([]), num_prefills=1, num_prefill_tokens=5, num_decode_tokens=0, slot_mapping=tensor([9344, 9345, 9346, 9347, 9348]), is_prompt=True, seq_lens=[5])
+            # TorchSDPAMetadata(seq_lens_tensor=tensor([6], dtype=torch.int32), max_decode_seq_len=6, block_tables=tensor([[584]], dtype=torch.int32), num_prefills=0, num_prefill_tokens=0, num_decode_tokens=1, slot_mapping=tensor([9349]), is_prompt=False, seq_lens=[6])
+            hidden_states = self.transformer(input_ids, positions, kv_caches, is_prompt, block_tables, num_prefills, num_prefill_tokens, num_decode_tokens, slot_mapping, seq_lens, seq_lens_tensor, max_decode_seq_len)
+
         return hidden_states

     def compute_logits(self, hidden_states: torch.Tensor,
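In the reworked `forward`, the scalar metadata fields are wrapped with `torch.tensor(...)` before being handed to the traced graphs, so they remain runtime inputs rather than trace-time constants. A hedged caller-side sketch of that flattening step (the `flatten_metadata` helper is hypothetical, not part of the commit; the field names follow the diff above):

from typing import Any, List, Optional

import torch

def flatten_metadata(meta: Any) -> List[Optional[torch.Tensor]]:
    """Turn one attention-metadata object into the positional arguments the traced graphs expect."""
    # Python scalars are wrapped as tensors so the tracer treats them as
    # runtime inputs instead of baking them into the graph as constants.
    return [
        torch.tensor(meta.is_prompt),
        meta.block_tables,                      # already a tensor
        torch.tensor(meta.num_prefills),
        torch.tensor(meta.num_prefill_tokens),
        torch.tensor(meta.num_decode_tokens),
        meta.slot_mapping,                      # already a tensor
        torch.tensor(meta.seq_lens),
        meta.seq_lens_tensor,                   # may be None during prefill
        torch.tensor(meta.max_decode_seq_len) if meta.max_decode_seq_len is not None else None,
    ]

Fields that are already tensors on the metadata object (`block_tables`, `slot_mapping`, `seq_lens_tensor`) pass through unchanged, matching what the commit does inline.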