From 6d89eee313da966c176b08c16ded302995c5e2cf Mon Sep 17 00:00:00 2001
From: Libin Tang
Date: Thu, 10 Apr 2025 00:25:26 +0000
Subject: [PATCH 1/7] Fix the embedding accuracy issue for non-causal model
 when merged prefill is enabled.

---
 vllm/worker/hpu_model_runner.py | 51 +++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 7f4b3c25b75d..65eca4552706 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -242,7 +242,7 @@ def find_rope_layer(parent, path):
 class HpuModelAdapter(torch.nn.Module):

-    def __init__(self, model, vllm_config, layer_names):
+    def __init__(self, model, vllm_config, layer_names, is_causal):
         super().__init__()
         self.model = model
         self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags()
@@ -253,9 +253,7 @@ def __init__(self, model, vllm_config, layer_names):
         self.dtype = vllm_config.model_config.dtype
         self.layer_names = layer_names
         self.is_pooler = hasattr(self.model, "_pooler")
-        self.is_causal = True
-        if self.is_pooler:
-            self.set_causal_option(self.model)
+        self.is_causal = is_causal
         self.use_merged_prefill = VLLM_MERGED_PREFILL

     def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device,
@@ -436,17 +434,6 @@ def sample(self, *args, **kwargs):
     def generate_proposals(self, *args, **kwargs):
         return self.model.generate_proposals(*args, **kwargs)

-    def set_causal_option(self, module):
-        if isinstance(module, HPUAttentionImpl) and hasattr(
-                module, 'attn_type'):
-            self.is_causal = not (
-                module.attn_type == AttentionType.ENCODER
-                or module.attn_type == AttentionType.ENCODER_ONLY
-                or module.attn_type == AttentionType.ENCODER_DECODER)
-            return
-        else:
-            for child_name, child_module in module.named_children():
-                self.set_causal_option(child_module)

     # sampler property will be used by spec_decode_worker
     # don't rename
@@ -832,12 +819,15 @@ def load_model(self) -> None:
                                 hidden_layer_markstep_interval)
             path_to_rope = get_path_to_rope(self.model)
             torch.hpu.synchronize()
-
+            self.is_causal = True
+            if self.is_pooler:
+                self.set_causal_option(self.model)
             with HabanaMemoryProfiler() as m_wrap:
                 self.model = self._maybe_wrap_in_hpu_graph(
                     self.model,
                     vllm_config=self.vllm_config,
-                    layer_names=path_to_rope)
+                    layer_names=path_to_rope,
+                    is_causal=self.is_causal)
             msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}"
             logger.info(msg)
             with HabanaMemoryProfiler() as m_wrap:
@@ -1027,17 +1017,40 @@ def make_attn_bias(self, seq_lens, max_prompt_len, dtype):
                                   pad=-1,
                                   dtype=torch.long,
                                   flat=self.use_merged_prefill)
+
         q_seq_idx_t = seq_idx_t.unsqueeze(-1)
         kv_seq_idx_t = seq_idx_t.unsqueeze(-2)
         q_seq_pos_t = seq_pos_t.unsqueeze(-1)
         kv_seq_pos_t = seq_pos_t.unsqueeze(-2)
         seq_idx_t = q_seq_idx_t != kv_seq_idx_t
         seq_pos_t = kv_seq_pos_t > q_seq_pos_t
-        attn_mask = seq_idx_t | seq_pos_t
+        import pdb;pdb.set_trace()
+        if self.is_causal:
+            attn_mask = (seq_idx_t | seq_pos_t)
+        else:
+            attn_mask = seq_idx_t
+        if self.is_pooler:
+            mask_v = torch.where(q_seq_pos_t < 0 , True, False)
+             attn_mask = attn_mask | mask_v
+            off_value = -3E38 #small number, avoid nan and overflow
+        else:
+            off_value = -math.inf
         attn_bias = torch.zeros_like(attn_mask, dtype=dtype)
-        attn_bias.masked_fill_(attn_mask, -math.inf)
+        attn_bias.masked_fill_(attn_mask, off_value)
+        import pdb;pdb.set_trace()
         return attn_bias.unsqueeze(1)

+    def set_causal_option(self, module):
+        if isinstance(module, HPUAttentionImpl) and hasattr(
+                module, 'attn_type'):
+            
self.is_causal = not ( + module.attn_type == AttentionType.ENCODER + or module.attn_type == AttentionType.ENCODER_ONLY + or module.attn_type == AttentionType.ENCODER_DECODER) + return + else: + for child_name, child_module in module.named_children(): + self.set_causal_option(child_module) def move_to_device(self, tensor): return tensor if tensor is None else tensor.to(self.device, non_blocking=True) From d5548246406b22a0b63db2d8fcc3b68c24552673 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 05:25:23 +0000 Subject: [PATCH 2/7] Remove debug code. --- vllm/worker/hpu_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 65eca4552706..baec79ae7c1a 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1024,14 +1024,13 @@ def make_attn_bias(self, seq_lens, max_prompt_len, dtype): kv_seq_pos_t = seq_pos_t.unsqueeze(-2) seq_idx_t = q_seq_idx_t != kv_seq_idx_t seq_pos_t = kv_seq_pos_t > q_seq_pos_t - import pdb;pdb.set_trace() if self.is_causal: attn_mask = (seq_idx_t | seq_pos_t) else: attn_mask = seq_idx_t if self.is_pooler: mask_v = torch.where(q_seq_pos_t < 0 , True, False) - attn_mask = attn_mask | mask_v + attn_mask = attn_mask | mask_v off_value = -3E38 #small number, avoid nan and overflow else: off_value = -math.inf From d9fd0acdd4b1c37a7c1ce56f1501703147145236 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 05:33:23 +0000 Subject: [PATCH 3/7] Remove debug code. --- vllm/worker/hpu_model_runner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index baec79ae7c1a..c2975172fb67 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1029,14 +1029,13 @@ def make_attn_bias(self, seq_lens, max_prompt_len, dtype): else: attn_mask = seq_idx_t if self.is_pooler: - mask_v = torch.where(q_seq_pos_t < 0 , True, False) + mask_v = torch.where(q_seq_pos_t < 0, True, False) attn_mask = attn_mask | mask_v - off_value = -3E38 #small number, avoid nan and overflow + off_value = -3E38 #small number, avoid nan and overflow else: off_value = -math.inf attn_bias = torch.zeros_like(attn_mask, dtype=dtype) attn_bias.masked_fill_(attn_mask, off_value) - import pdb;pdb.set_trace() return attn_bias.unsqueeze(1) def set_causal_option(self, module): From 3b75197e23ac8ea2222fa5f5aba9c303b08e28fb Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 05:42:03 +0000 Subject: [PATCH 4/7] fix format. 
--- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c2975172fb67..5b0e2eddea7a 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -434,7 +434,6 @@ def sample(self, *args, **kwargs): def generate_proposals(self, *args, **kwargs): return self.model.generate_proposals(*args, **kwargs) - # sampler property will be used by spec_decode_worker # don't rename @property @@ -1049,6 +1048,7 @@ def set_causal_option(self, module): else: for child_name, child_module in module.named_children(): self.set_causal_option(child_module) + def move_to_device(self, tensor): return tensor if tensor is None else tensor.to(self.device, non_blocking=True) From bbf09ee1eadd477068cd06ae178fe3d72bae5988 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 05:52:59 +0000 Subject: [PATCH 5/7] Add is_causal default value. --- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 5b0e2eddea7a..823c9841a126 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -242,7 +242,7 @@ def find_rope_layer(parent, path): class HpuModelAdapter(torch.nn.Module): - def __init__(self, model, vllm_config, layer_names, is_causal): + def __init__(self, model, vllm_config, layer_names, is_causal=True): super().__init__() self.model = model self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags() From 5aedd5e6538f4976bb5c1695fd3e31a0aa93b943 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 06:04:00 +0000 Subject: [PATCH 6/7] format change. --- vllm/worker/hpu_model_runner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 823c9841a126..c956cb107ae4 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1023,10 +1023,7 @@ def make_attn_bias(self, seq_lens, max_prompt_len, dtype): kv_seq_pos_t = seq_pos_t.unsqueeze(-2) seq_idx_t = q_seq_idx_t != kv_seq_idx_t seq_pos_t = kv_seq_pos_t > q_seq_pos_t - if self.is_causal: - attn_mask = (seq_idx_t | seq_pos_t) - else: - attn_mask = seq_idx_t + attn_mask = (seq_idx_t | seq_pos_t) if self.is_causal else seq_idx_t if self.is_pooler: mask_v = torch.where(q_seq_pos_t < 0, True, False) attn_mask = attn_mask | mask_v From e0d8ddbad12e6178be14c2ba2022ed7878140c78 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 21:15:09 +0000 Subject: [PATCH 7/7] Fix test issue --- vllm/worker/hpu_enc_dec_model_runner.py | 4 ++-- vllm/worker/hpu_model_runner.py | 4 +++- vllm/worker/hpu_worker.py | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_enc_dec_model_runner.py b/vllm/worker/hpu_enc_dec_model_runner.py index 6ffe7071d22d..b45d0f542c24 100644 --- a/vllm/worker/hpu_enc_dec_model_runner.py +++ b/vllm/worker/hpu_enc_dec_model_runner.py @@ -41,8 +41,8 @@ class HpuModelAdapterEncoderDecoder(HpuModelAdapter): - def __init__(self, model, vllm_config, layer_names): - super().__init__(model, vllm_config, layer_names) + def __init__(self, model, vllm_config, layer_names, is_causal): + super().__init__(model, vllm_config, layer_names, False) # We only wrap the language model in HPU graph because some Ops in # vision model will fallback to CPU and cause the graph building fail. 
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c956cb107ae4..cb22e549eecf 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -242,7 +242,7 @@ def find_rope_layer(parent, path): class HpuModelAdapter(torch.nn.Module): - def __init__(self, model, vllm_config, layer_names, is_causal=True): + def __init__(self, model, vllm_config, layer_names, is_causal): super().__init__() self.model = model self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags() @@ -609,6 +609,7 @@ def __init__( return_hidden_states: bool = False, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + is_causal: bool = True, ): ModelRunnerBase.__init__(self, vllm_config=vllm_config) environment.set_model_config(self.model_config) @@ -694,6 +695,7 @@ def __init__( # For both multi-step scheduling and delayed sampling self.cached_step_outputs: List[torch.Tensor] = [] self.is_pooler = False + self.is_causal = is_causal # For delayed sampling self.cached_step_inputs: List[ ModelInputForHPUWithSamplingMetadata] = [] diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index ddd336e3c5be..d4b5c1251cd7 100755 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -84,15 +84,18 @@ def __init__( is_encoder_decoder_model = self._is_encoder_decoder_model() ModelRunnerClass: Type[HPUModelRunnerBase] = HPUModelRunner + is_causal = True if self.model_config.runner_type == "pooling": ModelRunnerClass = HPUPoolingModelRunner elif is_encoder_decoder_model: ModelRunnerClass = HPUEncoderDecoderModelRunner + is_causal = False self.model_runner: HPUModelRunnerBase = ModelRunnerClass( vllm_config=vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, **speculative_args, + is_causal=is_causal, ) if model_runner_cls is not None: self.model_runner = model_runner_cls(self.model_runner)
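
Note on the series above: with merged prefill, the bias built in make_attn_bias has to (a) block attention between the different prompts packed into one slot, (b) additionally block future positions only when the model is causal, and (c) for pooling/embedding models, also mask padded query rows and use a large finite negative fill value instead of -inf so fully masked rows do not turn into NaNs after softmax. The snippet below is a minimal standalone sketch of that logic for illustration only, not the vLLM implementation; the function name make_bias_sketch and the toy index/position tensors are invented for this example, and padding slots are assumed to be marked with -1 as in the patch.

import math
import torch

def make_bias_sketch(seq_idx, seq_pos, is_causal, is_pooler, dtype=torch.float32):
    # seq_idx: (batch, tokens) sequence id per token, -1 for padding slots.
    # seq_pos: (batch, tokens) position within its own sequence, -1 for padding.
    q_idx, kv_idx = seq_idx.unsqueeze(-1), seq_idx.unsqueeze(-2)
    q_pos, kv_pos = seq_pos.unsqueeze(-1), seq_pos.unsqueeze(-2)
    cross_seq = q_idx != kv_idx          # never attend across merged prompts
    future = kv_pos > q_pos              # future positions, masked only if causal
    mask = (cross_seq | future) if is_causal else cross_seq
    if is_pooler:
        mask = mask | (q_pos < 0)        # padded query rows are fully masked
        off_value = -3e38                # finite fill avoids NaN on all-masked rows
    else:
        off_value = -math.inf
    bias = torch.zeros(mask.shape, dtype=dtype)
    bias.masked_fill_(mask, off_value)
    return bias.unsqueeze(1)             # (batch, 1, tokens, tokens) with a head dim

# Two prompts (lengths 2 and 1) merged into a 4-token slot; the last slot is padding.
seq_idx = torch.tensor([[0, 0, 1, -1]])
seq_pos = torch.tensor([[0, 1, 0, -1]])
print(make_bias_sketch(seq_idx, seq_pos, is_causal=False, is_pooler=True))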