From 6d89eee313da966c176b08c16ded302995c5e2cf Mon Sep 17 00:00:00 2001
From: Libin Tang
Date: Thu, 10 Apr 2025 00:25:26 +0000
Subject: [PATCH 1/7] Fix the embedding accuracy issue for non-causal model
 when merged prefill is enabled.

---
 vllm/worker/hpu_model_runner.py | 51 +++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 7f4b3c25b75d..65eca4552706 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -242,7 +242,7 @@ def find_rope_layer(parent, path):
 class HpuModelAdapter(torch.nn.Module):

-    def __init__(self, model, vllm_config, layer_names):
+    def __init__(self, model, vllm_config, layer_names, is_causal):
         super().__init__()
         self.model = model
         self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags()
@@ -253,9 +253,7 @@ def __init__(self, model, vllm_config, layer_names):
         self.dtype = vllm_config.model_config.dtype
         self.layer_names = layer_names
         self.is_pooler = hasattr(self.model, "_pooler")
-        self.is_causal = True
-        if self.is_pooler:
-            self.set_causal_option(self.model)
+        self.is_causal = is_causal
         self.use_merged_prefill = VLLM_MERGED_PREFILL

     def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device,
@@ -436,17 +434,6 @@ def sample(self, *args, **kwargs):
     def generate_proposals(self, *args, **kwargs):
         return self.model.generate_proposals(*args, **kwargs)

-    def set_causal_option(self, module):
-        if isinstance(module, HPUAttentionImpl) and hasattr(
-                module, 'attn_type'):
-            self.is_causal = not (
-                module.attn_type == AttentionType.ENCODER
-                or module.attn_type == AttentionType.ENCODER_ONLY
-                or module.attn_type == AttentionType.ENCODER_DECODER)
-            return
-        else:
-            for child_name, child_module in module.named_children():
-                self.set_causal_option(child_module)

     # sampler property will be used by spec_decode_worker
     # don't rename
@@ -832,12 +819,15 @@ def load_model(self) -> None:
                                 hidden_layer_markstep_interval)
             path_to_rope = get_path_to_rope(self.model)
             torch.hpu.synchronize()
-
+            self.is_causal = True
+            if self.is_pooler:
+                self.set_causal_option(self.model)
             with HabanaMemoryProfiler() as m_wrap:
                 self.model = self._maybe_wrap_in_hpu_graph(
                     self.model,
                     vllm_config=self.vllm_config,
-                    layer_names=path_to_rope)
+                    layer_names=path_to_rope,
+                    is_causal=self.is_causal)
             msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}"
             logger.info(msg)
             with HabanaMemoryProfiler() as m_wrap:
@@ -1027,17 +1017,40 @@ def make_attn_bias(self, seq_lens, max_prompt_len, dtype):
                                   pad=-1,
                                   dtype=torch.long,
                                   flat=self.use_merged_prefill)
+
         q_seq_idx_t = seq_idx_t.unsqueeze(-1)
         kv_seq_idx_t = seq_idx_t.unsqueeze(-2)
         q_seq_pos_t = seq_pos_t.unsqueeze(-1)
         kv_seq_pos_t = seq_pos_t.unsqueeze(-2)
         seq_idx_t = q_seq_idx_t != kv_seq_idx_t
         seq_pos_t = kv_seq_pos_t > q_seq_pos_t
-        attn_mask = seq_idx_t | seq_pos_t
+        import pdb;pdb.set_trace()
+        if self.is_causal:
+            attn_mask = (seq_idx_t | seq_pos_t)
+        else:
+            attn_mask = seq_idx_t
+        if self.is_pooler:
+            mask_v = torch.where(q_seq_pos_t < 0 , True, False)
+             attn_mask = attn_mask | mask_v
+            off_value = -3E38 #small number, avoid nan and overflow
+        else:
+            off_value = -math.inf
         attn_bias = torch.zeros_like(attn_mask, dtype=dtype)
-        attn_bias.masked_fill_(attn_mask, -math.inf)
+        attn_bias.masked_fill_(attn_mask, off_value)
+        import pdb;pdb.set_trace()
         return attn_bias.unsqueeze(1)

+    def set_causal_option(self, module):
+        if isinstance(module, HPUAttentionImpl) and hasattr(
+                module, 'attn_type'):
+            
self.is_causal = not ( + module.attn_type == AttentionType.ENCODER + or module.attn_type == AttentionType.ENCODER_ONLY + or module.attn_type == AttentionType.ENCODER_DECODER) + return + else: + for child_name, child_module in module.named_children(): + self.set_causal_option(child_module) def move_to_device(self, tensor): return tensor if tensor is None else tensor.to(self.device, non_blocking=True) From d5548246406b22a0b63db2d8fcc3b68c24552673 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 05:25:23 +0000 Subject: [PATCH 2/7] Remove debug code. --- vllm/worker/hpu_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 65eca4552706..baec79ae7c1a 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1024,14 +1024,13 @@ def make_attn_bias(self, seq_lens, max_prompt_len, dtype): kv_seq_pos_t = seq_pos_t.unsqueeze(-2) seq_idx_t = q_seq_idx_t != kv_seq_idx_t seq_pos_t = kv_seq_pos_t > q_seq_pos_t - import pdb;pdb.set_trace() if self.is_causal: attn_mask = (seq_idx_t | seq_pos_t) else: attn_mask = seq_idx_t if self.is_pooler: mask_v = torch.where(q_seq_pos_t < 0 , True, False) - attn_mask = attn_mask | mask_v + attn_mask = attn_mask | mask_v off_value = -3E38 #small number, avoid nan and overflow else: off_value = -math.inf From d9fd0acdd4b1c37a7c1ce56f1501703147145236 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 05:33:23 +0000 Subject: [PATCH 3/7] Remove debug code. --- vllm/worker/hpu_model_runner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index baec79ae7c1a..c2975172fb67 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1029,14 +1029,13 @@ def make_attn_bias(self, seq_lens, max_prompt_len, dtype): else: attn_mask = seq_idx_t if self.is_pooler: - mask_v = torch.where(q_seq_pos_t < 0 , True, False) + mask_v = torch.where(q_seq_pos_t < 0, True, False) attn_mask = attn_mask | mask_v - off_value = -3E38 #small number, avoid nan and overflow + off_value = -3E38 #small number, avoid nan and overflow else: off_value = -math.inf attn_bias = torch.zeros_like(attn_mask, dtype=dtype) attn_bias.masked_fill_(attn_mask, off_value) - import pdb;pdb.set_trace() return attn_bias.unsqueeze(1) def set_causal_option(self, module): From 3b75197e23ac8ea2222fa5f5aba9c303b08e28fb Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 05:42:03 +0000 Subject: [PATCH 4/7] fix format. 
--- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c2975172fb67..5b0e2eddea7a 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -434,7 +434,6 @@ def sample(self, *args, **kwargs): def generate_proposals(self, *args, **kwargs): return self.model.generate_proposals(*args, **kwargs) - # sampler property will be used by spec_decode_worker # don't rename @property @@ -1049,6 +1048,7 @@ def set_causal_option(self, module): else: for child_name, child_module in module.named_children(): self.set_causal_option(child_module) + def move_to_device(self, tensor): return tensor if tensor is None else tensor.to(self.device, non_blocking=True) From bbf09ee1eadd477068cd06ae178fe3d72bae5988 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 05:52:59 +0000 Subject: [PATCH 5/7] Add is_causal default value. --- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 5b0e2eddea7a..823c9841a126 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -242,7 +242,7 @@ def find_rope_layer(parent, path): class HpuModelAdapter(torch.nn.Module): - def __init__(self, model, vllm_config, layer_names, is_causal): + def __init__(self, model, vllm_config, layer_names, is_causal=True): super().__init__() self.model = model self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags() From 5aedd5e6538f4976bb5c1695fd3e31a0aa93b943 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 06:04:00 +0000 Subject: [PATCH 6/7] format change. --- vllm/worker/hpu_model_runner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 823c9841a126..c956cb107ae4 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1023,10 +1023,7 @@ def make_attn_bias(self, seq_lens, max_prompt_len, dtype): kv_seq_pos_t = seq_pos_t.unsqueeze(-2) seq_idx_t = q_seq_idx_t != kv_seq_idx_t seq_pos_t = kv_seq_pos_t > q_seq_pos_t - if self.is_causal: - attn_mask = (seq_idx_t | seq_pos_t) - else: - attn_mask = seq_idx_t + attn_mask = (seq_idx_t | seq_pos_t) if self.is_causal else seq_idx_t if self.is_pooler: mask_v = torch.where(q_seq_pos_t < 0, True, False) attn_mask = attn_mask | mask_v From e0d8ddbad12e6178be14c2ba2022ed7878140c78 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 10 Apr 2025 21:15:09 +0000 Subject: [PATCH 7/7] Fix test issue --- vllm/worker/hpu_enc_dec_model_runner.py | 4 ++-- vllm/worker/hpu_model_runner.py | 4 +++- vllm/worker/hpu_worker.py | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_enc_dec_model_runner.py b/vllm/worker/hpu_enc_dec_model_runner.py index 6ffe7071d22d..b45d0f542c24 100644 --- a/vllm/worker/hpu_enc_dec_model_runner.py +++ b/vllm/worker/hpu_enc_dec_model_runner.py @@ -41,8 +41,8 @@ class HpuModelAdapterEncoderDecoder(HpuModelAdapter): - def __init__(self, model, vllm_config, layer_names): - super().__init__(model, vllm_config, layer_names) + def __init__(self, model, vllm_config, layer_names, is_causal): + super().__init__(model, vllm_config, layer_names, False) # We only wrap the language model in HPU graph because some Ops in # vision model will fallback to CPU and cause the graph building fail. 
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c956cb107ae4..cb22e549eecf 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -242,7 +242,7 @@ def find_rope_layer(parent, path): class HpuModelAdapter(torch.nn.Module): - def __init__(self, model, vllm_config, layer_names, is_causal=True): + def __init__(self, model, vllm_config, layer_names, is_causal): super().__init__() self.model = model self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags() @@ -609,6 +609,7 @@ def __init__( return_hidden_states: bool = False, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + is_causal: bool = True, ): ModelRunnerBase.__init__(self, vllm_config=vllm_config) environment.set_model_config(self.model_config) @@ -694,6 +695,7 @@ def __init__( # For both multi-step scheduling and delayed sampling self.cached_step_outputs: List[torch.Tensor] = [] self.is_pooler = False + self.is_causal = is_causal # For delayed sampling self.cached_step_inputs: List[ ModelInputForHPUWithSamplingMetadata] = [] diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index ddd336e3c5be..d4b5c1251cd7 100755 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -84,15 +84,18 @@ def __init__( is_encoder_decoder_model = self._is_encoder_decoder_model() ModelRunnerClass: Type[HPUModelRunnerBase] = HPUModelRunner + is_causal = True if self.model_config.runner_type == "pooling": ModelRunnerClass = HPUPoolingModelRunner elif is_encoder_decoder_model: ModelRunnerClass = HPUEncoderDecoderModelRunner + is_causal = False self.model_runner: HPUModelRunnerBase = ModelRunnerClass( vllm_config=vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, **speculative_args, + is_causal=is_causal, ) if model_runner_cls is not None: self.model_runner = model_runner_cls(self.model_runner)
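
Note on the series above: with merged prefill, the bias built in make_attn_bias has to (a) block attention between the different prompts packed into one slot, (b) additionally block future positions only when the model is causal, and (c) for pooling/embedding models, also mask padded query rows and use a large finite negative fill value instead of -inf so fully masked rows do not turn into NaNs after softmax. The snippet below is a minimal standalone sketch of that logic for illustration only, not the vLLM implementation; the function name make_bias_sketch and the toy index/position tensors are invented for this example, and padding slots are assumed to be marked with -1 as in the patch.

import math
import torch

def make_bias_sketch(seq_idx, seq_pos, is_causal, is_pooler, dtype=torch.float32):
    # seq_idx: (batch, tokens) sequence id per token, -1 for padding slots.
    # seq_pos: (batch, tokens) position within its own sequence, -1 for padding.
    q_idx, kv_idx = seq_idx.unsqueeze(-1), seq_idx.unsqueeze(-2)
    q_pos, kv_pos = seq_pos.unsqueeze(-1), seq_pos.unsqueeze(-2)
    cross_seq = q_idx != kv_idx          # never attend across merged prompts
    future = kv_pos > q_pos              # future positions, masked only if causal
    mask = (cross_seq | future) if is_causal else cross_seq
    if is_pooler:
        mask = mask | (q_pos < 0)        # padded query rows are fully masked
        off_value = -3e38                # finite fill avoids NaN on all-masked rows
    else:
        off_value = -math.inf
    bias = torch.zeros(mask.shape, dtype=dtype)
    bias.masked_fill_(mask, off_value)
    return bias.unsqueeze(1)             # (batch, 1, tokens, tokens) with a head dim

# Two prompts (lengths 2 and 1) merged into a 4-token slot; the last slot is padding.
seq_idx = torch.tensor([[0, 0, 1, -1]])
seq_pos = torch.tensor([[0, 1, 0, -1]])
print(make_bias_sketch(seq_idx, seq_pos, is_causal=False, is_pooler=True))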