From 7d5e862ab9630005e6b264719ca72700e8ca4a1b Mon Sep 17 00:00:00 2001
From: Yeonsil Yoon
Date: Wed, 9 Apr 2025 17:46:12 -0700
Subject: [PATCH] Modify RobertaEmbedding forward as custom op method

---
 README_GAUDI.md                       |   2 +-
 .../ai_accelerator/hpu-gaudi.inc.md   |   2 +-
 vllm/model_executor/models/roberta.py | 120 +++++++++++++++++-
 3 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/README_GAUDI.md b/README_GAUDI.md
index ed635b19796e..d2ec62bf5f96 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -385,7 +385,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM
 
 - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used. `1` is the default.
 - `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs.
-- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava and qwen models.
+- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava, qwen and roberta models.
 - `VLLM_PROMPT_USE_FLEX_ATTENTION` is enabled only for llama model, and allows to use torch.nn.attention.flex_attention instead of FusedSDPA. Note, this requires `VLLM_PROMPT_USE_FUSEDSDPA=0`
 
 # Quantization, FP8 Inference and Model Calibration Process
diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index 40ecedabc59f..e0cd525a57e6 100644
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -361,7 +361,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM
 
 - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used. `1` is the default.
 - `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs.
-- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava model.
+- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava, qwen and roberta models.
 
 ## Quantization, FP8 Inference and Model Calibration Process
 
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 742e63a065b1..6997d8b73e27 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import itertools
+import os
 from typing import Iterable, List, Optional, Tuple
 
 import torch
@@ -9,6 +10,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
+from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.pooler import CrossEncodingPooler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
@@ -47,7 +49,8 @@ def encoder_decoder_weights():
             if not n.startswith("roberta."))
 
 
-class RobertaEmbedding(nn.Module):
+@CustomOp.register("roberta_embedding")
+class RobertaEmbedding(CustomOp):
 
     def __init__(self, config: RobertaConfig):
         super().__init__()
@@ -71,7 +74,80 @@ def __init__(self, config: RobertaConfig):
             raise ValueError("Only 'absolute' position_embedding_type" +
                              " is supported")
 
-    def forward(
+        self.use_merged_prefill = os.environ.get('VLLM_MERGED_PREFILL',
+                                                 'false').lower() == 'true'
+
+    def forward_hpu(
+        self,
+        input_ids: torch.Tensor,
+        seq_lens: torch.Tensor,
+        position_ids: torch.Tensor,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # Replace position ids because in RoBERTa models
+        # they have to start at padding_idx + 1 and ignore
+        # existing padding tokens
+        # Modified replace position ids
+        # for HPU set position_ids and input_ids as [batch_size, bucket_size]
+        # References:
+        # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133
+        # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669
+        pos_list = []
+        token_list = []
+        if self.use_merged_prefill:
+            offset = 0
+            for seq_len in seq_lens:
+                pos_list.append(position_ids[0][offset:offset + seq_len])
+                token_list.append(input_ids[0][offset:offset + seq_len])
+                offset += seq_len
+
+            offset = 0
+            for positions, tokens, seq_len in zip(pos_list, token_list,
+                                                  seq_lens):
+                # Verify assumption that incoming position are
+                # always a sequence from 0 to N.
+                expected_pos = torch.arange(positions.size()[0],
+                                            dtype=torch.long,
+                                            device=inputs_embeds.device)
+                assert torch.equal(positions, expected_pos)
+                position_ids[0][offset:offset +
+                                seq_len] = create_position_ids_from_input_ids(
+                                    tokens, self.padding_idx)
+                offset += seq_len
+        else:
+            for offset in range(position_ids.size()[0]):
+                pos_list.append(position_ids[offset])
+                token_list.append(input_ids[offset])
+
+            for index, (positions, tokens, seq_len) in enumerate(
+                    zip(pos_list, token_list, seq_lens)):
+                # Verify assumption that incoming position are
+                # always a sequence from 0 to N.
+                expected_pos = torch.arange(positions.size()[0],
+                                            dtype=torch.long,
+                                            device=inputs_embeds.device)
+                valid_input_mask = expected_pos < seq_len
+                expected_pos = expected_pos * valid_input_mask
+                assert torch.equal(positions, expected_pos)
+                position_ids[index] = create_position_ids_from_input_ids_hpu(
+                    tokens, self.padding_idx, seq_len)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(position_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape,
+                                         dtype=torch.long,
+                                         device=inputs_embeds.device)
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+    def forward_native(
         self,
         input_ids: torch.Tensor,
         seq_lens: torch.Tensor,
@@ -119,6 +195,46 @@ def forward(
         embeddings = self.LayerNorm(embeddings)
         return embeddings
 
+    def forward_cuda(
+        self,
+        input_ids: torch.Tensor,
+        seq_lens: torch.Tensor,
+        position_ids: torch.Tensor,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self.forward_native(input_ids, seq_lens, position_ids,
+                                   token_type_ids)
+
+
+# Adapted from transformers
+def create_position_ids_from_input_ids_hpu(input_ids,
+                                           padding_idx,
+                                           seq_len,
+                                           past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers.
+    Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        x: torch.Tensor x:
+
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully
+    # balanced to both work with ONNX export and XLA.
+    valid_input_mask = torch.arange(input_ids.size()[0],
+                                    dtype=torch.int,
+                                    device=input_ids.device)
+    valid_input_mask = valid_input_mask < seq_len
+
+    mask = input_ids.ne(padding_idx).int()
+
+    incremental_indices = (torch.cumsum(mask, dim=0).type_as(mask) +
+                           past_key_values_length) * mask
+
+    return (incremental_indices.long() + padding_idx) * valid_input_mask
+
 
 # Adapted from transformers
 def create_position_ids_from_input_ids(input_ids,
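
The following is a minimal standalone sketch, not part of the patch, showing what the new create_position_ids_from_input_ids_hpu helper computes for a padded HPU bucket. The bucket size of 8, the toy token ids, and padding_idx=1 (RoBERTa's usual pad token id) are illustrative assumptions, not values taken from the patch.

import torch


# Copy of the helper added by the patch, repeated here so the example runs
# on its own.
def create_position_ids_from_input_ids_hpu(input_ids,
                                           padding_idx,
                                           seq_len,
                                           past_key_values_length=0):
    # Indices 0..bucket_size-1; anything at or beyond seq_len is the padded
    # tail of the bucket and is marked invalid.
    valid_input_mask = torch.arange(input_ids.size()[0],
                                    dtype=torch.int,
                                    device=input_ids.device)
    valid_input_mask = valid_input_mask < seq_len

    # 1 for real tokens, 0 for padding tokens.
    mask = input_ids.ne(padding_idx).int()

    # Running count of non-padding tokens yields 1-based positions.
    incremental_indices = (torch.cumsum(mask, dim=0).type_as(mask) +
                           past_key_values_length) * mask

    # Shift by padding_idx (RoBERTa positions start at padding_idx + 1) and
    # zero out the padded tail of the bucket.
    return (incremental_indices.long() + padding_idx) * valid_input_mask


if __name__ == "__main__":
    # Bucket of size 8 holding a 4-token sequence followed by 4 pad tokens.
    input_ids = torch.tensor([0, 100, 200, 2, 1, 1, 1, 1])
    position_ids = create_position_ids_from_input_ids_hpu(input_ids,
                                                          padding_idx=1,
                                                          seq_len=4)
    # Real tokens get positions padding_idx + 1 .. padding_idx + seq_len,
    # i.e. 2..5 here; the padded tail is forced to position 0.
    assert position_ids.tolist() == [2, 3, 4, 5, 0, 0, 0, 0]
    print(position_ids)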
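
The documentation hunks above list the relevant HPU Bridge environment variables, and the new __init__ code reads VLLM_MERGED_PREFILL. As a rough illustration only, a launcher script might export them before importing vLLM on Gaudi as sketched below; the variable names come from this patch and the Gaudi docs, while the values and the setdefault pattern are assumptions to adapt per deployment.

import os

# Keep the Lazy backend (the documented default).
os.environ.setdefault("PT_HPU_LAZY_MODE", "1")
# Only needed for tensor-parallel inference with HPU Graphs.
os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true")
# The updated docs require this to be false for llava, qwen and roberta models.
os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "false")
# Read by RobertaEmbedding.__init__ in this patch to select the merged-prefill path.
os.environ.setdefault("VLLM_MERGED_PREFILL", "false")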