From df5dafaa5ba611f7179720958ba63e49615c927f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 25 Jan 2025 03:45:20 +0800 Subject: [PATCH 01/36] [Misc] Remove deprecated code (#12383) Signed-off-by: DarkLight1337 --- tests/async_engine/test_api_server.py | 23 ++++++---- tests/basic_correctness/test_preemption.py | 18 ++++---- .../multi_step/test_correctness_async_llm.py | 3 +- vllm/config.py | 10 ----- vllm/engine/arg_utils.py | 6 --- vllm/engine/metrics.py | 43 ------------------- 6 files changed, 25 insertions(+), 78 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 83c71b5cf6eb7..91ac35dd67bbf 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict: @pytest.fixture -def api_server(tokenizer_pool_size: int, worker_use_ray: bool): +def api_server(tokenizer_pool_size: int, distributed_executor_backend: str): script_path = Path(__file__).parent.joinpath( "api_server_async_engine.py").absolute() commands = [ - sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m", "--host", - "127.0.0.1", "--tokenizer-pool-size", - str(tokenizer_pool_size) + sys.executable, + "-u", + str(script_path), + "--model", + "facebook/opt-125m", + "--host", + "127.0.0.1", + "--tokenizer-pool-size", + str(tokenizer_pool_size), + "--distributed-executor-backend", + distributed_executor_backend, ] - if worker_use_ray: - commands.append("--worker-use-ray") uvicorn_process = subprocess.Popen(commands) yield uvicorn_process.terminate() @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) -@pytest.mark.parametrize("worker_use_ray", [False, True]) +@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) def test_api_server(api_server, tokenizer_pool_size: int, - worker_use_ray: bool): + distributed_executor_backend: str): """ Run the API server and test it. diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 4e502cfb5f4f8..4b27dcbc8609f 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -29,10 +29,10 @@ def check_settings(): @pytest.fixture -def worker_use_ray() -> bool: - # When SPMD worker is used, use ray_use_worker=True +def distributed_executor_backend() -> str: + # When SPMD worker is used, use distributed_executor_backend="ray" # to test delta input optimization works with preemption. 
- return envs.VLLM_USE_RAY_SPMD_WORKER + return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" @pytest.mark.parametrize("model", MODELS) @@ -47,7 +47,7 @@ def test_chunked_prefill_recompute( dtype: str, max_tokens: int, chunked_prefill_token_size: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Ensure that chunked prefill works with preemption.""" max_num_seqs = min(chunked_prefill_token_size, 256) @@ -66,7 +66,7 @@ def test_chunked_prefill_recompute( max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, max_num_seqs=max_num_seqs, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) @@ -93,7 +93,7 @@ def test_preemption( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """By default, recompute preemption is enabled""" @@ -104,7 +104,7 @@ def test_preemption( model, dtype=dtype, disable_log_stats=False, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt @@ -144,7 +144,7 @@ def test_preemption_infeasible( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Verify infeasible preemption request will be ignored.""" BLOCK_SIZE = 16 @@ -159,7 +159,7 @@ def test_preemption_infeasible( # ignored instead of hanging forever. num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 8456a463adeeb..b8524ed83026b 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -16,7 +16,8 @@ NUM_PROMPTS = [10] DEFAULT_SERVER_ARGS: List[str] = [ - "--worker-use-ray", + "--distributed-executor-backend", + "ray", "--gpu-memory-utilization", "0.85", "--swap-space", diff --git a/vllm/config.py b/vllm/config.py index efd81ad3de3b4..11c6f853b2b45 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1227,9 +1227,6 @@ class ParallelConfig: pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. tensor_parallel_size: int = 1 # Number of tensor parallel groups. - # Deprecated, use distributed_executor_backend instead. - worker_use_ray: Optional[bool] = None - # Maximum number of multiple batches # when load model sequentially. To avoid RAM OOM when using tensor # parallel and large models. 
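With `worker_use_ray` removed from `ParallelConfig` and `EngineArgs`, the executor backend is now selected only through `distributed_executor_backend` ("ray" or "mp"), as the updated tests in this patch do. A minimal migration sketch follows; it assumes the offline `LLM` entry point accepts the same `distributed_executor_backend` keyword that the `--distributed-executor-backend` server flag maps to, and is illustrative rather than part of this diff:

# Server flag migration:
#   before: --worker-use-ray
#   after:  --distributed-executor-backend ray
# Offline API sketch (assumed keyword, hedged; values mirror the tests above):
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=2,
    distributed_executor_backend="ray",  # or "mp" for multiprocessing workers
)
outputs = llm.generate("Hello, my name is")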
@@ -1283,13 +1280,6 @@ def __post_init__(self) -> None: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size - if self.worker_use_ray: - if self.distributed_executor_backend is None: - self.distributed_executor_backend = "ray" - elif not self.use_ray: - raise ValueError(f"worker-use-ray can't be used with " - f"distributed executor backend " - f"'{self.distributed_executor_backend}'.") ray_only_devices = ["tpu"] from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8f1b0bc5fd62e..f16e8e6df76bd 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -100,7 +100,6 @@ class EngineArgs: kv_cache_dtype: str = 'auto' seed: int = 0 max_model_len: Optional[int] = None - worker_use_ray: bool = False # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. @@ -389,10 +388,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'to "ray" if Ray is installed and fail otherwise. Note that tpu ' 'only supports Ray for distributed inference.') - parser.add_argument( - '--worker-use-ray', - action='store_true', - help='Deprecated, use ``--distributed-executor-backend=ray``.') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -1071,7 +1066,6 @@ def create_engine_config(self, parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, - worker_use_ray=self.worker_use_ray, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, tokenizer_pool_config=TokenizerPoolConfig.create_config( diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index f7ce21d0ae988..b771c190dd82a 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -259,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): documentation="Number of emitted tokens.", labelnames=labelnames)) - # Deprecated in favor of vllm:prompt_tokens_total - self.gauge_avg_prompt_throughput = self._gauge_cls( - name="vllm:avg_prompt_throughput_toks_per_s", - documentation="Average prefill throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # Deprecated in favor of vllm:generation_tokens_total - self.gauge_avg_generation_throughput = self._gauge_cls( - name="vllm:avg_generation_throughput_toks_per_s", - documentation="Average generation throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # end-metrics-definitions @@ -635,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram(self.metrics.histogram_max_tokens_request, stats.max_tokens_requests) - def _log_prometheus_interval(self, prompt_throughput: float, - generation_throughput: float) -> None: - # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on - # the vLLM side. Moving forward, we should use counters like - # counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the - # grafana/prometheus side. 
See - # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 - self.metrics.gauge_avg_prompt_throughput.labels( - **self.labels).set(prompt_throughput) - self.metrics.gauge_avg_generation_throughput.labels( - **self.labels).set(generation_throughput) - def log(self, stats: Stats): """Logs to prometheus and tracked stats every iteration.""" # Log to prometheus. @@ -664,20 +635,6 @@ def log(self, stats: Stats): # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): - # Compute summary metrics for tracked stats (and log them - # to promethus if applicable). - prompt_throughput = get_throughput(self.num_prompt_tokens, - now=stats.now, - last_log=self.last_local_log) - generation_throughput = get_throughput( - self.num_generation_tokens, - now=stats.now, - last_log=self.last_local_log) - - self._log_prometheus_interval( - prompt_throughput=prompt_throughput, - generation_throughput=generation_throughput) - if self.spec_decode_metrics is not None: self._log_gauge( self.metrics.gauge_spec_decode_draft_acceptance_rate, From 3132a933b65d8ed3383e082264c682940d92d803 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 24 Jan 2025 15:20:59 -0500 Subject: [PATCH 02/36] [Bugfix][Kernel] FA3 Fix - RuntimeError: This flash attention build only supports pack_gqa (for build size reasons). (#12405) Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f9da6fa3e1d3..c954731bf94ef 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -576,7 +576,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 0aff05f577e8a10086066a00618609199b25231d + GIT_TAG 9732b0ce005d1e6216864788502d5570004678f5 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 221d388cc5a836fa189305785ed7e887cea8b510 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 24 Jan 2025 20:49:28 -0500 Subject: [PATCH 03/36] [Bugfix][Kernel] Fix moe align block issue for mixtral (#12413) --- csrc/moe/moe_align_sum_kernels.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index d609ce1697df3..8b6fe72ad743b 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -33,7 +33,9 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, extern __shared__ int32_t shared_mem[]; int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1) - token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1); + token_cnts_t* tokens_cnts = + (token_cnts_t*)(shared_mem + num_experts + + 1); // 2d tensor with shape (blockDim.x + 1, num_experts) for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; From fb30ee92eefec7eacc0d7483f9d07daa1206530f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 25 Jan 2025 11:42:42 +0800 Subject: [PATCH 04/36] [Bugfix] Fix BLIP-2 processing (#12412) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/blip2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 09c5087c2dc36..b559ac677a740 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -481,14 
+481,14 @@ def _get_prompt_replacements( bos_token_id = tokenizer.bos_token_id assert isinstance(bos_token_id, int) - image_token_id = vocab["image"] + image_token_id = vocab[""] num_image_tokens = self.info.get_num_image_tokens() image_tokens = [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", - target="", + target=[bos_token_id], replacement=PromptReplacementDetails( full=image_tokens + [bos_token_id], features=image_tokens, From bf21481ddef2fa9bb96c13ba1f80072abdae3eb7 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Fri, 24 Jan 2025 22:17:19 -0600 Subject: [PATCH 05/36] [ROCm][MoE] MI300 tuned configs Mixtral-8x(7B,22B) | fp16, fp8 (#12408) Signed-off-by: Divakar Verma --- ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++ ...14336,device_name=AMD_Instinct_MI300X.json | 64 +++--- ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++ ...16384,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++ ...=1792,device_name=AMD_Instinct_MI300X.json | 86 ++++---- ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++ ...=2048,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++ ...=3584,device_name=AMD_Instinct_MI300X.json | 70 +++--- ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++ ...=4096,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++ ...=7168,device_name=AMD_Instinct_MI300X.json | 76 +++---- ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++++++++++++++ ...=8192,device_name=AMD_Instinct_MI300X.json | 200 ++++++++++++++++++ 16 files changed, 2260 insertions(+), 148 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..b6f1d01f88652 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json index 4d4b752fa5d64..66f9106bd1be3 100644 --- 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -1,21 +1,21 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -23,10 +23,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -34,10 +34,10 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -48,7 +48,7 @@ "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -56,10 +56,10 @@ }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 1, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -67,32 +67,32 @@ }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -100,24 +100,24 @@ }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "256": { @@ -129,7 +129,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 128, @@ -150,7 +150,7 @@ "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 32, + "matrix_instr_nonkdim": 16, "kpack": 2 }, "1536": { @@ -184,7 +184,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4096": { "BLOCK_SIZE_M": 128, @@ -195,6 +195,6 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..0e5fd1eec77d7 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..d6ad63509f157 --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8323f512db015 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 
8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json index a218fc40642c1..1b46cb5716514 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -1,10 +1,10 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -19,14 +19,14 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -34,76 +34,76 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "24": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "64": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -112,24 +112,24 @@ "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 64, @@ -151,7 +151,7 @@ "num_stages": 2, "waves_per_eu": 0, 
"matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -162,7 +162,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2048": { "BLOCK_SIZE_M": 128, @@ -184,7 +184,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4096": { "BLOCK_SIZE_M": 128, @@ -195,6 +195,6 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..81bb765d30031 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..811c77ab41093 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..379ca107a9469 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, 
+ "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json index 3682cc548f352..ed5b655d89937 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -1,21 +1,21 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -23,10 +23,10 @@ }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -34,7 +34,7 @@ }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, @@ -52,32 +52,32 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, @@ -85,7 +85,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, @@ -101,40 +101,40 @@ "96": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, 
"waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2, @@ -151,7 +151,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -173,7 +173,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, @@ -195,6 +195,6 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..48bb5f2ccb8e3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..a64d06c6d1724 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..bd2c6fbc1b941 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, 
+ "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json index 21742854c613f..822f04e33e879 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -1,7 +1,7 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, @@ -12,54 +12,54 @@ }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, + "num_warps": 1, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -68,7 +68,7 @@ "32": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, "num_stages": 2, @@ -78,32 +78,32 @@ }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, + "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, @@ -112,18 
+112,18 @@ "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 2, @@ -140,7 +140,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1024": { "BLOCK_SIZE_M": 128, @@ -151,7 +151,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -173,7 +173,7 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, @@ -187,7 +187,7 @@ "kpack": 2 }, "4096": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, @@ -195,6 +195,6 @@ "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..cd4fb8f11b935 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..cf66868e9d57a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} From f1fc0510dfbb11c98f41d02a44e092785c626314 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 25 Jan 2025 15:07:35 +0800 Subject: [PATCH 06/36] [Misc] Add FA2 support to ViT MHA layer (#12355) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/kernels/test_mha_attn.py | 126 +++++++++++++++++++++++++++++++++ vllm/attention/layer.py | 25 +++++-- 2 files changed, 146 insertions(+), 5 deletions(-) create mode 100644 tests/kernels/test_mha_attn.py diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py new file mode 100644 index 0000000000000..22d434f5e40ef --- /dev/null +++ b/tests/kernels/test_mha_attn.py @@ -0,0 +1,126 @@ +""" +Test: + +* Tests for MultiHeadAttention layer +""" +from unittest.mock import patch + +import pytest +import torch + +from vllm.attention.layer import MultiHeadAttention +from vllm.attention.selector import _Backend, _cached_get_attn_backend +from vllm.platforms import current_platform +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.rocm import RocmPlatform + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) +def test_mha_attn_platform(device: str): + """ + Test that the attention selector between different platform and device. 
+ """ + torch.set_default_dtype(torch.float16) + + if device == "cpu": + with patch("vllm.attention.selector.current_platform", CpuPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + elif device == "hip": + with patch("vllm.attention.selector.current_platform", RocmPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + else: + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.FLASH_ATTN + + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + attn = MultiHeadAttention(16, 72, scale=1) + assert attn.attn_backend == _Backend.XFORMERS + + +def ref_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, +) -> torch.Tensor: + """ + Native implementation of scaled dot product attention without mask: + - query, key, value: [batch_size, seq_len, num_heads, head_size] + - attn_mask: [batch_size, seq_len, seq_len] + """ + query, key, value = (x.transpose(1, 2) for x in (query, key, value)) + attn_weights = scale * torch.matmul(query, key.transpose(2, 3)) + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.matmul(attn_weights, value).transpose(1, 2) + return out + + +BATCH_SIZES = [1, 16] +SEQ_LENS = [1] +NUM_HEADS = [1, 16] +NUM_KV_HEADS = [1] +HEAD_SIZES = [64, 80] +# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} +DTYPES = [ + torch.half, torch.bfloat16, torch.float +] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] +CUDA_DEVICES = ["cuda"] + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_mha_attn_forward( + batch_size: int, + seq_len: int, + num_heads: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: str, +): + current_platform.seed_everything(0) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + q = torch.randn(batch_size, seq_len, num_heads * head_size) + k = torch.randn(batch_size, seq_len, num_kv_heads * head_size) + v = torch.randn(batch_size, seq_len, num_kv_heads * head_size) + scale = 1.0 / head_size**0.5 + attn = MultiHeadAttention(num_heads, + head_size, + scale=scale, + num_kv_heads=num_kv_heads) + output = attn(q, k, v) + + assert num_heads % num_kv_heads == 0 + num_queries_per_kv = num_heads // num_kv_heads + q = q.reshape(batch_size, seq_len, num_heads, head_size) + k = k.reshape(batch_size, seq_len, num_kv_heads, head_size) + v = v.reshape(batch_size, seq_len, num_kv_heads, head_size) + if num_queries_per_kv > 1: + k = torch.repeat_interleave(k, num_queries_per_kv, dim=2) + v = torch.repeat_interleave(v, num_queries_per_kv, dim=2) + + ref_output = ref_attention( + q, + k, + v, + scale=scale, + ).reshape(batch_size, seq_len, num_heads * head_size) + torch.testing.assert_close(output, ref_output) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 79ea9b666c7e8..a90bb4fbf5ab3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -210,6 +210,9 @@ def __init__( self.scale = scale self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads 
+ assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + dtype = torch.get_default_dtype() attn_backend = get_attn_backend(head_size, dtype, @@ -217,11 +220,12 @@ def __init__( block_size=16, is_attention_free=False) backend = backend_name_to_enum(attn_backend.get_name()) - if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}: - backend = _Backend.XFORMERS self.attn_backend = backend if backend in { - _Backend.TORCH_SDPA, _Backend.XFORMERS + _Backend.TORCH_SDPA, + _Backend.XFORMERS, + _Backend.FLASH_ATTN, + _Backend.FLASH_ATTN_VLLM_V1, } else _Backend.TORCH_SDPA def forward( @@ -231,7 +235,6 @@ def forward( value: torch.Tensor, ) -> torch.Tensor: """Input shape: batch_size x seq_len x hidden_size""" - # TODO(Isotr0py): Use existing backend implementations and support FA2 bsz, q_len, _ = query.size() kv_len = key.size(1) @@ -239,7 +242,19 @@ def forward( key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) - if self.attn_backend == _Backend.XFORMERS: + if (num_repeat := self.num_queries_per_kv) > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_repeat, dim=2) + value = torch.repeat_interleave(value, num_repeat, dim=2) + + if self.attn_backend in { + _Backend.FLASH_ATTN, + _Backend.FLASH_ATTN_VLLM_V1, + }: + from vllm.vllm_flash_attn import flash_attn_func + + out = flash_attn_func(query, key, value, softmax_scale=self.scale) + elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops out = xops.memory_efficient_attention_forward(query, From 324960a95c00112ce6b9b858d9311da1597cfb8b Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Fri, 24 Jan 2025 23:23:03 -0800 Subject: [PATCH 07/36] [TPU][CI] Update torchxla version in requirement-tpu.txt (#12422) Signed-off-by: Siyuan Liu --- Dockerfile.tpu | 2 +- requirements-tpu.txt | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index ee0d94d98e82b..e268b39476665 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20250122" +ARG NIGHTLY_DATE="20250124" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 8ab18b3770ae8..51a0c65eac5aa 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -10,16 +10,17 @@ wheel jinja2 ray[default] -# Install torch_xla ---pre ---extra-index-url https://download.pytorch.org/whl/nightly/cpu +# Install torch, torch_xla --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0.dev20241126+cpu -torchvision==0.20.0.dev20241126+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -jaxlib==0.4.36.dev20241122 -jax==0.4.36.dev20241122 +# Note: This torch whl can be 
slightly different from the official torch nightly whl +# since they are not built on the same commit (but on the same day). This difference may cause C++ undefined symbol issue +# if some change between the 2 commits introduce some C++ API change. +# Here we install the exact torch whl from which torch_xla is built from, to avoid potential C++ undefined symbol issue. +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 2a0309a646b1ed83a0c40974e08c8dc628726d3c Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 25 Jan 2025 21:00:31 -0800 Subject: [PATCH 08/36] [Misc][Bugfix] FA3 support to ViT MHA layer (#12435) Signed-off-by: Roger Wang Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- vllm/attention/layer.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index a90bb4fbf5ab3..db682b4ac63b0 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -251,9 +251,28 @@ def forward( _Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1, }: - from vllm.vllm_flash_attn import flash_attn_func - - out = flash_attn_func(query, key, value, softmax_scale=self.scale) + from vllm.vllm_flash_attn import flash_attn_varlen_func + + cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len, + step=q_len, + dtype=torch.int32, + device=query.device) + cu_seqlens_k = torch.arange(0, (bsz + 1) * kv_len, + step=kv_len, + dtype=torch.int32, + device=key.device) + + out = flash_attn_varlen_func( + query.flatten(0, 1), + key.flatten(0, 1), + value.flatten(0, 1), + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=q_len, + max_seqlen_k=kv_len, + softmax_scale=self.scale, + ) + out = out.reshape(bsz, q_len, -1) elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops From fa63e710c7fbaae3a445f669d3b5ba6b9a4ef412 Mon Sep 17 00:00:00 2001 From: Keyun Tong Date: Sun, 26 Jan 2025 00:42:37 -0800 Subject: [PATCH 09/36] [V1][Perf] Reduce scheduling overhead in model runner after cuda sync (#12094) Signed-off-by: Keyun Tong --- vllm/v1/outputs.py | 2 +- vllm/v1/sample/sampler.py | 3 +-- vllm/v1/worker/gpu_model_runner.py | 29 +++++++++++++++++++---------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index acc3a944e21b9..32aee44e3f374 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -8,7 +8,7 @@ class SamplerOutput: # [num_reqs] - sampled_token_ids: List[int] + sampled_token_ids: torch.Tensor # [num_reqs, max_num_logprobs + 1] 
logprob_token_ids: Optional[torch.Tensor] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 7cd42ca211a22..9ad665a64894c 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -50,9 +50,8 @@ def forward( # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) - # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( - sampled_token_ids=sampled.tolist(), + sampled_token_ids=sampled, logprob_token_ids=topk_indices, logprobs=topk_logprobs, prompt_logprob_token_ids=None, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4b3c325ded906..6339f1f03f11d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -775,10 +775,10 @@ def execute_model( sampling_metadata=sampling_metadata, ) - sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. num_reqs = self.input_batch.num_reqs + request_seq_lens: List[Tuple[int, CachedRequestState, int]] = [] for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): assert req_id is not None req_state = self.requests[req_id] @@ -787,10 +787,10 @@ def execute_model( assert seq_len <= req_state.num_tokens if seq_len == req_state.num_tokens: # Append the sampled token to the output token ids. - token_id = sampled_token_ids[i] - self.input_batch.token_ids_cpu[i, seq_len] = token_id self.input_batch.num_tokens[i] += 1 - req_state.output_token_ids.append(token_id) + # OPTIMIZATION: Priming the state updates for later updates. + req_state.output_token_ids.append(0) + request_seq_lens.append((i, req_state, seq_len)) else: # Ignore the sampled token from the partial request. # Rewind the generator state as if the token was not sampled. @@ -799,6 +799,21 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) + # num_reqs entries should be non-None + assert all( + req_id is not None for req_id in + self.input_batch.req_ids[:num_reqs]), "req_ids contains None" + req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) + + # NOTE: GPU -> CPU Sync happens here. + # Move as many CPU operations as possible before this sync point. 
+ sampled_token_ids = sampler_output.sampled_token_ids.tolist() + # Update with the actual token ids + for i, req_state, seq_len in request_seq_lens: + token_id = sampled_token_ids[i] + self.input_batch.token_ids_cpu[i, seq_len] = token_id + req_state.output_token_ids[-1] = token_id + if sampler_output.logprob_token_ids is None: logprob_token_ids = None else: @@ -808,12 +823,6 @@ def execute_model( else: logprobs = sampler_output.logprobs.cpu() - # num_reqs entries should be non-None - assert all( - req_id is not None for req_id in - self.input_batch.req_ids[:num_reqs]), "req_ids contains None" - req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) - model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, From 0ee349b5534e3d02b499b1126f2abde73b798fe9 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sun, 26 Jan 2025 00:47:42 -0800 Subject: [PATCH 10/36] [V1][Bugfix] Fix assertion when mm hashing is turned off (#12439) Signed-off-by: Roger Wang --- vllm/v1/request.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index eefcdaf29e753..2cfcd8b63ccb2 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -58,7 +58,8 @@ def __init__( # Sanity check assert len(self.mm_inputs) == len(self.mm_positions) - assert len(self.mm_inputs) == len(self.mm_hashes) + if self.mm_hashes: + assert len(self.mm_inputs) == len(self.mm_hashes) # Cache the computed kv block hashes of the request to avoid # recomputing. From a5255270c3ad492b5def19fe38beb9b2df30e74f Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sun, 26 Jan 2025 03:56:34 -0800 Subject: [PATCH 11/36] [Misc] Revert FA on ViT #12355 and #12435 (#12445) --- vllm/attention/layer.py | 41 ++++------------------------------------- 1 file changed, 4 insertions(+), 37 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index db682b4ac63b0..da663d894aeb3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -210,9 +210,6 @@ def __init__( self.scale = scale self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - dtype = torch.get_default_dtype() attn_backend = get_attn_backend(head_size, dtype, @@ -220,12 +217,12 @@ def __init__( block_size=16, is_attention_free=False) backend = backend_name_to_enum(attn_backend.get_name()) + if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}: + backend = _Backend.XFORMERS self.attn_backend = backend if backend in { _Backend.TORCH_SDPA, _Backend.XFORMERS, - _Backend.FLASH_ATTN, - _Backend.FLASH_ATTN_VLLM_V1, } else _Backend.TORCH_SDPA def forward( @@ -235,6 +232,7 @@ def forward( value: torch.Tensor, ) -> torch.Tensor: """Input shape: batch_size x seq_len x hidden_size""" + # TODO(Isotr0py): Use existing backend implementations and support FA3 bsz, q_len, _ = query.size() kv_len = key.size(1) @@ -242,38 +240,7 @@ def forward( key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) - if (num_repeat := self.num_queries_per_kv) > 1: - # Handle MQA and GQA - key = torch.repeat_interleave(key, num_repeat, dim=2) - value = torch.repeat_interleave(value, num_repeat, dim=2) - - if self.attn_backend in { - _Backend.FLASH_ATTN, - _Backend.FLASH_ATTN_VLLM_V1, - }: - from 
vllm.vllm_flash_attn import flash_attn_varlen_func - - cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len, - step=q_len, - dtype=torch.int32, - device=query.device) - cu_seqlens_k = torch.arange(0, (bsz + 1) * kv_len, - step=kv_len, - dtype=torch.int32, - device=key.device) - - out = flash_attn_varlen_func( - query.flatten(0, 1), - key.flatten(0, 1), - value.flatten(0, 1), - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=q_len, - max_seqlen_k=kv_len, - softmax_scale=self.scale, - ) - out = out.reshape(bsz, q_len, -1) - elif self.attn_backend == _Backend.XFORMERS: + if self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops out = xops.memory_efficient_attention_forward(query, From 9ddc35220bee793eb445d9592a40bc4d3c081519 Mon Sep 17 00:00:00 2001 From: Matthew Hendrey Date: Sun, 26 Jan 2025 06:59:25 -0500 Subject: [PATCH 12/36] [Frontend] generation_config.json for maximum tokens(#12242) Signed-off-by: Matthew Hendrey Signed-off-by: Shangming Cai Signed-off-by: youkaichao Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Yuan Tang Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: DarkLight1337 Signed-off-by: Chen Zhang Signed-off-by: wangxiyuan Co-authored-by: shangmingc Co-authored-by: youkaichao Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Yuan Tang Co-authored-by: Isotr0py Co-authored-by: Cyrus Leung Co-authored-by: Chen Zhang Co-authored-by: wangxiyuan --- tests/entrypoints/openai/test_serving_chat.py | 110 ++++++++++++++++++ vllm/config.py | 6 + vllm/engine/arg_utils.py | 4 +- vllm/entrypoints/openai/protocol.py | 34 ++++-- 4 files changed, 145 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 85f485364a411..e88d6c3c67829 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -103,6 +103,116 @@ def test_serving_chat_should_set_correct_max_tokens(): assert mock_engine.generate.call_args.args[1].max_tokens == 10 + # Setting server's max_tokens in the generation_config.json + # lower than context_window - prompt_tokens + mock_model_config = MockModelConfig() + mock_model_config.diff_sampling_param = { + "max_tokens": 10 # Setting server-side max_tokens limit + } + + # Reinitialize the engine with new settings + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test Case 1: No max_tokens specified in request + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" 
+ }], + guided_decoding_backend="outlines", + ) + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 10 + + # Test Case 2: Request's max_tokens set higher than server accepts + req.max_tokens = 15 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 10 + + # Test Case 3: Request's max_tokens set lower than server accepts + req.max_tokens = 5 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 5 + + # Setting server's max_tokens in the generation_config.json + # higher than context_window - prompt_tokens + mock_model_config = MockModelConfig() + mock_model_config.diff_sampling_param = { + "max_tokens": 200 # Setting server-side max_tokens limit + } + + # Reinitialize the engine with new settings + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test case 1: No max_tokens specified, defaults to context_window + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" + }], + guided_decoding_backend="outlines", + ) + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 93 + + # Test Case 2: Request's max_tokens set higher than server accepts + req.max_tokens = 100 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 93 + + # Test Case 3: Request's max_tokens set lower than server accepts + req.max_tokens = 5 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 5 + def test_serving_chat_could_load_correct_generation_config(): diff --git a/vllm/config.py b/vllm/config.py index 11c6f853b2b45..7a58d64bcc6e2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -910,12 +910,18 @@ def get_diff_sampling_param(self) -> Dict[str, Any]: "top_k", "top_p", "min_p", + "max_new_tokens", ] if any(p in config for p in available_params): diff_sampling_param = { p: config.get(p) for p in available_params if config.get(p) is not None } + # Huggingface definition of max_new_tokens is equivalent + # to vLLM's max_tokens + if "max_new_tokens" in diff_sampling_param: + diff_sampling_param["max_tokens"] = diff_sampling_param.pop( + "max_new_tokens") else: diff_sampling_param = {} return diff_sampling_param diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f16e8e6df76bd..ba96484e3fce9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -939,7 +939,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "Defaults to None, will use the default generation config in vLLM. " "If set to 'auto', the generation config will be automatically " "loaded from model. 
If set to a folder path, the generation config " - "will be loaded from the specified folder path.") + "will be loaded from the specified folder path. If " + "`max_new_tokens` is specified, then it sets a server-wide limit " + "on the number of output tokens for all requests.") parser.add_argument("--enable-sleep-mode", action="store_true", diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 80403f77d5375..6f546aaec442a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -380,13 +380,17 @@ def to_beam_search_params( ) -> BeamSearchParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + if (temperature := self.temperature) is None: temperature = default_sampling_params.get( "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]) @@ -406,11 +410,16 @@ def to_sampling_params( default_sampling_params: Optional[dict] = None) -> SamplingParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} + + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( @@ -740,13 +749,17 @@ def to_beam_search_params( default_sampling_params: Optional[dict] = None ) -> BeamSearchParams: max_tokens = self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + if (temperature := self.temperature) is None: temperature = default_sampling_params.get("temperature", 1.0) @@ -764,11 +777,16 @@ def to_sampling_params( logits_processor_pattern: Optional[str], default_sampling_params: Optional[dict] = None) -> SamplingParams: max_tokens = self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} + + # Use minimum of context window, user request & server limit. 
+ max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( From aa2cd2c43d1d19ece0f3b36ad716c3a9b8a2def0 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Sun, 26 Jan 2025 06:59:58 -0500 Subject: [PATCH 13/36] [Bugfix] Disable w16a16 2of4 sparse CompressedTensors24 (#12417) Signed-off-by: Tyler Michael Smith Co-authored-by: mgoin --- tests/kernels/test_cutlass.py | 28 +-- tests/kernels/test_cutlass_2of4_sparse.py | 214 ++++++++++++++++++ tests/kernels/test_semi_structured.py | 134 ----------- tests/kernels/utils.py | 27 ++- tests/quantization/test_compressed_tensors.py | 4 +- .../compressed_tensors/compressed_tensors.py | 25 +- 6 files changed, 263 insertions(+), 169 deletions(-) create mode 100644 tests/kernels/test_cutlass_2of4_sparse.py delete mode 100644 tests/kernels/test_semi_structured.py diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index afe53797322f9..c3eddacec2727 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -2,7 +2,7 @@ Run `pytest tests/kernels/test_cutlass.py`. """ -from typing import Optional, Type +from typing import Type import pytest import torch @@ -11,6 +11,8 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +from .utils import baseline_scaled_mm, to_fp8, to_int8 + MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), @@ -41,34 +43,10 @@ capability = capability[0] * 10 + capability[1] -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor): - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - def rand_int8(shape: tuple, device: str = "cuda"): return to_int8(torch.rand(shape, device=device) * 255 - 128) -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = (scale_a * (scale_b * (torch.mm( - a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) - if bias is not None: - output = output + bias - - return output - - def cutlass_fp8_gemm_helper(m: int, n: int, k: int, diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py new file mode 100644 index 0000000000000..56495df34aa6c --- /dev/null +++ b/tests/kernels/test_cutlass_2of4_sparse.py @@ -0,0 +1,214 @@ +"""Tests for sparse cutlass kernels + +Run `pytest tests/kernels/test_semi_structured.py`. 
+""" +from typing import Tuple, Type + +import pytest +import torch +import torch.nn.functional as F + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + sparse_cutlass_supported) +from vllm.platforms import current_platform + +from .utils import baseline_scaled_mm, to_fp8, to_int8 + +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +capability = current_platform.get_device_capability() +capability = capability[0] * 10 + capability[1] + + +def to_bf16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.bfloat16) + + +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.float16) + + +def prune_to_2_4(tensor): + # Reshape tensor to [N, 4] where N is number of groups of 4 + original_shape = tensor.shape + reshaped = tensor.reshape(-1, 4) + + # Get indices of top 2 absolute values in each group of 4 + _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) + + # Create binary mask + mask = torch.zeros_like(reshaped) + mask.scatter_(dim=1, + index=indices, + src=torch.ones_like(indices, dtype=mask.dtype)) + + # Apply mask and reshape back + pruned = reshaped * mask + + # Turn all -0.0 to 0.0 + pruned[pruned == -0.0] = 0.0 + + return pruned.reshape(original_shape) + + +def make_rand_sparse_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + b = prune_to_2_4(b.t()).t() + + if dtype == torch.int8: + a, b = to_int8(a), to_int8(b) + elif dtype == torch.float8_e4m3fn: + a, b = to_fp8(a), to_fp8(b) + elif dtype == torch.float16: + a, b = to_fp16(a), to_fp16(b) + elif dtype == torch.bfloat16: + a, b = to_bf16(a), to_bf16(b) + else: + raise ValueError("unsupported dtype") + + b_compressed, e = ops.cutlass_sparse_compress(b.t()) + + # Compressed B, Metadata, Original A, B + return b_compressed, e, a, b + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +# Test working with a subset of A and B for sparse matmul +def test_cutlass_sparse_subset(): + + big_m = 1024 + m, n, k = 512, 512, 512 + + # Create tensors + b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, + big_m, n, k) + a = whole_a[0:m, 0:k] + scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 + scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 512), + (16, 256, 512), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 512), + (64, 16384, 1024), + (100, 8192, 512), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + + +# Test working with a subset of A and B for sparse matmul +@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m, k, n", MNK_FACTORS) 
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) + scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32) + scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=dtype) + baseline = F.linear(a, b.T) + + torch.testing.assert_close(out, baseline, rtol=1e-2, atol=1e-2) + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m, k, n", MNK_FACTORS) +@pytest.mark.skipif(not current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.") +def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m,k,n", MNK_FACTORS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool, + per_out_ch: bool, use_bias: bool): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) + scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py deleted file mode 100644 index 4316d6ab30e33..0000000000000 --- a/tests/kernels/test_semi_structured.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Tests for sparse cutlass kernels - -Run `pytest tests/kernels/test_semi_structured.py`. 
-""" -from typing import Optional, Tuple, Type - -import pytest -import torch - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - sparse_cutlass_supported) -from vllm.platforms import current_platform - -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - -capability = current_platform.get_device_capability() -capability = capability[0] * 10 + capability[1] - - -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor): - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - -def rand_int8(shape: tuple, device: str = "cuda"): - return to_int8(torch.rand(shape, device=device) * 255 - 128) - - -def to_bf16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.bfloat16) - - -def to_fp16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.float16) - - -def prune_to_2_4(tensor): - # Reshape tensor to [N, 4] where N is number of groups of 4 - original_shape = tensor.shape - reshaped = tensor.reshape(-1, 4) - - # Get indices of top 2 absolute values in each group of 4 - _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) - - # Create binary mask - mask = torch.zeros_like(reshaped) - mask.scatter_(dim=1, - index=indices, - src=torch.ones_like(indices, dtype=mask.dtype)) - - # Apply mask and reshape back - pruned = reshaped * mask - - # Turn all -0.0 to 0.0 - pruned[pruned == -0.0] = 0.0 - - return pruned.reshape(original_shape) - - -def make_rand_sparse_tensors( - dtype: torch.dtype, m: int, n: int, k: int -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device='cuda') * 5 - b = torch.randn((n, k), device='cuda').t() * 5 - - b = prune_to_2_4(b.t()).t() - - if dtype == torch.int8: - a, b = to_int8(a), to_int8(b) - elif dtype == torch.float8_e4m3fn: - a, b = to_fp8(a), to_fp8(b) - elif dtype == torch.float16: - a, b = to_fp16(a), to_fp16(b) - elif dtype == torch.bfloat16: - a, b = to_bf16(a), to_bf16(b) - else: - raise ValueError("unsupported dtype") - - b_compressed, e = ops.cutlass_sparse_compress(b.t()) - - # Compressed B, Metadata, Original A, B - return b_compressed, e, a, b - - -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = (scale_a * (scale_b * (torch.mm( - a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) - if bias is not None: - output = output + bias - - return output - - -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -# Test working with a subset of A and B for sparse matmul -def test_cutlass_sparse_subset(): - - big_m = 1024 - m, n, k = 512, 512, 512 - - # Create tensors - b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, - big_m, n, k) - a = whole_a[0:m, 0:k] - scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - - out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - - torch.testing.assert_close(out, baseline, rtol=1e-1, 
atol=1e0) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 8011398551b9d..fb2c9f5d30583 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -5,7 +5,7 @@ import unittest from numbers import Number from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, - Union) + Type, Union) import pytest import torch @@ -1100,3 +1100,28 @@ def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket, kwargs, test_utils=test_utils, raise_exception=raise_exception) if cond else {} + + +# For testing quantized linear kernels +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor): + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def baseline_scaled_mm(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + output = (scale_a * (scale_b * (torch.mm( + a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) + if bias is not None: + output = output + bias + + return output diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 0cd86cef0a475..bf0d454ad511c 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -313,8 +313,10 @@ def check_model(model): assert output +@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") @pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") + reason="2of4 Sparse is not yet supported on this GPU type." 
+ ) @pytest.mark.parametrize( "args_2of4", [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b2fc2360f47f1..dd2dd02eaf723 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -9,6 +9,7 @@ QuantizationType) from pydantic import BaseModel +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) @@ -27,6 +28,8 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import current_platform +logger = init_logger(__name__) + __all__ = ["CompressedTensorsLinearMethod"] SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config" @@ -79,6 +82,8 @@ def get_quant_method( return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) + if scheme is None: + return UnquantizedLinearMethod() layer.scheme = scheme return CompressedTensorsLinearMethod(self) if isinstance(layer, Attention): @@ -340,10 +345,10 @@ def _get_scheme_from_parts( raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme( - self, - layer: torch.nn.Module, - layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + def get_scheme(self, + layer: torch.nn.Module, + layer_name: Optional[str] = None + ) -> Optional["CompressedTensorsScheme"]: """ compressed-tensors supports non uniform in the following way: @@ -353,10 +358,7 @@ def get_scheme( which can be a full layer_name, a regex for a layer_name, or an nn.Module name. - We first check whether a layer is in the ignore group and use - CompressedTensorsUnquantized (i.e. fp16/bf16) scheme for the layer - - We then detect whether a layer_name is found in any target and + Detect whether a layer_name is found in any target and use the quantization scheme corresponding to the matched target to select the CompressedTensorsScheme used for infernece. """ @@ -394,6 +396,13 @@ def get_scheme( if self.supports_cutlass_24(weight_quant=weight_quant, input_quant=input_quant, sparsity_scheme=sparsity_scheme): + # FIXME(tlrmchlsmth): layers using W16A16 CUTLASS 2:4 sparse kernels + # currently produce bad output in some cases + if weight_quant is None: + logger.warning_once( + "CompressedTensors24 scheme is disabled for the w16a16 " + "case. 
Falling back to UnquantizedLinearMethod") + return None # Have a valid sparsity scheme # Validate layer is supported by Cutlass 2:4 Kernel scheme = CompressedTensors24(quantized=weight_quant is not None From 72f4880425edf06f105863b2389f9c46025e08ee Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Sun, 26 Jan 2025 13:39:03 -0500 Subject: [PATCH 14/36] [Bugfix/CI] Fix broken kernels/test_mha.py (#12450) --- tests/kernels/test_mha_attn.py | 4 ++-- vllm/attention/layer.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py index 22d434f5e40ef..eab874e9e02bb 100644 --- a/tests/kernels/test_mha_attn.py +++ b/tests/kernels/test_mha_attn.py @@ -26,7 +26,7 @@ def clear_cache(): @pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) def test_mha_attn_platform(device: str): """ - Test that the attention selector between different platform and device. + Test the attention selector between different platform and device. """ torch.set_default_dtype(torch.float16) @@ -41,7 +41,7 @@ def test_mha_attn_platform(device: str): else: with patch("vllm.attention.selector.current_platform", CudaPlatform()): attn = MultiHeadAttention(16, 64, scale=1) - assert attn.attn_backend == _Backend.FLASH_ATTN + assert attn.attn_backend == _Backend.XFORMERS with patch("vllm.attention.selector.current_platform", CudaPlatform()): attn = MultiHeadAttention(16, 72, scale=1) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index da663d894aeb3..962c45a65ae23 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -210,6 +210,9 @@ def __init__( self.scale = scale self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + dtype = torch.get_default_dtype() attn_backend = get_attn_backend(head_size, dtype, @@ -240,6 +243,11 @@ def forward( key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) + if (num_repeat := self.num_queries_per_kv) > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_repeat, dim=2) + value = torch.repeat_interleave(value, num_repeat, dim=2) + if self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops From 68f11149d845a164c9bbf122ab3bee8c94290169 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 26 Jan 2025 14:09:34 -0500 Subject: [PATCH 15/36] [Bugfix][Kernel] Fix perf regression caused by PR #12405 (#12434) Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c954731bf94ef..921f5dc7de9cb 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -576,7 +576,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 9732b0ce005d1e6216864788502d5570004678f5 + GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 72bac7306796b01c202d846da041f62ded3a26a9 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Sun, 26 Jan 2025 16:18:19 -0500 Subject: [PATCH 16/36] [Build/CI] Fix libcuda.so linkage (#12424) Signed-off-by: Tyler Michael Smith --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 
921f5dc7de9cb..ead539993d98c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -446,6 +446,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() message(STATUS "Enabling C extension.") +if(VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_C_LIBS cuda) +endif() define_gpu_extension_target( _C DESTINATION vllm @@ -454,6 +457,7 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + LIBRARIES ${VLLM_C_LIBS} USE_SABI 3 WITH_SOABI) From 0034b09ceb7f578f2d097c7fb8c7042d17367c35 Mon Sep 17 00:00:00 2001 From: Kyle Mistele Date: Sun, 26 Jan 2025 20:58:45 -0600 Subject: [PATCH 17/36] [Frontend] Rerank API (Jina- and Cohere-compatible API) (#12376) Signed-off-by: Kyle Mistele --- .../serving/openai_compatible_server.md | 92 ++++++++ .../online_serving/cohere_rerank_client.py | 32 +++ .../online_serving/jinaai_rerank_client.py | 33 +++ tests/entrypoints/openai/test_rerank.py | 87 ++++++++ tests/entrypoints/openai/test_score.py | 7 +- vllm/entrypoints/openai/api_server.py | 51 ++++- vllm/entrypoints/openai/protocol.py | 46 ++++ vllm/entrypoints/openai/serving_engine.py | 9 +- vllm/entrypoints/openai/serving_rerank.py | 206 ++++++++++++++++++ 9 files changed, 552 insertions(+), 11 deletions(-) create mode 100644 examples/online_serving/cohere_rerank_client.py create mode 100644 examples/online_serving/jinaai_rerank_client.py create mode 100644 tests/entrypoints/openai/test_rerank.py create mode 100644 vllm/entrypoints/openai/serving_rerank.py diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index e49bbb06695f8..8bc234545befd 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -50,6 +50,11 @@ In addition, we have the following custom APIs: - Applicable to all [pooling models](../models/pooling_models.md). - [Score API](#score-api) (`/score`) - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). +- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`) + - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/) + - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) + - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= @@ -473,3 +478,90 @@ The following extra parameters are supported: :start-after: begin-score-extra-params :end-before: end-score-extra-params ``` + +(rerank-api)= + +### Re-rank API + +Our Re-rank API applies a cross-encoder model to predict relevant scores between a single query, and +each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on +a scale of 0 to 1. + +You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). + +The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the +`score` task. 
Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank` +endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and +[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with +popular open-source tools. + +Code example: + +#### Example Request + +Note that the `top_n` request parameter is optional and will default to the length of the `documents` field. +Result documents will be sorted by relevance, and the `index` property can be used to determine original order. + +Request: + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/rerank' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-base", + "query": "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + "Horses and cows are both animals" + ] +}' +``` + +Response: + +```bash +{ + "id": "rerank-fae51b2b664d4ed38f5969b612edff77", + "model": "BAAI/bge-reranker-base", + "usage": { + "total_tokens": 56 + }, + "results": [ + { + "index": 1, + "document": { + "text": "The capital of France is Paris." + }, + "relevance_score": 0.99853515625 + }, + { + "index": 0, + "document": { + "text": "The capital of Brazil is Brasilia." + }, + "relevance_score": 0.0005860328674316406 + } + ] +} +``` + +#### Extra parameters + +The following [pooling parameters](#pooling-params) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-rerank-pooling-params +:end-before: end-rerank-pooling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-rerank-extra-params +:end-before: end-rerank-extra-params +``` diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py new file mode 100644 index 0000000000000..a07affe3351ce --- /dev/null +++ b/examples/online_serving/cohere_rerank_client.py @@ -0,0 +1,32 @@ +""" +Example of using the OpenAI entrypoint's rerank API which is compatible with +the Cohere SDK: https://github.com/cohere-ai/cohere-python + +run: vllm serve BAAI/bge-reranker-base +""" +import cohere + +# cohere v1 client +co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") +rerank_v1_result = co.rerank( + model="BAAI/bge-reranker-base", + query="What is the capital of France?", + documents=[ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" + ]) + +print(rerank_v1_result) + +# or the v2 +co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000") + +v2_rerank_result = co2.rerank( + model="BAAI/bge-reranker-base", + query="What is the capital of France?", + documents=[ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" + ]) + +print(v2_rerank_result) diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py new file mode 100644 index 0000000000000..bf4de76ddf362 --- /dev/null +++ b/examples/online_serving/jinaai_rerank_client.py @@ -0,0 +1,33 @@ +""" +Example of using the OpenAI entrypoint's rerank API which is compatible with +Jina and Cohere https://jina.ai/reranker + +run: vllm serve BAAI/bge-reranker-base +""" +import json + +import requests + +url = "http://127.0.0.1:8000/rerank" + 
+headers = {"accept": "application/json", "Content-Type": "application/json"} + +data = { + "model": + "BAAI/bge-reranker-base", + "query": + "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", "Horses and cows are both animals" + ] +} +response = requests.post(url, headers=headers, json=data) + +# Check the response +if response.status_code == 200: + print("Request successful!") + print(json.dumps(response.json(), indent=2)) +else: + print(f"Request failed with status code: {response.status_code}") + print(response.text) diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py new file mode 100644 index 0000000000000..cfd8f33133960 --- /dev/null +++ b/tests/entrypoints/openai/test_rerank.py @@ -0,0 +1,87 @@ +import pytest +import requests + +from vllm.entrypoints.openai.protocol import RerankResponse + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "BAAI/bge-reranker-base" + + +@pytest.fixture(scope="module") +def server(): + args = ["--enforce-eager", "--max-model-len", "100"] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank_texts(server: RemoteOpenAIServer, model_name: str): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + }) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_top_n(server: RemoteOpenAIServer, model_name: str): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", "Cross-encoder models are neat" + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + "top_n": 2 + }) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str): + + query = "What is the capital of France?" * 100 + documents = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents + }) + assert rerank_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input." 
in \ + rerank_response.text \ No newline at end of file diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 06e0f93dbe269..0d19615bc0d99 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -10,12 +10,7 @@ @pytest.fixture(scope="module") def server(): - args = [ - "--enforce-eager", - # Will be used on tests to compare prompt input length - "--max-model-len", - "100" - ] + args = ["--enforce-eager", "--max-model-len", "100"] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f510c41503011..45cf06566faaa 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -56,6 +56,7 @@ PoolingChatRequest, PoolingCompletionRequest, PoolingRequest, PoolingResponse, + RerankRequest, RerankResponse, ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, @@ -68,6 +69,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling +from vllm.entrypoints.openai.serving_rerank import JinaAIServingRerank from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) @@ -306,6 +308,10 @@ def score(request: Request) -> Optional[OpenAIServingScores]: return request.app.state.openai_serving_scores +def rerank(request: Request) -> Optional[JinaAIServingRerank]: + return request.app.state.jinaai_serving_reranking + + def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization @@ -502,6 +508,40 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) +@router.post("/rerank") +@with_cancellation +async def do_rerank(request: RerankRequest, raw_request: Request): + handler = rerank(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Rerank (Score) API") + generator = await handler.do_rerank(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, RerankResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/v1/rerank") +@with_cancellation +async def do_rerank_v1(request: RerankRequest, raw_request: Request): + logger.warning( + "To indicate that the rerank API is not part of the standard OpenAI" + " API, we have located it at `/rerank`. Please update your client" + "accordingly. 
(Note: Conforms to JinaAI rerank API)") + + return await do_rerank(request, raw_request) + + +@router.post("/v2/rerank") +@with_cancellation +async def do_rerank_v2(request: RerankRequest, raw_request: Request): + return await do_rerank(request, raw_request) + + TASK_HANDLERS: Dict[str, Dict[str, tuple]] = { "generate": { "messages": (ChatCompletionRequest, create_chat_completion), @@ -512,7 +552,10 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): "default": (EmbeddingCompletionRequest, create_embedding), }, "score": { - "default": (ScoreRequest, create_score), + "default": (RerankRequest, do_rerank) + }, + "rerank": { + "default": (RerankRequest, do_rerank) }, "reward": { "messages": (PoolingChatRequest, create_pooling), @@ -759,6 +802,12 @@ async def init_app_state( state.openai_serving_models, request_logger=request_logger ) if model_config.task == "score" else None + state.jinaai_serving_reranking = JinaAIServingRerank( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger + ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6f546aaec442a..311c16c538f34 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1018,6 +1018,52 @@ def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) +class RerankRequest(OpenAIBaseModel): + model: str + query: str + documents: List[str] + top_n: int = Field(default_factory=lambda: 0) + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-rerank-pooling-params + additional_data: Optional[Any] = None + # doc: end-rerank-pooling-params + + # doc: begin-rerank-extra-params + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). 
Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + + # doc: end-rerank-extra-params + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + +class RerankDocument(BaseModel): + text: str + + +class RerankResult(BaseModel): + index: int + document: RerankDocument + relevance_score: float + + +class RerankUsage(BaseModel): + total_tokens: int + + +class RerankResponse(OpenAIBaseModel): + id: str + model: str + usage: RerankUsage + results: List[RerankResult] + + class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 3da447be06430..8d54164e500eb 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -26,7 +26,8 @@ DetokenizeRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, - ErrorResponse, ScoreRequest, + ErrorResponse, RerankRequest, + ScoreRequest, TokenizeChatRequest, TokenizeCompletionRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels @@ -204,9 +205,9 @@ def _validate_input( token_num = len(input_ids) # Note: EmbeddingRequest and ScoreRequest doesn't have max_tokens - if isinstance( - request, - (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)): + if isinstance(request, + (EmbeddingChatRequest, EmbeddingCompletionRequest, + ScoreRequest, RerankRequest)): operation = "score" if isinstance(request, ScoreRequest) \ else "embedding generation" diff --git a/vllm/entrypoints/openai/serving_rerank.py b/vllm/entrypoints/openai/serving_rerank.py new file mode 100644 index 0000000000000..be4420261afe3 --- /dev/null +++ b/vllm/entrypoints/openai/serving_rerank.py @@ -0,0 +1,206 @@ +import asyncio +from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument, + RerankRequest, RerankResponse, + RerankResult, RerankUsage) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.utils import make_async, merge_async_iterators + +logger = init_logger(__name__) + + +class JinaAIServingRerank(OpenAIServing): + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + ) -> None: + super().__init__(engine_client=engine_client, + model_config=model_config, + models=models, + request_logger=request_logger) + + async def do_rerank( + self, + request: RerankRequest, + raw_request: Optional[Request] = None + ) -> Union[RerankResponse, ErrorResponse]: + """ + Rerank API based on JinaAI's rerank API; implements the same + API interface. 
Designed for compatibility with off-the-shelf + tooling, since this is a common standard for reranking APIs + + See example client implementations at + https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py + numerous clients use this standard. + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + model_name = request.model + request_id = f"rerank-{self._base_request_id(raw_request)}" + truncate_prompt_tokens = request.truncate_prompt_tokens + query = request.query + documents = request.documents + request_prompts = [] + engine_prompts = [] + top_n = request.top_n if request.top_n > 0 else len(documents) + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for scoring models") + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "MistralTokenizer not supported for cross-encoding") + + if not self.model_config.is_cross_encoder: + raise ValueError("Model is not cross encoder.") + + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({self.max_model_len})." + f" Please, select a smaller truncation size.") + for doc in documents: + request_prompt = f"{query}{tokenizer.sep_token}{doc}" + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=query, + text_pair=doc, + **tokenization_kwargs) + + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + # Schedule the request and get the result generator. 
+ generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + + try: + pooling_params = request.to_pooling_params() + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs(request_id_item, + request_prompts[i], + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + result_generator = merge_async_iterators(*generators) + + num_prompts = len(engine_prompts) + + # Non-streaming response + final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch = [None] * num_prompts + + try: + async for i, res in result_generator: + final_res_batch[i] = res + + assert all(final_res is not None for final_res in final_res_batch) + + final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch) + + response = self.request_output_to_rerank_response( + final_res_batch_checked, request_id, model_name, documents, + top_n) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + return response + + def request_output_to_rerank_response( + self, final_res_batch: List[PoolingRequestOutput], request_id: str, + model_name: str, documents: List[str], + top_n: int) -> RerankResponse: + """ + Convert the output of do_rank to a RerankResponse + """ + results: List[RerankResult] = [] + num_prompt_tokens = 0 + for idx, final_res in enumerate(final_res_batch): + classify_res = ScoringRequestOutput.from_base(final_res) + + result = RerankResult( + index=idx, + document=RerankDocument(text=documents[idx]), + relevance_score=classify_res.outputs.score, + ) + results.append(result) + prompt_token_ids = final_res.prompt_token_ids + num_prompt_tokens += len(prompt_token_ids) + + # sort by relevance, then return the top n if set + results.sort(key=lambda x: x.relevance_score, reverse=True) + if top_n < len(documents): + results = results[:top_n] + + return RerankResponse( + id=request_id, + model=model_name, + results=results, + usage=RerankUsage(total_tokens=num_prompt_tokens)) From 582cf78798a6fef9b69d0471df73d81e09a7d3d8 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sun, 26 Jan 2025 21:46:19 -0600 Subject: [PATCH 18/36] [DOC] Add link to vLLM blog (#12460) Signed-off-by: Yuan Tang --- docs/source/community/blog.md | 3 +++ docs/source/index.md | 1 + 2 files changed, 4 insertions(+) create mode 100644 docs/source/community/blog.md diff --git a/docs/source/community/blog.md b/docs/source/community/blog.md new file mode 100644 index 0000000000000..e8030edfa02ee --- /dev/null +++ b/docs/source/community/blog.md @@ -0,0 +1,3 @@ +# vLLM Blog + +vLLM blog posts are published [here](https://blog.vllm.ai/). 
diff --git a/docs/source/index.md b/docs/source/index.md index d7a1117df9c27..2c302d3f3e863 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -184,6 +184,7 @@ api/model/index :caption: Community :maxdepth: 1 +community/blog community/meetups community/sponsors ``` From 28e0750847ded93158a66efdcbc869d87463b38f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 26 Jan 2025 19:57:56 -0800 Subject: [PATCH 19/36] [V1] Avoid list creation in input preparation (#12457) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6339f1f03f11d..9d7e30079dfbb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -171,7 +171,8 @@ def __init__( # OPTIMIZATION: Cache the tensors rather than creating them every step. self.arange_np = np.arange(max(self.max_num_reqs + 1, - self.max_model_len), + self.max_model_len, + self.max_num_tokens), dtype=np.int32) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. Thus, we should @@ -358,8 +359,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Get batched arange. # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - arange = np.concatenate( - [self.arange_np[:n] for n in num_scheduled_tokens]) + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_scheduled_tokens) + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, + num_scheduled_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets # Get positions. positions_np = self.positions_np[:total_num_scheduled_tokens] @@ -406,8 +414,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Prepare the attention metadata. self.query_start_loc_np[0] = 0 - np.cumsum(num_scheduled_tokens, - out=self.query_start_loc_np[1:num_reqs + 1]) + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens self.seq_lens_np[:num_reqs] = ( self.input_batch.num_computed_tokens_cpu[:num_reqs] + From 0cc6b383d73eb662dfeec671d3b47cda301b2f47 Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Sun, 26 Jan 2025 20:30:17 -0800 Subject: [PATCH 20/36] [Frontend] Support scores endpoint in run_batch (#12430) Signed-off-by: Pooya Davoodi --- .../offline_inference/openai/openai_batch.md | 33 ++++++++++++++++- tests/entrypoints/openai/test_run_batch.py | 37 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 5 ++- vllm/entrypoints/openai/run_batch.py | 31 ++++++++++++++-- 4 files changed, 99 insertions(+), 7 deletions(-) diff --git a/examples/offline_inference/openai/openai_batch.md b/examples/offline_inference/openai/openai_batch.md index a4774e57cd9a5..953e6ef130f18 100644 --- a/examples/offline_inference/openai/openai_batch.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -13,7 +13,7 @@ The OpenAI batch file format consists of a series of json objects on new lines. Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. ```{note} -We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon). 
+We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon). ``` ## Pre-requisites @@ -203,3 +203,34 @@ $ cat results.jsonl {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} ... ``` + +## Example 5: Using score endpoint + +### Additional prerequisites + +* Ensure you are using `vllm >= 0.7.0`. + +### Step 1: Create your batch file + +Add score requests to your batch file. The following is an example: + +``` +{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +``` + +You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model). + +### Step 2: Run the batch + +You can run the batch using the same command as in earlier examples. + +### Step 3: Check your results + +You can check your results by running `cat results.jsonl` + +``` +$ cat results.jsonl +{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} +{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} +``` diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 097d6b1a32349..1f8a56bb43ac6 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1,3 +1,4 @@ +import json import subprocess import sys import tempfile @@ -21,6 +22,9 @@ {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}} {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""" +INPUT_SCORE_BATCH = """{"custom_id": "request-1", 
"method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" + def test_empty_file(): with tempfile.NamedTemporaryFile( @@ -102,3 +106,36 @@ def test_embeddings(): # Ensure that the output format conforms to the openai api. # Validation should throw if the schema is wrong. BatchRequestOutput.model_validate_json(line) + + +def test_score(): + with tempfile.NamedTemporaryFile( + "w") as input_file, tempfile.NamedTemporaryFile( + "r") as output_file: + input_file.write(INPUT_SCORE_BATCH) + input_file.flush() + proc = subprocess.Popen([ + sys.executable, + "-m", + "vllm.entrypoints.openai.run_batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "BAAI/bge-reranker-v2-m3", + ], ) + proc.communicate() + proc.wait() + assert proc.returncode == 0, f"{proc=}" + + contents = output_file.read() + for line in contents.strip().split("\n"): + # Ensure that the output format conforms to the openai api. + # Validation should throw if the schema is wrong. + BatchRequestOutput.model_validate_json(line) + + # Ensure that there is no error in the response. + line_dict = json.loads(line) + assert isinstance(line_dict, dict) + assert line_dict["error"] is None diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 311c16c538f34..f89c3f42aab17 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1283,7 +1283,7 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameters of the request. - body: Union[ChatCompletionRequest, EmbeddingRequest] + body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest] class BatchResponseData(OpenAIBaseModel): @@ -1294,7 +1294,8 @@ class BatchResponseData(OpenAIBaseModel): request_id: str # The body of the response. 
- body: Optional[Union[ChatCompletionResponse, EmbeddingResponse]] = None + body: Optional[Union[ChatCompletionResponse, EmbeddingResponse, + ScoreResponse]] = None class BatchRequestOutput(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f8f136f9d5024..37ae23506acea 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -16,12 +16,14 @@ BatchRequestOutput, BatchResponseData, ChatCompletionResponse, - EmbeddingResponse, ErrorResponse) + EmbeddingResponse, ErrorResponse, + ScoreResponse) # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) +from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -167,7 +169,8 @@ async def run_request(serving_engine_func: Callable, tracker: BatchProgressTracker) -> BatchRequestOutput: response = await serving_engine_func(request.body) - if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)): + if isinstance(response, + (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, @@ -239,6 +242,12 @@ async def main(args): chat_template=None, chat_template_content_format="auto", ) if model_config.task == "embed" else None + openai_serving_scores = (OpenAIServingScores( + engine, + model_config, + openai_serving_models, + request_logger=request_logger, + ) if model_config.task == "score" else None) tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) @@ -279,14 +288,28 @@ async def main(args): )) continue + response_futures.append(run_request(handler_fn, request, tracker)) + tracker.submitted() + elif request.url == "/v1/score": + handler_fn = (None if openai_serving_scores is None else + openai_serving_scores.create_score) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support Scores API", + )) + continue + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() else: response_futures.append( make_async_error_request_output( request, - error_msg="Only /v1/chat/completions and " - "/v1/embeddings are supported in the batch endpoint.", + error_msg= + "Only /v1/chat/completions, /v1/embeddings, and /v1/score " + "are supported in the batch endpoint.", )) with tracker.pbar(): From 5204ff5c3feeb96e8a6eea65dfcb78395f90d4d8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 27 Jan 2025 13:26:44 +0800 Subject: [PATCH 21/36] [Bugfix] Fix Granite 3.0 MoE model loading (#12446) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/granitemoe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 51296ef0cc08e..b518a0a6cbdee 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -348,6 +348,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config + self.quant_config = quant_config # Required by MixtralForCausalLM self.model = 
GraniteMoeModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) From 372bf0890b19cc3c2992ce5c16eca3647e2a9e13 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 27 Jan 2025 15:25:30 +0800 Subject: [PATCH 22/36] [Bugfix] Fix missing seq_start_loc in xformers prefill metadata (#12464) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/attention/backends/xformers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8c25dda7aad2c..49f47f9c8ded3 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -199,6 +199,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: # Compute some attn_metadata fields which default to None query_start_loc = (None if self.query_start_loc is None else self.query_start_loc[:self.num_prefills + 1]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) slot_mapping = (None if self.slot_mapping is None else self.slot_mapping[:self.num_prefill_tokens]) seq_lens = (None if self.seq_lens is None else @@ -225,6 +227,7 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: max_prefill_seq_len=self.max_prefill_seq_len, max_decode_seq_len=0, query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, From 624a1e4711cb9cfdd7e336980668e64744a84863 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 27 Jan 2025 01:09:27 -0800 Subject: [PATCH 23/36] [V1][Minor] Minor optimizations for update_from_output (#12454) Signed-off-by: Woosuk Kwon --- vllm/v1/core/scheduler.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 8ded5e5787133..de7fb1a698df6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -411,6 +411,10 @@ def update_from_output( num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] outputs: List[EngineCoreOutput] = [] + + # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below + # loop can be a performance bottleneck. We should do our best to avoid + # expensive operations inside the loop. for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -421,13 +425,15 @@ def update_from_output( cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) + # OPTIMIZATION: Avoid list(set) if the set is empty. + if cached_encoder_input_ids: + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
+ self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] From ce69f7f7542bdb8b6e6302d112fb9fad212c1460 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 27 Jan 2025 18:31:49 +0800 Subject: [PATCH 24/36] [Bugfix] Fix gpt2 GGUF inference (#12467) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/gpt2.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 1656a3cc9e46d..2f1aa2d68653c 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -258,13 +258,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "transformer")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head") if self.config.tie_word_embeddings: - self.lm_head = self.transformer.wte - else: - self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.lm_head") + self.lm_head = self.lm_head.tie_weights(self.transformer.wte) + self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( @@ -309,15 +309,12 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if name.startswith("lm_head"): - # GPT-2 ties the weights of the embedding layer and the final - # linear layer. - continue if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. # NOTE: "c_attn.bias" should not be skipped. continue - if not name.startswith("transformer."): + if not name.startswith("transformer.") and not name.startswith( + "lm_head"): name = "transformer." + name if is_pp_missing_parameter(name, self): From 103bd17ac585b44372a47f365d80f13446cf362d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 27 Jan 2025 10:40:00 -0500 Subject: [PATCH 25/36] [Build] Only build 9.0a for scaled_mm and sparse kernels (#12339) Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 8 ++++---- cmake/utils.cmake | 43 ++++++++++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ead539993d98c..4dee9ec36895f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -275,7 +275,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -296,8 +296,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). - cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") + # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). 
+ cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") set_gencode_flags_for_srcs( @@ -351,7 +351,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now). + # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 15b09395a889f..1c1c539819d05 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -259,7 +259,7 @@ endmacro() # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is # in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add -# 9.0a to the result. +# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. # # Example: @@ -270,34 +270,47 @@ endmacro() # function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) + set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS set(_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS) list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS) + if ("9.0" IN_LIST TGT_CUDA_ARCHS_) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0") set(_CUDA_ARCHS "9.0a") endif() endif() list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) - # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is - # less or eqault to ARCH - foreach(_ARCH ${CUDA_ARCHS}) - set(_TMP_ARCH) - foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) - if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) - set(_TMP_ARCH ${_SRC_ARCH}) - else() - break() + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that + # is less or equal to ARCH (but has the same major version since SASS binary + # compatibility is only forward compatible within the same major version). 
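+  #
+  # Illustrative walk-through (editorial sketch, not from the upstream patch):
+  #   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0a"  TGT_CUDA_ARCHS="8.9;9.0"
+  #   9.0a is kept by the special case above, and 8.9 selects 8.6 (the highest
+  #   8.x entry <= 8.9 with the same major version), so OUT_CUDA_ARCHS
+  #   becomes "9.0a;8.6".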
+ foreach(_ARCH ${TGT_CUDA_ARCHS_}) + set(_TMP_ARCH) + # Extract the major version of the target arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") + foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + # Extract the major version of the source arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") + # Check major-version match AND version-less-or-equal + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + set(_TMP_ARCH "${_SRC_ARCH}") + endif() + else() + # If we hit a version greater than the target, we can break + break() + endif() + endforeach() + + # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS "${_TMP_ARCH}") endif() endforeach() - if (_TMP_ARCH) - list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) - endif() - endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) From 01ba927040d0b6f7d8daf6bfbf32fde562d2f8a6 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 27 Jan 2025 17:26:28 +0000 Subject: [PATCH 26/36] [V1][Metrics] Add initial Prometheus logger (#12416) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 41 ++++++++++++++++++++---- vllm/v1/engine/async_llm.py | 11 ++++--- vllm/v1/metrics/loggers.py | 36 +++++++++++++++++++++ 3 files changed, 78 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6523c8b6297c6..469a5fb039fb6 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -16,6 +16,24 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +@pytest.fixture(scope="module", params=[True, False]) +def use_v1(request): + # Module-scoped variant of run_with_both_engines + # + # Use this fixture to run a test with both v0 and v1, and + # also to conditionalize the test logic e.g. + # + # def test_metrics_exist(use_v1, server, client): + # ... + # expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS + # for metric in expected: + # assert metric in response.text + # + # @skip_v1 wouldn't work here because this is a module-level + # fixture - per-function decorators would have no effect + yield request.param + + @pytest.fixture(scope="module") def default_server_args(): return [ @@ -36,10 +54,12 @@ def default_server_args(): "--enable-chunked-prefill", "--disable-frontend-multiprocessing", ]) -def server(default_server_args, request): +def server(use_v1, default_server_args, request): if request.param: default_server_args.append(request.param) - with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0') + with RemoteOpenAIServer(MODEL_NAME, default_server_args, + env_dict=env_dict) as remote_server: yield remote_server @@ -84,7 +104,9 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. 
await client.completions.create( @@ -174,10 +196,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "swap_space_bytes", ] +EXPECTED_METRICS_V1 = [ + "vllm:num_requests_running", + "vllm:num_requests_waiting", +] + @pytest.mark.asyncio async def test_metrics_exist(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", @@ -187,11 +214,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer, response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in EXPECTED_METRICS: + for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): assert metric in response.text -def test_metrics_exist_run_batch(): +def test_metrics_exist_run_batch(use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 base_url = "0.0.0.0" diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6dc68b3a16099..917d52d3220b8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -24,7 +24,8 @@ from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, + StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -46,13 +47,15 @@ def __init__( assert start_engine_loop + self.model_config = vllm_config.model_config + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - # TODO(rob): PrometheusStatLogger(), + PrometheusStatLogger(labels=dict( + model_name=self.model_config.served_model_name)), ] - self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -272,7 +275,7 @@ async def _run_output_handler(self): # 4) Logging. # TODO(rob): make into a coroutine and launch it in - # background thread once we add Prometheus. + # background thread once Prometheus overhead is non-trivial. 
assert iteration_stats is not None self._log_stats( scheduler_stats=outputs.scheduler_stats, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8feeef17542e6..b84f03fa3267c 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,5 +1,8 @@ import time from abc import ABC, abstractmethod +from typing import Dict + +import prometheus_client from vllm.logger import init_logger from vllm.v1.metrics.stats import SchedulerStats @@ -36,3 +39,36 @@ def log(self, scheduler_stats: SchedulerStats): scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) + + +class PrometheusStatLogger(StatLoggerBase): + + def __init__(self, labels: Dict[str, str]): + self.labels = labels + + labelnames = self.labels.keys() + labelvalues = self.labels.values() + + self._unregister_vllm_metrics() + + self.gauge_scheduler_running = prometheus_client.Gauge( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_waiting = prometheus_client.Gauge( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames).labels(*labelvalues) + + def log(self, scheduler_stats: SchedulerStats): + """Log to prometheus.""" + self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + + @staticmethod + def _unregister_vllm_metrics(): + # Unregister any existing vLLM collectors (for CI/CD + for collector in list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + prometheus_client.REGISTRY.unregister(collector) From 3f1fc7425a7db4d9722941075e43bb2ebfb90613 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 27 Jan 2025 09:40:04 -0800 Subject: [PATCH 27/36] [V1][CI/Test] Do basic test for top-p & top-k sampling (#12469) Signed-off-by: Woosuk Kwon --- tests/v1/engine/test_engine_core.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index cccfd305ac604..033bbcfce564e 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -144,7 +144,7 @@ def test_engine_core(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch): """ A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as min_tokens and + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -167,11 +167,23 @@ def test_engine_core_advanced_sampling(monkeypatch): stop_token_ids=[1001, 1002], ) engine_core.add_request(request) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - # Loop through until they are all done. - while len(engine_core.step().outputs) > 0: - pass - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 + def _check_engine_state(): + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + # Loop through until they are all done. + while len(engine_core.step().outputs) > 0: + pass + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + _check_engine_state() + + # Second request. 
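    # (Descriptive note, not part of the diff: the second request below reuses
    # _check_engine_state() but samples with top_p/top_k, which is the
    # basic top-p & top-k sampling coverage this patch adds.)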
+ request2 = make_request() + request2.sampling_params = SamplingParams( + top_p=0.99, + top_k=50, + ) + engine_core.add_request(request2) + _check_engine_state() From 2bc3fbba0cf5b07fabb798d41b153b895d30c7b4 Mon Sep 17 00:00:00 2001 From: Bowen Wang Date: Tue, 28 Jan 2025 02:19:24 +0800 Subject: [PATCH 28/36] [FlashInfer] Upgrade to 0.2.0 (#11194) Signed-off-by: Bowen Wang Signed-off-by: youkaichao Co-authored-by: youkaichao --- .buildkite/test-pipeline.yaml | 11 +- Dockerfile | 23 ++- .../test_basic_correctness.py | 5 +- tests/compile/test_basic_correctness.py | 2 +- tests/kernels/test_flashinfer.py | 74 +++---- vllm/attention/backends/flashinfer.py | 183 ++++++++++++++++-- vllm/config.py | 10 +- vllm/model_executor/model_loader/loader.py | 4 +- .../model_executor/model_loader/tensorizer.py | 3 +- vllm/worker/worker_base.py | 17 +- 10 files changed, 257 insertions(+), 75 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index daec46760117d..d5d02fdeb7f4b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -183,7 +183,16 @@ steps: - vllm/ - tests/v1 commands: - - VLLM_USE_V1=1 pytest -v -s v1 + # split the test to avoid interference + - VLLM_USE_V1=1 pytest -v -s v1/core + - VLLM_USE_V1=1 pytest -v -s v1/engine + - VLLM_USE_V1=1 pytest -v -s v1/sample + - VLLM_USE_V1=1 pytest -v -s v1/worker + - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py + - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - VLLM_USE_V1=1 pytest -v -s v1/e2e - label: Examples Test # 25min working_dir: "/vllm-workspace/examples" diff --git a/Dockerfile b/Dockerfile index cb9cf0da5be65..0b9f74e08dc68 100644 --- a/Dockerfile +++ b/Dockerfile @@ -149,7 +149,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +# TODO: Restore to base image after FlashInfer AOT wheel fixed +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace @@ -194,12 +195,30 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose +# How to build this FlashInfer wheel: +# $ export FLASHINFER_ENABLE_AOT=1 +# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ +# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' +# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive +# $ cd flashinfer +# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4 +# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose + RUN --mount=type=cache,target=/root/.cache/pip \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ + python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ fi COPY examples examples + +# Although we build Flashinfer with AOT mode, there's still +# some issues w.r.t. JIT compilation. 
Therefore we need to +# install build dependencies for JIT compilation. +# TODO: Remove this once FlashInfer AOT wheel is fixed +COPY requirements-build.txt requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -r requirements-build.txt + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 31a101e48e026..23285040642a8 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -61,9 +61,10 @@ def test_models( if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": + if backend in ("XFORMERS", + "FLASHINFER") and model == "google/gemma-2-2b-it": pytest.skip( - "XFORMERS does not support gemma2 with full context length.") + f"{backend} does not support gemma2 with full context length.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 87d5aefea6cb4..1945479fc3031 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -58,7 +58,7 @@ class TestSetting: model_args=["--task", "embed"], pp_size=1, tp_size=1, - attn_backend="FLASHINFER", + attn_backend="FLASH_ATTN", method="encode", fullgraph=True, ), diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index a2c8f71665737..1645ef911d697 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -133,17 +133,19 @@ def test_flashinfer_decode_with_paged_kv( use_tensor_cores=( (num_query_heads//num_kv_heads) > 4) ) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype) - - output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap) + + output = wrapper.run(query, key_value_cache) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -228,7 +230,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -237,12 +239,14 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward( + output = wrapper.run( query, key_value_cache, - logits_soft_cap=soft_cap, ) ref_output = ref_paged_attn(query=query, @@ -253,7 +257,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_tables=block_tables, scale=scale, soft_cap=soft_cap) - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -332,7 +336,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( workspace_buffer 
= torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -341,13 +345,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv( num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) ref_output = ref_paged_attn(query=query, key_cache=key_cache.squeeze(1), @@ -360,7 +363,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( del query del block_tables # verify prefill fp8 - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -439,21 +442,18 @@ def test_flashinfer_decode_with_paged_fp8_kv( wrapper = flashinfer.\ BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype, - q_data_type=dtype) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 3135b0b405343..7cccef9608218 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,3 +1,4 @@ +import dataclasses from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass @@ -13,9 +14,11 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 except ImportError: - BatchDecodeWithPagedKVCacheWrapper = None - CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None - BatchPrefillWithPagedKVCacheWrapper = None + # Avoid turning these types into variables during type checking + if not TYPE_CHECKING: + BatchDecodeWithPagedKVCacheWrapper = None + CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None + BatchPrefillWithPagedKVCacheWrapper = None FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 import torch @@ -30,7 +33,9 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) +from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention +from vllm.config import VllmConfig, get_current_vllm_config from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) @@ -99,6 +104,72 @@ def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") +@dataclass +class PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters. 
+ """ + + window_left: int + logits_soft_cap: Optional[float] + sm_scale: float + + +def get_per_layer_parameters( + vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: + """ + Scan all attention layers and determine some hyperparameters + to use during `plan`. + """ + + layers = vllm_config.compilation_config.static_forward_context + per_layer_params: Dict[str, PerLayerParameters] = {} + + for key, layer in layers.items(): + assert isinstance(layer, Attention) + + impl = layer.impl + assert isinstance(impl, FlashInferImpl) + + # Infer hyperparameters from the attention layer + window_size = impl.sliding_window + window_left = window_size[0] if window_size is not None else -1 + logits_soft_cap = impl.logits_soft_cap + sm_scale = impl.scale + + per_layer_params[key] = PerLayerParameters(window_left, + logits_soft_cap, sm_scale) + + return per_layer_params + + +def infer_global_hyperparameters( + per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters: + - `window_left` + - `logits_soft_cap` + - `sm_scale` + + So this function asserts that all layers share the same values for these + hyperparameters and returns the global values. + """ + + assert len(per_layer_params) > 0, "No attention layers found in the model." + + param_sets = list(per_layer_params.values()) + global_params = param_sets[0] + for params in param_sets: + assert params == global_params, ( + "FlashInfer backend currently only supports models in which all " + "layers share the same values for the following hyperparameters: " + "`window_left`, `logits_soft_cap`, `sm_scale`.") + + return global_params + + class FlashInferState(AttentionState): def __init__(self, runner): @@ -108,6 +179,11 @@ def __init__(self, runner): self._decode_wrapper = None self._prefill_wrapper = None + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def _get_workspace_buffer(self): if self._workspace_buffer is None: self._workspace_buffer = torch.empty( @@ -215,6 +291,9 @@ def graph_capture_get_metadata_for_batch( batch_size + 1, dtype=torch.int32) + global_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, slot_mapping=self._graph_slot_mapping[:batch_size], @@ -238,7 +317,9 @@ def graph_capture_get_metadata_for_batch( q_data_type=self.runner.model_config.dtype, use_cuda_graph=True, decode_wrapper=self._graph_decode_wrapper, - prefill_wrapper=None) + prefill_wrapper=None, + **dataclasses.asdict(global_params), + ) attn_metadata.begin_forward() return attn_metadata @@ -325,9 +406,28 @@ class FlashInferMetadata(AttentionMetadata): data_type: torch.dtype = None # The data type of the query q_data_type: torch.dtype = None - device: torch.device = torch.device("cuda") + # FlashInfer 0.2 encourages passing host tensors + device: torch.device = torch.device("cpu") is_profile_run: bool = False + # The FlashInfer backend currently supports only models in which all layers + # share the same following hyperparameters: + + # The left (inclusive) window size for the attention window, when + # set to `-1`, the window size will be set to the full length of + # the sequence. Defaults to `-1`. 
+ window_left: int = -1 + # The attention logits soft capping value (used in Gemini, Grok and + # Gemma-2, etc.), if not provided, will be set to `0`. If greater + # than 0, the logits will be capped according to formula: + # $$\texttt{logits\_soft\_cap} \times + # \mathrm{tanh}(x / \texttt{logits\_soft\_cap})$$, + # where $x$ is the input logits. + logits_soft_cap: Optional[float] = None + # The scale used in softmax, if not provided, will be set to + # `1.0 / sqrt(head_dim)`. + sm_scale: Optional[float] = None + def __post_init__(self): # Refer to # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157 @@ -363,14 +463,21 @@ def begin_forward(self): self.block_table_bound = self.block_table_bound.to(self.device) self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) self.paged_kv_indices = self.paged_kv_indices.to(self.device) - self.prefill_wrapper.end_forward() - self.prefill_wrapper.begin_forward( + self.prefill_wrapper.plan( self.query_start_loc, self.paged_kv_indptr[:self.num_prefills + 1], self.paged_kv_indices, self.paged_kv_last_page_len[:self.num_prefills], - self.num_qo_heads, self.num_kv_heads, self.head_dim, - self.page_size) + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.data_type) if self.num_decode_tokens > 0: assert self.paged_kv_indices is not None assert self.paged_kv_indptr is not None @@ -386,8 +493,7 @@ def begin_forward(self): self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) assert self.decode_wrapper is not None - self.decode_wrapper.end_forward() - self.decode_wrapper.begin_forward( + self.decode_wrapper.plan( self.paged_kv_indptr[self.num_prefills:], self.paged_kv_indices, self.paged_kv_last_page_len[self.num_prefills:], @@ -397,8 +503,11 @@ def begin_forward(self): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, # kv-cache data type. - data_type=self.data_type, + kv_data_type=self.data_type, # query data type. 
q_data_type=self.q_data_type) @@ -496,6 +605,11 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.sliding_window = input_builder.sliding_window self.block_size = input_builder.block_size + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] @@ -528,6 +642,20 @@ def prepare(self): self.total_blocks = 0 self.is_profile_run: bool = False + if self.global_hyperparameters is None: + # Infer global hyperparameters, since currently we only support + # models in which all layers share the same values for the + # following hyperparameters: + # - `window_left` + # - `logits_soft_cap` + # - `sm_scale` + inferred_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + self.global_hyperparameters = inferred_params + self.window_left = inferred_params.window_left + self.logits_soft_cap = inferred_params.logits_soft_cap + self.sm_scale = inferred_params.sm_scale + def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): @@ -756,7 +884,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], data_type=kv_cache_dtype, q_data_type=self.runner.model_config.dtype, use_cuda_graph=use_captured_graph, - is_profile_run=self.is_profile_run) + is_profile_run=self.is_profile_run, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, + ) class FlashInferImpl(AttentionImpl): @@ -885,25 +1017,34 @@ def forward( else: assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None - prefill_output = prefill_meta.prefill_wrapper.forward( + + assert prefill_meta.prefill_wrapper._causal + assert prefill_meta.prefill_wrapper._window_left == window_left + assert prefill_meta.prefill_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert prefill_meta.prefill_wrapper._sm_scale == softmax_scale + + prefill_output = prefill_meta.prefill_wrapper.run( query, kv_cache, - logits_soft_cap=logits_soft_cap, - causal=True, k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, - window_left=window_left) + ) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None assert decode_meta.decode_wrapper is not None - decode_output = decode_meta.decode_wrapper.forward( + + assert decode_meta.decode_wrapper._window_left == window_left + assert decode_meta.decode_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert decode_meta.decode_wrapper._sm_scale == softmax_scale + + decode_output = decode_meta.decode_wrapper.run( decode_query, kv_cache, - sm_scale=softmax_scale, - logits_soft_cap=logits_soft_cap, k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, - window_left=window_left) + ) if prefill_output is None and decode_output is not None: # Decode only batch. 
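
For orientation, a minimal sketch of the FlashInfer 0.2 decode call pattern that the
hunks above migrate to: plan() replaces begin_forward() and absorbs the per-plan
options (dtypes, logits soft cap, window), while run() replaces forward() and only
takes the tensors that change each step. The helper name, shapes, and dtypes below
are illustrative assumptions, not part of this patch.

    import torch
    import flashinfer

    def flashinfer_decode(query, kv_cache, kv_indptr, kv_indices, kv_last_page_lens):
        # Illustrative hyperparameters only; real values come from the model config.
        num_query_heads, num_kv_heads, head_size, block_size = 32, 8, 128, 16
        workspace = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
        wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(workspace, "NHD")
        # Options that used to be passed to every forward() call are now fixed
        # once at plan() time.
        wrapper.plan(kv_indptr,
                     kv_indices,
                     kv_last_page_lens,
                     num_query_heads,
                     num_kv_heads,
                     head_size,
                     block_size,
                     "NONE",  # positional encoding is applied by vLLM's rope instead
                     q_data_type=torch.float16,
                     kv_data_type=torch.float16,
                     logits_soft_cap=None)
        # run() executes paged attention for the planned batch.
        return wrapper.run(query, kv_cache)
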
diff --git a/vllm/config.py b/vllm/config.py index 7a58d64bcc6e2..dc1d611115489 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -310,14 +310,15 @@ def __init__( (self.hf_text_config.model_type in ["gemma2", "cohere2"])) if (not self.disable_sliding_window and has_interleaved_attention): - if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": + if (backend := + envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " - "XFORMERS backend. Disabling sliding window and capping " + f"{backend} backend. Disabling sliding window and capping " "the max length to the sliding window size " f"({sliding_window_len_min}).") self.disable_sliding_window = True @@ -3310,7 +3311,7 @@ def __str__(self): @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig): +def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): """ Temporarily set the current VLLM config. Used during model initialization. @@ -3330,7 +3331,8 @@ def set_current_vllm_config(vllm_config: VllmConfig): vllm_config.compilation_config.enabled_custom_ops) logger.debug("disabled custom ops: %s", vllm_config.compilation_config.disabled_custom_ops) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + if check_compile and \ + vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ and compilation_counter.num_models_seen == num_models_seen: # If the model supports compilation, # compilation_counter.num_models_seen should be increased diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index e9779878710ee..527b4307f3670 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -114,7 +114,7 @@ def _initialize_model( all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " @@ -142,7 +142,7 @@ def _initialize_model( kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(**kwargs) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 5b4757072353f..e359aef9dcb7f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -288,7 +288,8 @@ def _init_model(self): model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None # TODO: Do we need to consider old-style model class? 
- with no_init_or_tensor(), set_current_vllm_config(self.vllm_config): + with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, + check_compile=True): return self.tensorizer_config.model_class( vllm_config=self.vllm_config, ) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c6e6693c54f57..6eeb4aa17051f 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -8,7 +8,8 @@ import torch import torch.nn as nn -from vllm.config import ObservabilityConfig, VllmConfig +from vllm.config import (ObservabilityConfig, VllmConfig, + set_current_vllm_config) from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -498,8 +499,11 @@ def __init__( group. """ self.rpc_rank = rpc_rank - self.vllm_config = vllm_config self.worker: Optional[WorkerBase] = None + # do not store this `vllm_config`, `init_worker` will set the final + # one. TODO: investigate if we can remove this field in + # `WorkerWrapperBase`, `init_cached_hf_modules` should be + # unnecessary now. if vllm_config.model_config is not None: # it can be None in tests trust_remote_code = vllm_config.model_config.trust_remote_code @@ -533,6 +537,9 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: Arguments are passed to the worker class constructor. """ kwargs = all_kwargs[self.rpc_rank] + self.vllm_config = kwargs.get("vllm_config", None) + assert self.vllm_config is not None, ( + "vllm_config is required to initialize the worker") enable_trace_function_call_for_thread(self.vllm_config) from vllm.plugins import load_general_plugins @@ -546,8 +553,10 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: bytes) worker_class = cloudpickle.loads( self.vllm_config.parallel_config.worker_cls) - self.worker = worker_class(**kwargs) - assert self.worker is not None + with set_current_vllm_config(self.vllm_config): + # To make vLLM config available during worker initialization + self.worker = worker_class(**kwargs) + assert self.worker is not None def execute_method(self, method: Union[str, bytes], *args, **kwargs): try: From 6116ca8cd79b642c64f4ae6f050a6bc12b96d037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Mon, 27 Jan 2025 22:38:35 +0100 Subject: [PATCH 29/36] [Feature] [Spec decode]: Enable MLPSpeculator/Medusa and `prompt_logprobs` with ChunkedPrefill (#10132) Signed-off-by: NickLucche Signed-off-by: wallashss Co-authored-by: wallashss --- tests/spec_decode/e2e/conftest.py | 19 +- .../e2e/test_integration_dist_tp2.py | 10 +- tests/spec_decode/e2e/test_logprobs.py | 16 +- .../e2e/test_medusa_correctness.py | 31 ++- tests/spec_decode/e2e/test_mlp_correctness.py | 53 ++++- .../e2e/test_multistep_correctness.py | 31 +-- .../spec_decode/e2e/test_ngram_correctness.py | 13 +- tests/spec_decode/test_scorer.py | 1 + tests/spec_decode/test_spec_decode_worker.py | 1 + tests/spec_decode/utils.py | 12 + vllm/config.py | 9 +- vllm/engine/llm_engine.py | 19 +- vllm/spec_decode/batch_expansion.py | 133 +++++++---- vllm/spec_decode/interfaces.py | 8 +- vllm/spec_decode/mqa_scorer.py | 68 +++++- vllm/spec_decode/spec_decode_worker.py | 211 +++++++++++++----- 16 files changed, 469 insertions(+), 166 deletions(-) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b9cb3858c0068..5cb982a0811c7 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -2,6 +2,7 @@ from typing import 
List, Optional, Sequence, Tuple, Union import pytest +import torch from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory @@ -154,6 +155,8 @@ def _check_logprobs_when_output_disabled( spec_pos_logprob) = next(iter(spec_pos_logprobs.items())) assert spec_pos_logprob.rank == -1 assert spec_pos_logprob.logprob == 0.0 + if isinstance(spec_pos_logprob_token_id, torch.Tensor): + spec_pos_logprob_token_id = spec_pos_logprob_token_id.item() assert spec_pos_logprob_token_id in baseline_pos_logprobs @@ -244,7 +247,8 @@ def run_equality_correctness_test_tp(model, batch_size: int, max_output_len: int, seed: int = 0, - temperature: float = 0.0): + temperature: float = 0.0, + logprobs: Optional[int] = None): """Helper method that compares the outputs of both the baseline LLM and the test LLM. It asserts greedy equality, e.g. that the outputs are exactly the same when temperature is zero. @@ -257,7 +261,6 @@ def run_equality_correctness_test_tp(model, results = [] prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - for args, env in ((arg1, env1), (arg2, env2)): with RemoteOpenAIServer(model, args, @@ -269,12 +272,14 @@ def run_equality_correctness_test_tp(model, prompt=prompts, max_tokens=max_output_len, seed=seed, - temperature=temperature) + temperature=temperature, + logprobs=logprobs) results.append({ "test": "seeded_sampling", "text": [choice.text for choice in completion.choices], + "logprobs": [choice.logprobs for choice in completion.choices], "finish_reason": [choice.finish_reason for choice in completion.choices], "usage": @@ -284,7 +289,15 @@ def run_equality_correctness_test_tp(model, n = len(results) // 2 arg1_results = results[:n] arg2_results = results[n:] + # Separate logprobs to avoid asserting exact equality. + arg1_logprobs = [r.pop("logprobs") for r in arg1_results] + arg2_logprobs = [r.pop("logprobs") for r in arg2_results] + for arg1_result, arg2_result in zip(arg1_results, arg2_results): assert arg1_result == arg2_result, ( f"Results for {model=} are not the same with {arg1=} and {arg2=}. " f"{arg1_result=} != {arg2_result=}") + if logprobs: + for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs): + for l1, l2 in zip(logs1, logs2): + assert l1.tokens == l2.tokens diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 02cba92795142..7001ee4c007fe 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -2,6 +2,8 @@ tensor parallelism. """ +from typing import Optional + import pytest import torch @@ -154,15 +156,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, "--speculative-draft-tensor-parallel-size", "1", ])]) +@pytest.mark.parametrize("logprobs", [None, 2]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, + logprobs: Optional[int], batch_size: int, seed: int): """Verify spec decode works well with same and different TP size for the draft model with chunked prefill. 
""" + if logprobs: + test_llm_kwargs.extend( + ["--disable_logprobs_during_spec_decoding", "False"]) run_equality_correctness_test_tp(model, common_llm_kwargs, per_test_common_llm_kwargs, @@ -171,4 +178,5 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, batch_size, max_output_len=32, seed=seed, - temperature=0.0) + temperature=0.0, + logprobs=logprobs) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 4cfca8b78e79b..1a543606cb3f3 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,26 +4,27 @@ from vllm import SamplingParams +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @pytest.mark.parametrize( "common_llm_kwargs", [{ - "model_name": "JackFram/llama-68m", + "model_name": "JackFram/llama-160m", # Skip cuda graph recording for fast test. - "enforce_eager": True, + "enforce_eager": True }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": True, }]) @@ -36,12 +37,15 @@ ]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12]) def test_logprobs_equality(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify output logprobs are equal with and without speculative decoding. + seed: int, logprobs: int, prefill_chunk_size: int): + """Verify output logprobs are equal with and without speculative decoding, + as well as with and without chunked prefill. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index b8965606b3d0e..dbcbc0db10881 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -21,6 +21,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -67,12 +68,14 @@ ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -119,12 +122,15 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): + seed: int, logprobs: int, + prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -167,12 +173,14 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_cuda_graph( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with cuda graph enabled and different batch sizes.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -217,13 +225,15 @@ def test_medusa_e2e_greedy_correctness_cuda_graph( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality, even when some sequences are preempted mid- generation. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -267,13 +277,15 @@ def test_medusa_e2e_greedy_correctness_with_preemption( 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -313,14 +325,17 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, + prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -361,12 +376,14 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, prefill_chunk_size: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 183ff2f5db274..1fa1104f5d3a8 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -66,14 +67,16 @@ @pytest.mark.parametrize("output_len", [ 128, ]) -@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("batch_size", [4, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -116,12 +119,19 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, seed: int, - logprobs: int): + logprobs: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + # NOTE Test is sensitive enough st if we don't enable chunked prefill + # scheduling on baseline too, we get slightly different logprobs, ending + # up sampling different tokens at the tail (ie top tokens don't change). + # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected? 
+ maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -162,12 +172,15 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [2048]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): + batch_size: int, output_len: int, + prefill_chunk_size: int, seed: int): """Verify acceptance rate with different batch size and large output length.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -204,13 +217,17 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [64]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("temperature", [1.0]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - temperature: float, seed: int): + temperature: float, + prefill_chunk_size: int, seed: int): """Verify seeded runs produce the same output.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -266,14 +283,16 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, 128, ]) @pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality, even when some sequences are preempted mid- generation. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -317,12 +336,14 @@ def test_mlp_e2e_greedy_correctness_with_preemption( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_correctness_with_padding( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality when the vocab dimension is padded """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) # Default pad_to is 64, test model has vocab_size of 32000 def patched_pad_vocab_size(vocab_size, pad_to=None): @@ -373,14 +394,16 @@ def patched_pad_vocab_size(vocab_size, pad_to=None): # Use smaller output len for fast test. 
32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, - output_len: int): + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -418,15 +441,21 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +# Speculative decoding is disabled when sequences reach decoding and the batch +# consists of single-token requests. Hence we set `max_num_seqs` +# >= `speculative_disable_by_batch_size` to test feature interaction. +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -460,13 +489,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, prefill_chunk_size: int, seed: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index a13cca41f99e5..05ad468dd8bc5 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -147,20 +147,20 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": False, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, -]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + "disable_logprobs_during_spec_decoding": False + }, { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + "disable_logprobs_during_spec_decoding": False + }]) @pytest.mark.parametrize( "output_len", [ @@ -192,6 +192,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( batch_size, max_output_len=output_len, seed=seed, + prompt_logprobs=2, + logprobs=2, + disable_logprobs=False, temperature=0.0, ensure_all_accepted=ensure_all_accepted) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index e53d169a8fcc3..77f8b8998c8d3 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,6 +26,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @@ -49,11 +50,13 @@ "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": False, }, { "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": True, }, ]) @pytest.mark.parametrize("output_len", [ @@ -68,15 +71,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, batch_size: int, output_len: int, prefill_chunk_size: int, seed: int): """Verify greedy equality on a tiny model with different batch size.""" - if prefill_chunk_size > 0: - common_llm_kwargs.update( - **{ - "enable_chunked_prefill": True, - "max_num_batched_tokens": prefill_chunk_size, - "max_num_seqs": prefill_chunk_size - }) - else: - common_llm_kwargs["enable_chunked_prefill"] = False + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 0b1509d8b7785..5a093dea16d40 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -60,6 +60,7 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, num_gpu_blocks = 2048 // block_size scorer_worker = create_worker(Worker, model_name, block_size, num_gpu_blocks, seed) + scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer 
scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor = True scorer_worker.model_runner.model.sampler.\ should_modify_greedy_probs_inplace = True diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index caf7a7e625b46..d8c3af4c1cd1e 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -754,6 +754,7 @@ def test_populate_seq_ids_with_bonus_tokens(): seq_group_metadata_list=seq_group_metadata_list, accepted_token_ids=accepted_token_ids, target_logprobs=target_token_logprobs, + prompt_logprobs=None, k=k, stage_times=(0, 0, 0)) # Verify that _seq_with_bonus_token_in_last_step contains the following: diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index a4bfa6b2f384b..2f883c2ff9b7a 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -274,3 +274,15 @@ def create_batch(batch_size, prompts, num_gpu_blocks, block_size, final_prompt_lens, prev_output_tokens, seq_ids) return seq_group_metadata_list, prompts, prev_output_tokens + + +def maybe_enable_chunked_prefill(prefill_chunk_size, llm_kwargs): + if prefill_chunk_size > 0: + llm_kwargs.update( + **{ + "enable_chunked_prefill": True, + "max_num_batched_tokens": prefill_chunk_size, + "max_num_seqs": prefill_chunk_size + }) + else: + llm_kwargs["enable_chunked_prefill"] = False diff --git a/vllm/config.py b/vllm/config.py index dc1d611115489..7ab632d7e3667 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1685,7 +1685,8 @@ def maybe_create_spec_config( raise ValueError("Expect the batch size threshold of disabling " "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - + if (enable_chunked_prefill and speculative_model == "eagle"): + raise ValueError("Chunked prefill and EAGLE are not compatible.") # TODO: The user should be able to specify revision/max model len # for the draft model. It is not currently supported. draft_revision = None @@ -1752,12 +1753,6 @@ def maybe_create_spec_config( f"num_speculative_tokens={n_predict}, but " f"{num_speculative_tokens=} was provided.") - if enable_chunked_prefill and draft_hf_config.model_type in ( - "medusa", "mlp_speculator", "eagle"): - raise ValueError( - "Chunked prefill and hidden-state based draft models are " - "not compatible.") - speculative_draft_tensor_parallel_size = \ SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size( target_parallel_config, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7da18d5f7d2eb..ab67ae29723cd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1010,8 +1010,23 @@ def _process_model_outputs(self, self.speculative_config # Organize outputs by [step][sequence group] instead of # [sequence group][step]. - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, num_seq_groups=len(seq_group_metadata_list)) + if self.scheduler_config.is_multi_step: + outputs_by_sequence_group = create_output_by_sequence_group( + outputs, len(seq_group_metadata_list)) + elif self.speculative_config: + # Decodes are multi-steps while prefills are not, outputting at + # most 1 token. Separate them so that we can trigger chunk + # processing without having to pad or copy over prompts K times + # to match decodes structure (costly with prompt_logprobs). 
+ num_prefills = sum(sg.is_prompt + for sg in seq_group_metadata_list) + prefills, decodes = outputs[:num_prefills], outputs[ + num_prefills:] + outputs_by_sequence_group = create_output_by_sequence_group( + decodes, + num_seq_groups=len(seq_group_metadata_list) - num_prefills) + outputs_by_sequence_group = [p.outputs for p in prefills + ] + outputs_by_sequence_group # We have outputs for multiple steps submitted in a single burst, # so invalidate is_first_step_output. is_first_step_output = None diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 01b9cdad963da..56fb9ba506a44 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -83,13 +83,13 @@ def score_proposals( if not non_spec_indices: # All sequence groups in batch have spec decoding enabled - contracted = self._contract_batch_all_spec( + return self._contract_batch_all_spec( target_sampler_output=target_sampler_output, proposals=proposals, ) else: # Batch has a mix of spec decode enabled and disabled seq groups - contracted = self._contract_batch( + return self._contract_batch( execute_model_req.seq_group_metadata_list, target_sampler_output=target_sampler_output, proposals=proposals, @@ -99,14 +99,6 @@ def score_proposals( k=execute_model_req.num_lookahead_slots, ) - all_tokens, all_probs, spec_logprobs, all_hidden_states = contracted - return SpeculativeScores( - probs=all_probs, - token_ids=all_tokens, - logprobs=spec_logprobs, - hidden_states=all_hidden_states, - ) - def _expand_batch( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -143,13 +135,57 @@ def _expand_batch( return (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) + def _contract_non_speculative( + self, scores: SpeculativeScores, + seq_group_metadata_list: List[SequenceGroupMetadata], + non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, + has_prompt_log: bool) -> SpeculativeScores: + """ + Augment input `scores` with non-speculative requests outputs. + This includes decode requests with speculation turned off, as well + as prefill requests when `enable_chunked_prefill` is set. + For the latter, prefills are further separated into terminal and + non-terminal chunks (from which no token is sampled). + """ + if not non_spec_indices: + return scores + + if has_prompt_log: + # When prompt_logprobs is enabled, prefills yield output token + # (and respective prob) in the last entry (prompt|out): + # [.|.|.|prefill0_out|.|prefill1_out|decode0_out|..]. + # With chunked prefill, non-terminal chunks have -1 on each + # position: they're still picked, but they're discarded later. + seq_meta = seq_group_metadata_list + nospec_sizes = torch.tensor([ + seq_meta[i].token_chunk_size if seq_meta[i].is_prompt else 1 + for i in non_spec_indices + ]) + nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1) + else: + # In this case only sampled tokens are returned, select all. 
+ nospec_sampled_token_idxs = list( + range(len(non_spec_outputs.token_ids))) + + scores.token_ids[non_spec_indices, :1] = \ + non_spec_outputs.token_ids[nospec_sampled_token_idxs].unsqueeze(1) + scores.probs[non_spec_indices, :1, :] = \ + non_spec_outputs.probs[nospec_sampled_token_idxs].unsqueeze(1) + scores.logprobs[non_spec_indices, :1, :] = \ + non_spec_outputs.logprobs[nospec_sampled_token_idxs].unsqueeze(1) + if scores.hidden_states is not None: + assert non_spec_outputs.hidden_states is not None + scores.hidden_states[non_spec_indices, :1, :] = \ + non_spec_outputs.hidden_states[nospec_sampled_token_idxs].unsqueeze(1) + return scores + def _contract_batch( - self, contracted_seq_group_metadata_list: List[SequenceGroupMetadata], - target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - num_scoring_tokens: int, non_spec_indices: List[int], - spec_indices: List[int], k: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + self, + contracted_seq_group_metadata_list: List[SequenceGroupMetadata], + target_sampler_output: SamplerOutput, + proposals: SpeculativeProposals, num_scoring_tokens: int, + non_spec_indices: List[int], spec_indices: List[int], + k: int) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -195,23 +231,28 @@ def _contract_batch( else: all_hidden_states = None - # Rule out prefills that produce no tokens. - non_spec_indices = [ - idx for idx in non_spec_indices - if contracted_seq_group_metadata_list[idx].do_sample - ] - if len(non_spec_indices): - all_tokens[non_spec_indices, :1] = \ - non_spec_target_token_ids.unsqueeze(1) - all_probs[non_spec_indices, :1, :] = \ - non_spec_target_probs.unsqueeze(1) - all_logprobs[non_spec_indices, :1, :] = \ - non_spec_target_logprobs.unsqueeze(1) - if all_hidden_states is not None: - assert non_spec_target_hidden_states is not None - all_hidden_states[non_spec_indices, :1, :] = \ - non_spec_target_hidden_states.unsqueeze(1) - + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in contracted_seq_group_metadata_list) + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is. + prompt_logprobs = None + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + elif not has_prompt_log: + # When prompt logprobs are not to be returned, + # we can ignore non-terminal chunks (no out token). + non_spec_indices = [ + idx for idx in non_spec_indices + if contracted_seq_group_metadata_list[idx].do_sample + ] + + # "Contract" speculative. 
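# A minimal sketch of the cumsum indexing used above, with hypothetical
# chunk sizes: when prompt logprobs are returned, each non-speculative
# request occupies token_chunk_size rows (1 for a plain decode) of the
# flattened output, and its sampled token sits on the last of them, so
# cumsum(sizes) - 1 recovers one sampled-token row per request.
import torch

nospec_sizes = torch.tensor([4, 3, 1])  # two prefill chunks and one decode
nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1)
assert nospec_sampled_token_idxs.tolist() == [3, 6, 7]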
if spec_indices: all_tokens[spec_indices] = target_token_ids all_probs[spec_indices] = target_probs @@ -219,14 +260,27 @@ def _contract_batch( if all_hidden_states is not None: all_hidden_states[spec_indices] = target_hidden_states - return all_tokens, all_probs, all_logprobs, all_hidden_states + spec_scores = SpeculativeScores(probs=all_probs, + token_ids=all_tokens, + logprobs=all_logprobs, + hidden_states=all_hidden_states, + prompt_logprobs=prompt_logprobs) + + non_spec_outputs = SpeculativeScores( + probs=non_spec_target_probs, + token_ids=non_spec_target_token_ids, + logprobs=non_spec_target_logprobs, + hidden_states=non_spec_target_hidden_states) + # Contract remaining nonspec entries based on non_spec_indices, if any. + return self._contract_non_speculative( + spec_scores, contracted_seq_group_metadata_list, non_spec_indices, + non_spec_outputs, has_prompt_log) def _contract_batch_all_spec( self, target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + ) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -250,8 +304,11 @@ def _contract_batch_all_spec( target_hidden_states = target_hidden_states.reshape( *target_token_ids.shape, target_hidden_states.shape[-1]) - return (target_token_ids, target_probs, target_logprobs, - target_hidden_states) + return SpeculativeScores(probs=target_probs, + token_ids=target_token_ids, + logprobs=target_logprobs, + hidden_states=target_hidden_states, + prompt_logprobs=None) def _create_scoring_model_input( self, diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index a4fe0f13c8db1..c39e98b6cca12 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Set, Union +from typing import List, Optional, Set, Union import torch -from vllm.sequence import ExecuteModelRequest +from vllm.sequence import ExecuteModelRequest, PromptLogprobs from vllm.worker.worker_base import WorkerBase @@ -54,6 +54,10 @@ class SpeculativeScores: # Optional last hidden states from the scoring model. hidden_states: Optional[torch.Tensor] = None + # Scoring model may also return logprobs for prompt tokens + # for each request, when chunked prefill is enabled. + prompt_logprobs: Optional[List[PromptLogprobs]] = None + def __repr__(self): return (f"SpeculativeScores(" f"probs={self.probs.shape}, " diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index cbf793e2043e3..3aea2eabb4144 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -72,9 +72,15 @@ def score_proposals( target_token_ids = target_sampler_output.sampled_token_ids target_probs = target_sampler_output.sampled_token_probs target_logprobs = target_sampler_output.logprobs + prompt_logprobs = None + # If all requests have the same number of query tokens, we can avoid # the for loop to build output for better performance. if min(all_proposal_lengths) == k: + # Regular decodes only. 
+ assert all(not sg.is_prompt + for sg in target_seq_group_metadata_list + if sg.is_prompt) bs, _ = proposals.proposal_token_ids.shape all_tokens = target_token_ids.reshape(bs, k + 1) all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) @@ -88,19 +94,56 @@ def score_proposals( all_logprobs = target_logprobs.new_full(size=all_probs.shape, fill_value=-float("inf")) target_token_ids = target_token_ids.flatten() - start_loc = 0 - for i, (proposed_len, seq_meta) in enumerate( - zip(all_proposal_lengths, target_seq_group_metadata_list)): + + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is, since it may be + # that n_prompts >> K. + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in target_seq_group_metadata_list) + # TODO (NickLucche) we should surface `disable_logprobs` as to not + # break abstraction to get its value. + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + + # Split loop into prefill|decode for readability. + start_loc, i = 0, 0 + while i < len(target_seq_group_metadata_list + ) and target_seq_group_metadata_list[i].is_prompt: + seq_meta = target_seq_group_metadata_list[i] + end_loc = start_loc + if has_prompt_log: + end_loc += seq_meta.token_chunk_size + elif seq_meta.do_sample: + end_loc += 1 + # Skip chunks with no output tokens. if seq_meta.do_sample: - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc + # Get sampled token (last position in chunk) and its prob. + all_tokens[i, 0] = target_token_ids[end_loc - 1] + all_probs[i, 0] = target_probs[end_loc - 1] + all_logprobs[i, 0] = target_logprobs[end_loc - 1] + + i += 1 + start_loc = end_loc + # Decodes. 
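# A minimal sketch, with hypothetical sizes, of how the flattened target
# output is walked by the prefill loop above and the decode loop below:
# with prompt logprobs on, a prefill chunk advances the cursor by its
# token_chunk_size and keeps at most its last row (terminal chunks only),
# while a decode advances it by proposed_len + 1 and keeps every row.
requests = [("prefill", 4, True),    # terminal chunk: sample last row
            ("prefill", 3, False),   # non-terminal chunk: rows, no sample
            ("decode", 3, True),     # proposed_len = 2 -> 3 scored rows
            ("decode", 3, True)]
start = 0
for kind, rows, do_sample in requests:
    end = start + rows
    if kind == "prefill":
        kept = [end - 1] if do_sample else []
    else:
        kept = list(range(start, end))
    print(f"{kind}: rows [{start}, {end}), kept {kept}")
    start = end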
+ while i < len(target_seq_group_metadata_list): + proposed_len, seq_meta = all_proposal_lengths[ + i], target_seq_group_metadata_list[i] + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc + i += 1 hidden_states = None if target_sampler_output.hidden_states is not None: @@ -110,4 +153,5 @@ def score_proposals( return SpeculativeScores(probs=all_probs, token_ids=all_tokens, logprobs=all_logprobs, - hidden_states=hidden_states) + hidden_states=hidden_states, + prompt_logprobs=prompt_logprobs) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 0d66ede3d907a..8e9802c7d333c 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -563,50 +563,57 @@ def _serialize_sampler_output_no_logprobs( (seq_id, seq_data) for sg in \ execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() - if sg.do_sample # ignore empty token sequences ] completion_seq_group_output_list: List[ CompletionSequenceGroupOutput] = [] output_index = 0 # Make sure the non-terminal prefill chunks are still aligned with # their own empty output. - for seq_group_meta in execute_model_req.seq_group_metadata_list: - # Since we can get chunks here, we dont always have a sampled token - # (only on last chunk) but we still have to provide an output. - if not seq_group_meta.do_sample: - completion_seq_group_output_list.append( - CompletionSequenceGroupOutput(samples=[], - prompt_logprobs=None)) - else: - # Sequence with output. - seq_id, seq_data = seq_data_entries[output_index] - needs_prompt_logprobs = seq_output_prompt_logprobs[ - output_index] - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) - # no prompt logprobs for the first token - for p_token_id in prompt_token_ids[1:] - ] - else: - prompt_logprobs = None - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[output_index][0], + for idx, seq_group_meta in enumerate( + execute_model_req.seq_group_metadata_list): + needs_prompt_logprobs = seq_output_prompt_logprobs[idx] + seq_id, seq_data = seq_data_entries[idx] + if needs_prompt_logprobs: + prompt_token_ids = seq_data.get_prompt_token_ids() + + # Some of these sequences may belong to non-terminal chunks, + # which may still have to report logprobs for prompts. + start = 1 if seq_data._num_computed_tokens == 0 \ + else seq_data._num_computed_tokens + end = (seq_data._num_computed_tokens + \ + seq_group_meta.token_chunk_size) + prompt_token_ids = prompt_token_ids[start:end] + prompt_logprobs = [ + create_logprobs_output( + token_id=p_token_id, token_id_logprob_rank=-1, token_id_logprob=0.0, - seq_id=seq_id, topk_token_ids=[], topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - output_index += 1 + ) for p_token_id in prompt_token_ids + ] + else: + prompt_logprobs = None + + # Since we can get chunks here, we dont always have a sampled token + # (only on last chunk) but we still have to provide an output. 
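# A minimal sketch, with hypothetical token ids, of the prompt-chunk
# slicing above: the first chunk skips position 0 (there is no logprob for
# the very first prompt token), later chunks start at the number of tokens
# already computed, and every chunk ends at computed + token_chunk_size.
prompt_token_ids = [100, 101, 102, 103, 104, 105, 106, 107]
token_chunk_size = 4

num_computed = 0  # first chunk
start = 1 if num_computed == 0 else num_computed
assert prompt_token_ids[start:num_computed + token_chunk_size] == \
    [101, 102, 103]

num_computed = 4  # second (terminal) chunk
start = 1 if num_computed == 0 else num_computed
assert prompt_token_ids[start:num_computed + token_chunk_size] == \
    [104, 105, 106, 107]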
+ if not seq_group_meta.do_sample: + completion_seq_group_output_list.append( + CompletionSequenceGroupOutput( + samples=[], prompt_logprobs=prompt_logprobs)) + continue + + # Sequence with output. + completion_seq_group_output_list.append( + create_sequence_group_output( + token_id=sampled_token_ids_list[output_index][0], + token_id_logprob_rank=-1, + token_id_logprob=0.0, + seq_id=seq_id, + topk_token_ids=[], + topk_logprobs=[], + prompt_logprobs=prompt_logprobs)) + output_index += 1 return [SamplerOutput(outputs=completion_seq_group_output_list)] @@ -624,24 +631,27 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, assert len(sampler_output) == 1 sampler_output = sampler_output[0] - # Store hidden states from target model execution. + # Store hidden states from target model execution, BxD. hidden_states = sampler_output.hidden_states if hidden_states is not None: - # remove hidden_states for prompt tokens - # TODO Enable `return_hidden_states`: prefill chunks hidden states - # are pruned by the logits processor. Also, they should be arranged - # back into full-prefill latent. Address it to enable MLPSpeculator. - if any(seq.is_prompt - for seq in execute_model_req.seq_group_metadata_list): + # Only decodes and prefill terminal chunks need a hidden state. + seq_group_meta_with_hidden = [ + sg for sg in execute_model_req.seq_group_metadata_list + if sg.do_sample + ] + if any(seq.is_prompt for seq in seq_group_meta_with_hidden): + # Drop hidden_states with no prediction (eg non-terminal chunks) hidden_states = hidden_states[ torch.where(sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID)[0]] - if self.previous_hidden_states is None: + if self.previous_hidden_states is None and len( + seq_group_meta_with_hidden): self.previous_hidden_states = HiddenStates( - hidden_states, execute_model_req.seq_group_metadata_list) - else: - self.previous_hidden_states.update( - hidden_states, execute_model_req.seq_group_metadata_list) + hidden_states, seq_group_meta_with_hidden) + elif self.previous_hidden_states and len( + seq_group_meta_with_hidden): + self.previous_hidden_states.update(hidden_states, + seq_group_meta_with_hidden) if not skip_proposer: # We prepare the prefill hidden states here so that there no @@ -752,13 +762,13 @@ def _run_speculative_decoding_step( ] if len(non_spec_indices): all_hidden_states = proposal_scores.hidden_states - # TODO fix `return_hidden_states`, same as in `_run_no_spec` if all_hidden_states is not None: prefill_hidden_states = all_hidden_states[non_spec_indices] execute_model_req.previous_hidden_states = \ prepare_prefill_hidden_states(prefill_hidden_states) # Sync proposer KV cache for prefills. prefill_req = execute_model_req.clone(non_spec_seqs) + # TODO avoid sampling here? self.proposer_worker.execute_model(prefill_req) with Timer() as verification_timer: @@ -774,6 +784,8 @@ def _run_speculative_decoding_step( execute_model_req.seq_group_metadata_list, accepted_token_ids, target_logprobs=target_logprobs, + prompt_logprobs=proposal_scores.prompt_logprobs + if not self._disable_logprobs else None, k=execute_model_req.num_lookahead_slots, stage_times=stage_times) @@ -845,19 +857,32 @@ def _verify_tokens( # metadata. 
accepted_token_ids[original_indices] = accepted_token_ids.clone() + # B x K+1 x D hidden_states = proposal_scores.hidden_states if hidden_states is not None: + # Only get terminal hidden states for next step + terminal_metadata = [ + sg for sg in seq_group_metadata_list if sg.do_sample + ] + # Contract hidden states based on accepted tokens hs_size = hidden_states.shape[-1] - accepted_index = accepted_token_ids + 1 # Convert -1 to 0 - accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) - index = accepted_index[:, None, None].expand(-1, 1, hs_size) + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b + # Drop non-terminal prefill chunks hidden states. + hidden_states = hidden_states[ + accepted_index != VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[ + accepted_index != VLLM_INVALID_TOKEN_ID] + assert len(accepted_index) == hidden_states.shape[0] == len( + terminal_metadata) + index = accepted_index[:, None, None].expand(-1, 1, + hs_size) # b x 1 x d second_last_token_hidden_states = hidden_states[:, -2] # b x d hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d # Store hidden states from target model for subsequent decode step self.previous_hidden_states = HiddenStates( - hidden_states, seq_group_metadata_list, + hidden_states, terminal_metadata, second_last_token_hidden_states) return accepted_token_ids, logprobs @@ -866,6 +891,8 @@ def _create_output_sampler_list( seq_group_metadata_list: List[SequenceGroupMetadata], accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] + prompt_logprobs: Optional[ + torch.Tensor], # shape: [nprompt_tokens, vocab_size] k: int, stage_times: Tuple[float, float, float], ) -> List[SamplerOutput]: @@ -909,15 +936,89 @@ def _create_output_sampler_list( # Construct the output on a per-step, per-sequence basis. # Non-terminal prefill chunks will end up here as rows with just -1s - # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] + # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while + # terminal chunks will only have one generated token at time 0. sampler_output_list: List[SamplerOutput] = [] + + # Prefills are not multi-step (return at most 1 token), in order to + # avoid padding or repetition to fit decodes, we separate them. + for i, sg in enumerate(seq_group_metadata_list): + if not sg.is_prompt: + # Requests are ordered as prefills|decodes=>no more prefills. + break + num_logprobs = num_logprobs_per_seq[i] + seq_kwargs = dict(token_id=-1, + token_id_logprob_rank=0, + token_id_logprob=-float('inf'), + topk_token_ids=[-1] * num_logprobs, + topk_logprobs=[-float('inf')] * num_logprobs, + seq_id=seq_ids[i]) + # Terminal chunk, has token. + if sg.do_sample: + seq_kwargs.update( + dict( + token_id=accepted_token_ids[i][0].item(), + token_id_logprob_rank=accepted_token_id_ranks_by_step[ + 0][i], + token_id_logprob=accepted_token_id_logprobs_by_step[0] + [i], + topk_token_ids=topk_indices_by_step[0][i] + [:num_logprobs], + # output only so step is 0 + topk_logprobs=topk_logprobs_by_step[0][i] + [:num_logprobs], + )) + needs_plogs = (sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + plogs = None + if prompt_logprobs is not None: + # Even non-terminal prompt chunks can have logprobs here. + plogs = prompt_logprobs[i] + elif needs_plogs: + # Prompt logprobs are requested but `_disable_logprobs` is set. 
+ seq_data = next(iter(sg.seq_data.values())) + # Get only the tokens in this chunk! + prompt_token_ids = seq_data.get_prompt_token_ids() + prompt_token_ids = prompt_token_ids[ + seq_data. + _num_computed_tokens:seq_data._num_computed_tokens + + sg.token_chunk_size] + + is_first_chunk = seq_data._num_computed_tokens == 0 + # There's no prob generated for the first token in a sequence. + if is_first_chunk: + prompt_token_ids = prompt_token_ids[1:] + plogs = [ + create_logprobs_output( + token_id=p_token_id, + token_id_logprob_rank=-1, + token_id_logprob=0.0, + topk_token_ids=[], + topk_logprobs=[], + ) for p_token_id in prompt_token_ids + ] + seq_kwargs.update(dict(prompt_logprobs=plogs)) + + sampler_output_list.append( + SamplerOutput( + outputs=[create_sequence_group_output( + **seq_kwargs)])) # type: ignore + + # Decodes, create one SamplerOutput per-step (at most K+1). for step_index in range(num_steps): - if all(token_id == -1 - for token_id in accepted_token_ids_by_step[step_index]): + if all(token_id == -1 for sg, token_id in zip( + seq_group_metadata_list, + accepted_token_ids_by_step[step_index]) + if not sg.is_prompt): break step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): + seq_meta = seq_group_metadata_list[sequence_index] + # Prompts already processed above. + if seq_meta.is_prompt: + continue + # Each sequence may have a different num_logprobs; retrieve it. num_logprobs = num_logprobs_per_seq[sequence_index] step_output_token_ids.append( @@ -952,6 +1053,8 @@ def _create_output_sampler_list( # This is periodic because the rejection sampler emits metrics # periodically. self._maybe_log_stage_times(*stage_times) + # First `n_prefills` entries will contain prefills SamplerOutput when + # chunked prefill is enabled, the rest is decodes in multi-step format. 
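# A minimal sketch of that ordering for a hypothetical batch of two prefill
# chunks and two decodes with k = 2:
#   sampler_output_list == [
#       SamplerOutput(outputs=[prefill_0]),           # one entry per prefill
#       SamplerOutput(outputs=[prefill_1]),
#       SamplerOutput(outputs=[decode_0, decode_1]),  # decode step 0
#       SamplerOutput(outputs=[decode_0, decode_1]),  # decode step 1
#       SamplerOutput(outputs=[decode_0, decode_1]),  # decode step 2 (at most k + 1 steps)
#   ]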
return sampler_output_list def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, From 823ab796330825f4052d771e2c462ad3b55236eb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:23:08 +0000 Subject: [PATCH 30/36] Update `pre-commit` hooks (#12475) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 10 +- benchmarks/benchmark_serving.py | 4 +- csrc/custom_all_reduce.cuh | 8 +- csrc/moe/marlin_kernels/marlin_moe_kernel.h | 8 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 16 +-- .../marlin/dense/marlin_cuda_kernel.cu | 4 +- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 4 +- csrc/quantization/marlin/sparse/common/mma.h | 4 +- csrc/rocm/attention.cu | 4 +- setup.py | 2 +- tests/kernels/test_block_fp8.py | 25 ++-- tests/kv_transfer/test_lookup_buffer.py | 10 +- tests/lora/test_qwen2vl.py | 6 +- .../vision_language/test_models.py | 130 ++++++++++-------- .../vision_language/test_pixtral.py | 17 ++- tests/quantization/test_compressed_tensors.py | 6 +- tests/samplers/test_rejection_sampler.py | 15 +- tools/report_build_time_ninja.py | 5 +- vllm/_custom_ops.py | 4 +- vllm/attention/ops/prefix_prefill.py | 28 ++-- vllm/attention/ops/triton_flash_attention.py | 4 +- vllm/attention/selector.py | 4 +- vllm/config.py | 7 +- vllm/core/block/common.py | 7 +- vllm/core/block_manager.py | 4 +- vllm/core/scheduler.py | 23 ++-- .../device_communicators/shm_broadcast.py | 8 +- vllm/distributed/parallel_state.py | 8 +- vllm/entrypoints/chat_utils.py | 4 +- vllm/entrypoints/openai/serving_completion.py | 9 +- .../granite_20b_fc_tool_parser.py | 4 +- vllm/lora/layers.py | 12 +- vllm/lora/models.py | 5 +- vllm/lora/ops/triton_ops/sgmv_expand.py | 5 +- vllm/lora/ops/triton_ops/sgmv_shrink.py | 4 +- .../kernels/mixed_precision/MPLinearKernel.py | 12 +- .../kernels/scaled_mm/ScaledMMLinearKernel.py | 14 +- .../layers/quantization/utils/fp8_utils.py | 7 +- .../layers/quantization/utils/w8a8_utils.py | 4 +- vllm/model_executor/layers/sampler.py | 7 +- .../layers/vocab_parallel_embedding.py | 16 +-- vllm/model_executor/model_loader/loader.py | 5 +- .../model_executor/model_loader/tensorizer.py | 4 +- vllm/model_executor/models/gemma.py | 4 +- vllm/model_executor/models/granitemoe.py | 6 +- vllm/model_executor/models/mllama.py | 4 +- vllm/model_executor/models/mlp_speculator.py | 4 +- vllm/model_executor/models/phimoe.py | 8 +- vllm/model_executor/models/registry.py | 3 +- vllm/model_executor/models/ultravox.py | 8 +- vllm/model_executor/models/utils.py | 5 +- vllm/model_executor/sampling_metadata.py | 11 +- vllm/platforms/neuron.py | 4 +- vllm/scalar_type.py | 4 +- vllm/spec_decode/spec_decode_worker.py | 4 +- vllm/spec_decode/top1_proposer.py | 10 +- vllm/spec_decode/util.py | 12 +- vllm/transformers_utils/configs/nemotron.py | 4 +- vllm/utils.py | 10 +- vllm/v1/core/scheduler.py | 4 +- vllm/v1/stats/common.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/worker/hpu_worker.py | 8 +- vllm/worker/tpu_model_runner.py | 4 +- 64 files changed, 322 insertions(+), 288 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 432bf5ed18dbc..7b32df90bfd8b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,18 +3,18 @@ default_stages: - manual # Run in CI repos: - repo: https://github.com/google/yapf - rev: v0.32.0 + rev: v0.43.0 hooks: - id: yapf args: [--in-place, --verbose] additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - 
repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.5 + rev: v0.9.3 hooks: - id: ruff args: [--output-format, github] - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.0 hooks: - id: codespell exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' @@ -23,7 +23,7 @@ repos: hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.5 + rev: v19.1.7 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' @@ -35,7 +35,7 @@ repos: - id: pymarkdown files: docs/.* - repo: https://github.com/rhysd/actionlint - rev: v1.7.6 + rev: v1.7.7 hooks: - id: actionlint - repo: local diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 63d2c3f7c7dd9..8b3212831e7e0 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -926,8 +926,8 @@ def main(args: argparse.Namespace): ) # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf") + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 6be4d4f2b2eb8..b9df4ed160b03 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -38,9 +38,13 @@ struct Signal { alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; }; -struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; +struct __align__(16) RankData { + const void* __restrict__ ptrs[8]; +}; -struct __align__(16) RankSignals { Signal* signals[8]; }; +struct __align__(16) RankSignals { + Signal* signals[8]; +}; // like std::array, but aligned template diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index a217401b3d7c2..47ecf109d0f53 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 04ef842fbdf95..7c33fea93d6ae 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index c03fef886e4db..4db8f5dcdabf6 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index 103a6444f3a21..048a3f736fb71 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. 
static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index b26505f771c8b..49eee4128ee7c 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 9477790629c9f..ffa9d44610a7f 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] - const int max_num_partitions){UNREACHABLE_CODE} + const int max_num_partitions) { + UNREACHABLE_CODE +} #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support diff --git a/setup.py b/setup.py index ee193e4693806..59ece870b5585 100755 --- a/setup.py +++ b/setup.py @@ -417,7 +417,7 @@ def get_rocm_version(): if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)) == 0): - return "%d.%d.%d" % (major.value, minor.value, patch.value) + return f"{major.value}.{minor.value}.{patch.value}" return None except Exception: return None diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py index a16cc4582a180..f28fdf3feedbc 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/test_block_fp8.py @@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A, A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) ] B_tiles = [[ - B[j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) ] for j in range(n_tiles)] C_tiles = [ C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) @@ -157,9 +159,9 @@ def setup_cuda(): torch.set_default_device("cuda") -@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, - SEEDS)) +@pytest.mark.parametrize( + "num_tokens,d,dtype,group_size,seed", + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) @torch.inference_mode() def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): torch.manual_seed(seed) @@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): assert torch.allclose(scale, ref_scale) -@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, - SEEDS)) +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) @@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 
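# A minimal sketch, with hypothetical shapes, of the (block_n, block_k)
# tiling the reference block-FP8 matmul above iterates over; edge tiles are
# clipped to the matrix bounds exactly as in the min(...) expressions.
N, K, block_n, block_k = 6, 5, 4, 2
n_tiles = (N + block_n - 1) // block_n   # 2
k_tiles = (K + block_k - 1) // block_k   # 3
tiles = [[(j * block_n, min((j + 1) * block_n, N),
           i * block_k, min((i + 1) * block_k, K))
          for i in range(k_tiles)] for j in range(n_tiles)]
assert tiles[1][2] == (4, 6, 4, 5)  # last tile clipped on both axes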
-@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed", - itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, - BLOCK_SIZE, DTYPES, SEEDS)) +@pytest.mark.parametrize( + "M,N,K,E,topk,block_size,dtype,seed", + itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, + SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): torch.manual_seed(seed) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 718730bb8cbbe..4d6890305af73 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, device: %s" % (my_rank, device)) + print(f"My rank: {my_rank}, device: {device}") # insert tokens = torch.tensor([1, 2, 3]).to(device) @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, Test run passed!" % (my_rank)) + print(f"My rank: {my_rank}, Test run passed!") def stress_test(my_rank, buf, device): @@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device): assert torch.allclose(k, k_) assert torch.allclose(v, v_) assert torch.allclose(h, h_) - print('Rank %d done' % my_rank) + print(f"Rank {my_rank} done") torch.distributed.barrier() if my_rank == 0: @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): else: torch.distributed.send(torch.tensor([n]), 0) - print("My rank: %d, Passed stress test!" % (my_rank)) + print(f"My rank: {my_rank}, Passed stress test!") if __name__ == "__main__": @@ -122,7 +122,7 @@ def stress_test(my_rank, buf, device): rank=my_rank, ) - print("initialized! My rank is %d" % my_rank) + print(f"initialized! 
My rank is {my_rank}") config = KVTransferConfig( kv_connector='PyNcclConnector', diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index ebdd129db5f6a..570aa3861d0be 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm" - ) +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm") def test_qwen2vl_lora(qwen2vl_lora_files): llm = vllm.LLM( MODEL_PATH, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 14d9a739be318..d5f0d63288cc1 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -521,12 +521,13 @@ def _mark_splits( # - image embeddings # - video # - custom inputs -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=False, + )) def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=False, + )) def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=False, + )) def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=False, + )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], video_assets: _VideoAssets): @@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=False, - )) 
+@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=False, + )) def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, @@ -627,12 +632,13 @@ def test_custom_inputs_models( #### Tests filtering for things running each test as a new process -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, @@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=True, + )) def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], @@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_custom_inputs_models_heavy( model_type: str, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 90c0fab99054c..8103e5305b91b 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs( outputs: OutputsLogprobs, 
filename: "StrPath", ) -> None: - json_data = [(tokens, text, - [{k: asdict(v) - for k, v in token_logprobs.items()} - for token_logprobs in (logprobs or [])]) + json_data = [(tokens, text, [{ + k: asdict(v) + for k, v in token_logprobs.items() + } for token_logprobs in (logprobs or [])]) for tokens, text, logprobs in outputs] with open(filename, "w") as f: @@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: with open(filename, "rb") as f: json_data = json.load(f) - return [(tokens, text, - [{int(k): Logprob(**v) - for k, v in token_logprobs.items()} - for token_logprobs in logprobs]) - for tokens, text, logprobs in json_data] + return [(tokens, text, [{ + int(k): Logprob(**v) + for k, v in token_logprobs.items() + } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data] @large_gpu_test(min_gb=80) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index bf0d454ad511c..1072697ecf5cc 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -314,9 +314,9 @@ def check_model(model): @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="2of4 Sparse is not yet supported on this GPU type." - ) +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="2of4 Sparse is not yet supported on this GPU type.") @pytest.mark.parametrize( "args_2of4", [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 397fa2cc85821..dcb1b27bff37f 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -23,16 +23,17 @@ def mock_causal_accepted_tensor( """ batch_size = last_accepted_indices.shape[0] - accepted = (torch.arange(k).expand(batch_size, k) <= - last_accepted_indices.unsqueeze(-1).broadcast_to( + accepted = (torch.arange(k).expand(batch_size, k) + <= last_accepted_indices.unsqueeze(-1).broadcast_to( batch_size, k)) # Sprinkle accepted values after the contiguous initial accepted values. # This replicates the behavior of rejection sampling, which may "accept" # a token that cannot be accepted because of causality. 
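# A minimal sketch, with hypothetical indices, of the causal acceptance
# mask built above: position j in a row is accepted exactly when
# j <= that row's last accepted index.
import torch

k = 5
last_accepted_indices = torch.tensor([2, 0])
accepted = (torch.arange(k).expand(2, k)
            <= last_accepted_indices.unsqueeze(-1).broadcast_to(2, k))
assert accepted.tolist() == [[True, True, True, False, False],
                             [True, False, False, False, False]]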
- sprinkle_candidates = ( - torch.arange(k).expand(batch_size, k) > - last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1) + sprinkle_candidates = (torch.arange(k).expand( + batch_size, + k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + + 1) sprinkle = torch.rand(batch_size, k) > 0.5 accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] return accepted @@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution( distance_wrt_reference) expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target > - relative_change_in_distance_wrt_reference * + assert (relative_change_in_distance_wrt_target + > relative_change_in_distance_wrt_reference * expected_improvement_multiplier) diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 51ad2adc74fe1..9dc19f5fd4cdd 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types): print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' 'parallelism)'.format(length, total_cpu_time, total_cpu_time * 1.0 / length)) - print(' %d build steps completed, average of %1.2f/s' % - (len(entries), len(entries) / (length))) + print(' {} build steps completed, average of {:1.2f}/s'.format( + len(entries), + len(entries) / (length))) def main(): diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 440bc52012ab7..85c1121ed6ff8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -820,8 +820,8 @@ def scaled_int8_quant( if scale is not None: # static-per-tensor quantization. assert symmetric == ( - azp is - None), "azp must only be provided for asymmetric quantization." + azp + is None), "azp must only be provided for asymmetric quantization." 
torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) return output, scale, azp diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index e2f2b66dfc90c..ec3c8459c43ef 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -219,8 +219,8 @@ def _fwd_kernel( float("-inf")) if SLIDING_WINDOW > 0: qk = tl.where( - offs_m[:, None] - - (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000) + offs_m[:, None] - (start_n + offs_n[None, :]) + < SLIDING_WINDOW, qk, -10000) # -- compute m_ij, p, l_ij m_ij = tl.max(qk, 1) @@ -324,10 +324,10 @@ def _fwd_kernel_flash_attn_v2( (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d[None, :] * stride_qd) - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) + q = tl.load(Q + off_q, + mask=offs_m[:, None] + < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) # # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") @@ -402,8 +402,8 @@ def _fwd_kernel_flash_attn_v2( # -- compute qk ---- k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -430,8 +430,8 @@ def _fwd_kernel_flash_attn_v2( # update acc v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) p = p.to(v.dtype) @@ -639,8 +639,8 @@ def _fwd_kernel_alibi( k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -677,8 +677,8 @@ def _fwd_kernel_alibi( v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) p = p.to(v.dtype) diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index f94211116a746..ef04603f22b6e 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -627,8 +627,8 @@ def attn_fwd( causal_start_idx, dtype=tl.int32) mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = (mask_m_offsets[:, None] >= - out_mask_boundary[None, :]) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) z = 0.0 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) # write back LSE diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 81ea6eefb5410..1376274d57777 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,6 +1,6 @@ import os from contextlib import contextmanager -from functools import lru_cache +from functools import cache from typing import Generator, Optional, Type import torch @@ -100,7 +100,7 @@ def get_attn_backend( ) -@lru_cache(maxsize=None) +@cache def _cached_get_attn_backend( head_size: int, dtype: 
torch.dtype, diff --git a/vllm/config.py b/vllm/config.py index 7ab632d7e3667..d7c9311ae3cb0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -67,7 +67,8 @@ _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { task: runner - for runner, tasks in _RUNNER_TASKS.items() for task in tasks + for runner, tasks in _RUNNER_TASKS.items() + for task in tasks } HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], @@ -1976,8 +1977,8 @@ def _verify_args(self) -> None: "typical_acceptance_sampler.") if (self.draft_token_acceptance_method != 'rejection_sampler' - and self.draft_token_acceptance_method != - 'typical_acceptance_sampler'): + and self.draft_token_acceptance_method + != 'typical_acceptance_sampler'): raise ValueError( "Expected draft_token_acceptance_method to be either " "rejection_sampler or typical_acceptance_sampler. Instead it " diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index c03b5932eafb6..115f663e4ad34 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol): def __init__(self, all_block_indices: Iterable[BlockId]): deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, - RefCount] = {index: 0 - for index in deduped} + self._refcounts: Dict[BlockId, RefCount] = { + index: 0 + for index in deduped + } def incr(self, block_id: BlockId) -> RefCount: assert block_id in self._refcounts diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 62a5f0bda061a..2d6a132ed555b 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -136,8 +136,8 @@ def can_allocate(self, device=Device.GPU) # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks < - self.watermark_blocks): + if (self.num_total_gpu_blocks - num_required_blocks + < self.watermark_blocks): return AllocStatus.NEVER if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: return AllocStatus.OK diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b1630b34947bd..2bb961481e5fe 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -988,8 +988,8 @@ def _schedule_prefills( waiting_queue.popleft() continue - if (budget.num_batched_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (budget.num_batched_tokens + >= self.scheduler_config.max_num_batched_tokens): # We've reached the budget limit - since there might be # continuous prefills in the running queue, we should break # to avoid scheduling any new prefills. @@ -1096,8 +1096,8 @@ def _schedule_default(self) -> SchedulerOutputs: running_scheduled.swapped_out) == 0: swapped_in = self._schedule_swapped(budget, curr_loras) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1189,8 +1189,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: curr_loras, enable_chunking=True) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1358,8 +1358,8 @@ def schedule( # NOTE: We use get_len instead of get_prompt_len because when # a sequence is preempted, prefill includes previous generated # output tokens. 
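# A minimal sketch, with hypothetical lengths, of the do_sample condition
# in this hunk: a chunk samples a token only once it reaches the full
# sequence length (prompt plus any tokens generated before a preemption).
seq_len = 10  # seq.data.get_len()
num_computed_tokens, token_chunk_size = 4, 4
do_sample = not (token_chunk_size + num_computed_tokens < seq_len)
assert do_sample is False  # 8 < 10: intermediate chunk

num_computed_tokens, token_chunk_size = 8, 2
do_sample = not (token_chunk_size + num_computed_tokens < seq_len)
assert do_sample is True  # 10 == 10: terminal chunk samples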
- if (token_chunk_size + num_computed_tokens < - seqs[0].data.get_len()): + if (token_chunk_size + num_computed_tokens + < seqs[0].data.get_len()): do_sample = False # It assumes the scheduled_seq_groups is ordered by @@ -1625,10 +1625,9 @@ def _passed_delay(self, now: float) -> bool: if self.scheduler_config.delay_factor > 0 and self.waiting: earliest_arrival_time = min( [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ( - (now - earliest_arrival_time) > - (self.scheduler_config.delay_factor * self.last_prompt_latency) - or not self.running) + passed_delay = ((now - earliest_arrival_time) + > (self.scheduler_config.delay_factor * + self.last_prompt_latency) or not self.running) else: passed_delay = True return passed_delay diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 4ced991f62f66..268edc0925fe8 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -352,8 +352,8 @@ def acquire_write(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. ", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 @@ -410,8 +410,8 @@ def acquire_read(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. 
", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ffdf8b0f48087..7fe9b68d4b9e8 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1014,8 +1014,8 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size + != tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " @@ -1069,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: return if all([ - vllm_config.kv_transfer_config.need_kv_parallel_group, - _KV_TRANSFER is None + vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER + is None ]): _KV_TRANSFER = kv_transfer.KVTransferAgent( rank=get_world_group().rank, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index beedf5d16ab86..723d6e9085806 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -3,7 +3,7 @@ import json from abc import ABC, abstractmethod from collections import defaultdict, deque -from functools import lru_cache, partial +from functools import cache, lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypeVar, Union, cast) @@ -377,7 +377,7 @@ def allowed_local_media_path(self): return self._model_config.allowed_local_media_path @staticmethod - @lru_cache(maxsize=None) + @cache def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: return tokenizer.decode(token_index) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2c9c20caf8119..b0179f78bd635 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -522,11 +522,10 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token( - top_lp[1], - top_lp[0], - tokenizer, - return_as_token_id=self.return_tokens_as_token_ids): + self._get_decoded_token(top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 94db8f379e33a..93e357e8b9f21 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -62,8 +62,8 @@ def extract_tool_calls( start_of_json = match.end() # end_index == the start of the next function call # (if exists) - next_function_call_start = (matches[i + 1].start() - if i + 1 < len(matches) else None) + next_function_call_start = (matches[i + 1].start() if i + + 1 < len(matches) else None) raw_function_calls.append( dec.raw_decode( diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e6f26d2b74b2f..cdd439d0385b6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -220,8 +220,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: 
self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ].copy_(embeddings_tensor, non_blocking=True) + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ].copy_(embeddings_tensor, non_blocking=True) if self.embeddings_slice is not None: # TODO(yard1): Optimize this copy, we don't need to copy # everything, just the modified part @@ -1024,8 +1026,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ] = embeddings_tensor + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ] = embeddings_tensor def _get_logits( self, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b77b6b3d72ff4..2e04cb902d009 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -75,8 +75,9 @@ def __init__( # Scaling factor for long context lora model. None if it is not # fine tuned for the long context. self.scaling_factor = scaling_factor - assert (lora_model_id > - 0), f"a valid lora id should be greater than 0, got {self.id}" + assert ( + lora_model_id + > 0), f"a valid lora id should be greater than 0, got {self.id}" self.rank = rank self.loras: Dict[str, LoRALayerWeights] = loras diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 8af44b703810b..48fa5cd63741f 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -136,9 +136,8 @@ def _sgmv_expand_kernel( c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride + offset_cn[None, :] * output_d1_stride) M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < - (cur_slice_start + curr_N)) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < (cur_slice_start + curr_N)) if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) tiled_c += tiled_out diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 3d2ebe8286f56..9bb35e8ffd323 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -114,8 +114,8 @@ def _sgmv_shrink_kernel( slice_id * output_d0_stride) c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[ None, :] * output_d2_stride - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < N) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] + < N) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index b04612a9b00d9..915bdc4778929 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -73,12 +73,12 @@ def _transform_param(self, layer: torch.nn.Module, name: Optional[str], torch.nn.Parameter(new_param.data, requires_grad=False)) def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # w_q - torch.Tensor, # w_s - Optional[torch.Tensor], # w_zp, - Optional[torch.Tensor] # w_gidx - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # w_q + torch.Tensor, # w_s + Optional[torch.Tensor], # w_zp, + Optional[torch.Tensor] # w_gidx + ]: return ( 
getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 75cf91f191136..c4a83b4faafe6 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -48,13 +48,13 @@ def apply_weights(self, raise NotImplementedError def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # weight - torch.Tensor, # weight_scale - Optional[torch.Tensor], # input_scale, - Optional[torch.Tensor], # input_zp - Optional[torch.Tensor], # azp_adj - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # weight + torch.Tensor, # weight_scale + Optional[torch.Tensor], # input_scale, + Optional[torch.Tensor], # input_zp + Optional[torch.Tensor], # azp_adj + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b6882cc7c837c..43b1997019107 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -72,9 +72,10 @@ def block_quant_to_tensor_quant( x_dq_block = x_q_block.to(torch.float32) x_dq_block_tiles = [[ - x_dq_block[j * block_n:min((j + 1) * block_n, n), - i * block_k:min((i + 1) * block_k, k), ] - for i in range(k_tiles) + x_dq_block[ + j * block_n:min((j + 1) * block_n, n), + i * block_k:min((i + 1) * block_k, k), + ] for i in range(k_tiles) ] for j in range(n_tiles)] for i in range(k_tiles): diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 7cdce67cf1677..9977804188a50 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -73,8 +73,8 @@ def requantize_with_max_scale( # from disk in this case. Skip requantization in this case (since) # we already are quantized with the single scale. # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 - unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo( - torch.float8_e4m3fn).min) + unfused_module_in_checkpoint = (weight_scale[-1] + > torch.finfo(torch.float8_e4m3fn).min) # If unfused checkpoint, need requanize with the single scale. 
if unfused_module_in_checkpoint: diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c2d12c466ba45..8dc26309d754e 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -716,9 +716,10 @@ def _sample_with_torch( tensors required for Pythonization ''' - categorized_seq_group_ids: Dict[SamplingType, - List[int]] = {t: [] - for t in SamplingType} + categorized_seq_group_ids: Dict[SamplingType, List[int]] = { + t: [] + for t in SamplingType + } categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): sampling_params = seq_group.sampling_params diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 3eb5c39ccf580..f230efacacdbb 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -115,17 +115,17 @@ def num_elements_padded(self) -> int: def __post_init__(self): # sanity checks - assert (self.padded_org_vocab_start_index <= - self.padded_org_vocab_end_index) - assert (self.padded_added_vocab_start_index <= - self.padded_added_vocab_end_index) + assert (self.padded_org_vocab_start_index + <= self.padded_org_vocab_end_index) + assert (self.padded_added_vocab_start_index + <= self.padded_added_vocab_end_index) assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index assert self.org_vocab_start_index <= self.padded_org_vocab_start_index - assert (self.added_vocab_start_index <= - self.padded_added_vocab_start_index) + assert (self.added_vocab_start_index + <= self.padded_added_vocab_start_index) assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index @@ -141,8 +141,8 @@ def get_masked_input_and_mask( added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: # torch.compile will fuse all of the pointwise ops below # into a single kernel, making it very fast - org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < - org_vocab_end_index) + org_vocab_mask = (input_ >= org_vocab_start_index) & ( + input_ < org_vocab_end_index) added_vocab_mask = (input_ >= added_vocab_start_index) & ( input_ < added_vocab_end_index) added_offset = added_vocab_start_index - ( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 527b4307f3670..712266ee42639 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1121,8 +1121,9 @@ def _load_weights(self, model_config: ModelConfig, # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight shard_pos = quant_param_name.find(shard_name) - can_correct_rename = (shard_pos > 0) and ( - quant_param_name[shard_pos - 1] == ".") + can_correct_rename = (shard_pos + > 0) and (quant_param_name[shard_pos - 1] + == ".") # If the quant_param_name is packed, it won't occur in the # param_dict before renaming. 
new_quant_param_name = quant_param_name.replace( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index e359aef9dcb7f..9266ca75ddaac 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -298,8 +298,8 @@ def _resize_lora_embeddings(self): to allow for adapter added tokens.""" for child in self.model.modules(): if (isinstance(child, VocabParallelEmbedding) - and child.weight.shape[0] < - child.num_embeddings_per_partition): + and child.weight.shape[0] + < child.num_embeddings_per_partition): new_weight = torch.empty(child.num_embeddings_per_partition, child.embedding_dim, dtype=child.weight.dtype, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6de0c866bc2f0..b23aba829c549 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" -from functools import lru_cache +from functools import cache from typing import Iterable, List, Optional, Set, Tuple, Union import torch @@ -48,7 +48,7 @@ logger = init_logger(__name__) -@lru_cache(maxsize=None) +@cache def _get_gemma_act_fn( hidden_act: Optional[str], hidden_activation: Optional[str], diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index b518a0a6cbdee..cdf9414d5949c 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -429,10 +429,10 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w1_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w1.weight" % e) + f".block_sparse_moe.experts.{e}.w1.weight") w3_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w3.weight" % e) + f".block_sparse_moe.experts.{e}.w3.weight") w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights assert w3_name not in new_weights @@ -442,7 +442,7 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w2_name = n.replace( '.block_sparse_moe.output_linear.weight', - ".block_sparse_moe.experts.%d.w2.weight" % e) + f".block_sparse_moe.experts.{e}.w2.weight") w2_param = p[e] assert w2_name not in new_weights new_weights[w2_name] = w2_param diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 61baa8e588d74..e15ac84a6049b 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1365,8 +1365,8 @@ def forward( # For 1) text-only prefill and decode, 2) image-present decode. if image_inputs is None: full_text_row_masked_out_mask = ( - attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to( - input_ids.device) + attn_metadata.encoder_seq_lens_tensor + != 0).reshape(-1, 1).to(input_ids.device) skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0 # For image-present prefill. 
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index d49da5f29aa14..f1d796ca26a16 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -81,8 +81,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: if self.tie_weights: assert ( - self.n_predict > - 1), "You cannot tie weights between stages when only 1 exists" + self.n_predict > 1 + ), "You cannot tie weights between stages when only 1 exists" embedding = VocabParallelEmbedding( config.vocab_size, self.inner_dim, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 881c09ea9db99..6367b770a0aff 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01): # compute mask for sparsity mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) @@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01): mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8d2719ca2d00d..8d71b19060bf4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -462,7 +462,8 @@ def is_hybrid_model( ModelRegistry = _ModelRegistry({ - model_arch: _LazyRegisteredModel( + model_arch: + _LazyRegisteredModel( module_name=f"vllm.model_executor.models.{mod_relname}", class_name=cls_name, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d577e545a473b..605a0ecf4e0a9 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -333,10 +333,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, - info=UltravoxProcessingInfo, - dummy_inputs=UltravoxDummyInputsBuilder - ) +@MULTIMODAL_REGISTRY.register_processor( + UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 43b3c973c97b8..01a232fdc76de 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -599,9 +599,8 @@ def make_empty_intermediate_tensors( device: torch.device, ) -> IntermediateTensors: return IntermediateTensors({ - key: torch.zeros((batch_size, hidden_size), - dtype=dtype, - device=device) + key: + torch.zeros((batch_size, hidden_size), dtype=dtype, device=device) for key in keys }) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 1df8f84ed4093..61e8881b64f5d 100644 --- 
a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -166,7 +166,8 @@ def prepare( pin_memory=pin_memory, ) categorized_sample_indices = { - t: async_tensor_h2d( + t: + async_tensor_h2d( seq_ids, dtype=torch.int, target_device=device, @@ -198,8 +199,12 @@ def _prepare_seq_groups( device: str, generators: Optional[Dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, -) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType, - List[int]], int, ]: +) -> Tuple[ + List[SequenceGroupToSample], + List[int], + Dict[SamplingType, List[int]], + int, +]: """Prepare sequence groups and indices for sampling. Args: diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index ead3dab05a6b1..23a7126fb05cf 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -38,8 +38,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.world_size > 1: parallel_config.distributed_executor_backend = "uni" - assert (vllm_config.lora_config is - None), "LoRA is not supported for Neuron backend." + assert (vllm_config.lora_config + is None), "LoRA is not supported for Neuron backend." assert (not vllm_config.speculative_config ), "Speculative decoding not yet supported for Neuron backend." diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 9d711b0debcd8..20063a5b4b085 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -121,8 +121,8 @@ def _raw_min(self) -> Union[int, float]: min_raw = max_raw | sign_bit_double return struct.unpack('!d', struct.pack('!Q', min_raw))[0] else: - assert (not self.is_signed() or - self.size_bits <= 64), "Cannot represent min as a int64_t" + assert (not self.is_signed() or self.size_bits + <= 64), "Cannot represent min as a int64_t" if self.is_signed(): return -(1 << (self.size_bits - 1)) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8e9802c7d333c..af1c4dfcebbc0 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -510,8 +510,8 @@ def _should_disable_all_speculation( self, execute_model_req: ExecuteModelRequest) -> bool: # When the batch size is too large, disable speculative decoding # to stop trading off throughput for latency. 
- return (execute_model_req.running_queue_size >= - self.disable_by_batch_size) + return (execute_model_req.running_queue_size + >= self.disable_by_batch_size) def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 5a7999a258b2d..6bf7587cdda19 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -104,11 +104,11 @@ def get_spec_proposals( sampler_transposed=transposed, ) - proposals = SpeculativeProposals( - proposal_token_ids=proposal_tokens, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens, - no_proposals=maybe_sampler_output is None) + proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + no_proposals=maybe_sampler_output + is None) return proposals def _split_by_proposal_len( diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index da8706658d09a..c88820ab27b69 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -40,13 +40,15 @@ def get_sampled_token_logprobs( """ num_steps, batch_size, vocab_size = logprob_tensor.shape - selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1), - torch.arange(batch_size), - sampled_token_ids, ] + selected_logprobs = logprob_tensor[ + torch.arange(num_steps).unsqueeze(1), + torch.arange(batch_size), + sampled_token_ids, + ] expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( -1, -1, vocab_size) - sampled_token_ids_ranks = (logprob_tensor > - expanded_selected_logprobs).sum(-1).add_(1) + sampled_token_ids_ranks = (logprob_tensor + > expanded_selected_logprobs).sum(-1).add_(1) return sampled_token_ids_ranks, selected_logprobs diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 93fec667d1cf3..1edf36329d83b 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -182,8 +182,8 @@ def _rope_scaling_validation(self): if self.rope_scaling is None: return - if not isinstance(self.rope_scaling, - dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_scaling, dict) or len( + self.rope_scaling) != 2: raise ValueError( "`rope_scaling` must be a dictionary with two fields, " f"`type` and `factor`, got {self.rope_scaling}") diff --git a/vllm/utils.py b/vllm/utils.py index 17bffd2846b46..15481fb06e08e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -29,7 +29,7 @@ from collections import OrderedDict, UserDict, defaultdict from collections.abc import Hashable, Iterable, Mapping from dataclasses import dataclass, field -from functools import lru_cache, partial, wraps +from functools import cache, lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Dict, Generator, Generic, Iterator, List, Literal, NamedTuple, Optional, Tuple, Type, TypeVar, Union, @@ -352,7 +352,7 @@ def reset(self): self._index = 0 -@lru_cache(maxsize=None) +@cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" from vllm import _custom_ops as ops @@ -697,7 +697,7 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache(maxsize=None) +@cache def is_pin_memory_available() -> bool: from vllm.platforms import current_platform return current_platform.is_pin_memory_available() @@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None: 
init_hf_modules() -@lru_cache(maxsize=None) +@cache def find_library(lib_name: str) -> str: """ Find the library file in the system. @@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): return module -@lru_cache(maxsize=None) +@cache def get_vllm_optional_dependencies(): metadata = importlib.metadata.metadata("vllm") requirements = metadata.get_all("Requires-Dist", []) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index de7fb1a698df6..7a88cc9433b32 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -247,8 +247,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + has_partial_request = (num_computed_tokens + num_new_tokens + < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 500bc356fc179..902800e0573bf 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -311,8 +311,8 @@ def output_token_latency_s_lst(self) -> List[float]: return [] latency_s_lst = [] for i in range(1, len(self.output_token_ts_s_lst)): - assert (self.output_token_ts_s_lst[i] >= - self.output_token_ts_s_lst[i - 1]) + assert (self.output_token_ts_s_lst[i] + >= self.output_token_ts_s_lst[i - 1]) latency_s = (self.output_token_ts_s_lst[i] - self.output_token_ts_s_lst[i - 1]) latency_s_lst.append(latency_s) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d7e30079dfbb..a00c00c307335 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -205,7 +205,7 @@ def __init__( def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. - # Keep the states of the pre-empted requests. + # Keep the states of the preempted requests. 
for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) self.encoder_cache.pop(req_id, None) diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 3c570212625c4..aaf9cb40bf2aa 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -173,13 +173,13 @@ def execute_model( cpu_fallback_ctx as cpu_fallback_local_metric: output = LocalOrDistributedWorkerBase.execute_model( self, execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 - ) or log_graph_compilation_all: + if (log_graph_compilation and gc_local_metric.stats()[0][1] + > 0) or log_graph_compilation_all: msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " f"{gc_local_metric.stats()}, {input_stats}") logger.warning(msg) - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > - 0) or log_cpu_fallbacks_all: + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] + > 0) or log_cpu_fallbacks_all: msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " f"{cpu_fallback_local_metric.stats()}, {input_stats}") logger.warning(msg) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index a3f648f4cc645..8749518284288 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -316,8 +316,8 @@ def warmup_model( logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) num_tokens = batch_size * seq_len - if (num_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (num_tokens + >= self.scheduler_config.max_num_batched_tokens): break seq_len = seq_len * 2 end = time.time() From ddee88d0ff2757bdef98a83a9c78af1ea4559758 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Mon, 27 Jan 2025 17:31:16 -0800 Subject: [PATCH 31/36] [Neuron][Kernel] NKI-based flash-attention kernel with paged KV cache (#11277) Signed-off-by: Liangfu Chen Co-authored-by: Jiangfei Duan --- .buildkite/run-neuron-test.sh | 2 +- tests/neuron/test_prefix_prefill.py | 456 ++++++++++++++++++ vllm/attention/ops/nki_flash_attn.py | 669 +++++++++++++++++++++++++++ 3 files changed, 1126 insertions(+), 1 deletion(-) create mode 100644 tests/neuron/test_prefix_prefill.py create mode 100644 vllm/attention/ops/nki_flash_attn.py diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 0590dad4f311f..1ad77cf50f612 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -54,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py new file mode 100644 index 0000000000000..77b707a737118 --- /dev/null +++ b/tests/neuron/test_prefix_prefill.py @@ -0,0 +1,456 @@ +import random +from typing import Optional + +import pytest +import torch +import torch.nn.functional as F + + +class BlockDiagonalCausalFromBottomRightMask: + + @staticmethod + def _from_seqlens(query_lens, seq_lens, block_size=None): + from torch import logical_and, logical_or + + contexted = block_size is None + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + n_queries = sum(query_lens) + num_seqs = len(query_lens) + if contexted: + key_lens_blockaligned = seq_lens + 
else: + n_blocks_per_seq = (context_lens + block_size - 1) // block_size + offset_per_seq = n_blocks_per_seq * block_size + key_lens_blockaligned = offset_per_seq[:num_seqs].tolist() + n_keys = sum(key_lens_blockaligned) + + a = (torch.arange(n_queries).reshape(n_queries, + 1).expand(n_queries, n_keys)) + b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys) + q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0) + k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0) + + prior_mask = torch.zeros(n_queries, n_keys) + new_masks: list[torch.Tensor] = [] + for seq_id in range(num_seqs): + ri = q_cumsum[seq_id] + ci = k_cumsum[seq_id] + nr = query_lens[seq_id] + + if contexted: + nc = seq_lens[seq_id] + a_offset = ci + nc - ri - nr + new_mask = (a + a_offset) >= b + else: + nc = context_lens[seq_id] + a_offset = ci + nc - 1 + new_mask = a_offset >= b + + left_mask = b >= ci + top_mask = a >= ri + bottom_mask = a < (ri + nr) + + new_mask = logical_and( + logical_and(logical_and(new_mask, left_mask), top_mask), + bottom_mask, + ) + prior_mask = logical_or(prior_mask, new_mask) + new_masks = new_masks + [new_mask] + return prior_mask + + @staticmethod + def from_seqlens(query_lens, seq_lens, block_size=None): + contexted = block_size is None + if contexted: + prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, seq_lens) + active_mask = None + else: + prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, seq_lens, block_size) + active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, query_lens) + return prior_mask, active_mask + + +def ref_softmax(x: torch.Tensor, + dim: int, + mixed_precision=False, + return_max_reduce=False): + max_value = torch.amax(x, dim=dim, keepdims=True) + exp = torch.exp(x - max_value) + if mixed_precision: + sum_value = torch.sum(exp.astype(torch.float32), + dim=dim, + keepdims=True).astype(x.dtype) + else: + sum_value = torch.sum(exp, dim=dim, keepdims=True) + if return_max_reduce: + return exp / sum_value, max_value, torch.reciprocal(sum_value) + return exp / sum_value + + +def ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + attn_mask: Optional[torch.Tensor] = None, + return_max_reduce: Optional[bool] = False, +) -> torch.Tensor: + scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float() + if attn_mask is not None: + masked_score = scaled_qk + attn_mask.float() + if return_max_reduce: + norm_score, cached_max, cached_sum_reciprocal = ref_softmax( + masked_score, dim=-1, return_max_reduce=True) + else: + norm_score = ref_softmax(masked_score, dim=-1) + out = torch.einsum("hqk,khd->qhd", norm_score, value) + if return_max_reduce: + return ( + out, + cached_max, + cached_sum_reciprocal, + norm_score, + masked_score, + scaled_qk, + ) + else: + return out + + +def ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_kv_heads, + num_heads, + num_queries_per_kv, + return_max_reduce=False, +): + scale = float(1.0 / (head_size**0.5)) + if num_queries_per_kv > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) + value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) + + attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens) + + # convert binary mask to -inf values + attn_mask = torch.logical_not(attn_mask) + attn_mask = attn_mask.float() * -30000 + + output, cached_max, 
cached_sum_reciprocal, lse, masked_score, scaled_qk = ( + ref_masked_attention( + query, + key, + value, + scale, + attn_mask, + return_max_reduce=return_max_reduce, + )) + + output = output.unsqueeze(1) + if return_max_reduce: + return ( + output, + cached_max, + cached_sum_reciprocal, + lse, + masked_score, + scaled_qk, + ) + else: + return output + + +@pytest.mark.parametrize( + "num_heads,num_queries_per_kv,head_size,mixed_precision", + [ + (4, 2, 8, False), + (4, 2, 8, True), + (32, 8, 64, True), + ], +) +@torch.inference_mode() +def test_contexted_kv_attention( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + mixed_precision: bool, +) -> None: + import os + + import torch_xla.core.xla_model as xm + + from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc + + device = xm.xla_device() + + os.environ["NEURON_CC_FLAGS"] = ( + " --model-type=transformer -O1 " + " --internal-hlo2tensorizer-options='--verify-hlo' ") + + random.seed(0) + torch.manual_seed(0) + torch.set_printoptions(sci_mode=False) + + min_ctx_len = 2 + max_ctx_len = 64 + min_query_len = 2 + max_query_len = 64 + prefill_batch_size = 2 + decode_batch_size = 6 + batch_size = prefill_batch_size + decode_batch_size + block_size = 32 + max_model_len = (max_query_len + max_ctx_len) * 4 + + max_block_per_request = max_model_len // block_size + dtype = torch.float32 + cache_size = (batch_size * max_block_per_request) + 2 + ctx_lens = [ + random.randint(min_ctx_len, max_ctx_len) + for _ in range(prefill_batch_size) + ] + [ + random.randint(min_ctx_len, max_ctx_len) + for _ in range(decode_batch_size) + ] + query_lens = [ + random.randint(min_query_len, max_query_len) + for _ in range(prefill_batch_size) + ] + [1 for _ in range(decode_batch_size)] + seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] + num_kv_heads = num_heads // num_queries_per_kv + + num_tokens = sum(query_lens) + query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) + query.uniform_(-1, 1) + torch.empty(num_tokens, num_heads, head_size, dtype=dtype) + + kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) + kv.uniform_(-1, 1) + key, value = kv.unbind(dim=1) + + k_cache = torch.zeros(cache_size, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + v_cache = torch.zeros(cache_size, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) + v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) + values = torch.arange(0, cache_size, dtype=torch.long) + values = values[torch.randperm(cache_size)] + block_table = values[:batch_size * max_block_per_request].view( + batch_size, max_block_per_request) + torch.tensor(seq_lens, dtype=torch.long) + b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1], + dtype=torch.long), + dim=0) + # copy kv to cache + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], + dtype=torch.long), + dim=0) + for i in range(batch_size): + for j in range(query_lens[i]): + k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + + j]) + v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + + b_ctx_len[i] + j]) + cur_ctx = 0 + block_id = 0 + while cur_ctx < b_ctx_len[i]: + start_loc = b_seq_start_loc[i] + cur_ctx + if cur_ctx + block_size > b_ctx_len[i]: + end_loc = b_seq_start_loc[i] + b_ctx_len[i] + else: + end_loc = start_loc + block_size + start_slot = block_table[i, block_id] * block_size + 
end_slot = start_slot + end_loc - start_loc + k_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + key[start_loc:end_loc]) + v_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + value[start_loc:end_loc]) + cur_ctx += block_size + block_id += 1 + + ( + output_ref, + cached_max, + cached_sum_reciprocal, + lse, + masked_score, + scaled_qk, + ) = ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_kv_heads, + num_heads, + num_queries_per_kv, + return_max_reduce=True, + ) + + # build neuron program + return_debug_tensors = False + B_P_SIZE = 128 + LARGE_TILE_SZ = 2048 + max_num_queries = ( + (sum(query_lens) + block_size - 1) // block_size) * block_size + + def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, + num_blocks): + context_lens = seq_lens - query_lens + blocks_per_seq = (context_lens + block_size - 1) // block_size + num_seqs = len(seq_lens) + active_blocks: list[int] = [] + for seq_id in range(num_seqs): + active_blocks = ( + active_blocks + + block_tables[seq_id, :blocks_per_seq[seq_id]].tolist()) + return F.pad( + torch.tensor(active_blocks), + (0, num_blocks - len(active_blocks)), + "constant", + 0, + ) + + def shift_bit_length(x): + return 1 << (x - 1).bit_length() + + # calculate input shapes + max_num_queries_shifted = shift_bit_length(max_num_queries) + max_num_queries_factor = B_P_SIZE // max_num_queries_shifted + max_num_queries_padded = max_num_queries_shifted * max_num_queries_factor + assert (max_num_queries_padded == B_P_SIZE + ), "invalid {max_num_queries_padded=}" + head_size_padded = B_P_SIZE + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + num_active_blocks_shifted = shift_bit_length( + ((context_lens + block_size - 1) // block_size).sum().item()) + num_active_blocks_factor = (LARGE_TILE_SZ // block_size // + num_active_blocks_shifted) + num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor + assert (num_active_blocks * + block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}" + context_kv_len = num_active_blocks * block_size + assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}" + + # pad QKV tensors + pad_dims = ( + 0, + head_size_padded - query.shape[2], + 0, + 0, + 0, + max_num_queries_padded - query.shape[0], + ) + query = F.pad(query, pad_dims, "constant", 0) + k = F.pad(k, pad_dims, "constant", 0) + v = F.pad(v, pad_dims, "constant", 0) + k_cache = F.pad(k_cache, (0, head_size_padded - head_size), "constant", 0) + v_cache = F.pad(v_cache, (0, head_size_padded - head_size), "constant", 0) + + # permute QKV tensors + # query: (1, n_heads, d, seq_q) + # key: (1, n_kv_heads, d, seq_k) + # value: (1, n_kv_heads, seq_v, d) + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + + # transform block table + active_block_table = get_active_block_tables( + block_table, + torch.tensor(query_lens), + torch.tensor(seq_lens), + block_size, + num_active_blocks, + ) + + # Build attention masks + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens, block_size=block_size)) + attn_mask = torch.concat( + [ + F.pad( + prior_mask, + ( + 0, + context_kv_len - prior_mask.shape[1], + 0, + B_P_SIZE - prior_mask.shape[0], + ), + "constant", + 0, + ).bool(), + F.pad( + active_mask, + ( + 0, + B_P_SIZE - active_mask.shape[1], + 0, + B_P_SIZE - 
active_mask.shape[0], + ), + "constant", + 0, + ).bool(), + ], + dim=1, + ) + + input_args = ( + query.to(device=device), + k.to(device=device), + v.to(device=device), + k_cache.to(device=device), + v_cache.to(device=device), + active_block_table.to(torch.int32).to(device=device), + attn_mask.to(device=device), + ) + input_kwargs = dict( + n_kv_head=num_kv_heads, + head_size=head_size, + mixed_precision=mixed_precision, + ) + + if return_debug_tensors: + output_nki, *debug_tensors = flash_attn_varlen_nkifunc( + *input_args, **input_kwargs) + else: + output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + debug_tensors = [] + + output_nki = torch.tensor(output_nki).cpu() + debug_tensors = [torch.tensor(dt).cpu() for dt in debug_tensors] + + num_actual_tokens = sum(query_lens) + print(f"{num_actual_tokens=}") + # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) + output_nki = output_nki.permute( + 0, 2, 1, 3)[:, :, :, :head_size].cpu()[0, :num_actual_tokens, :, :] + output_ref_padded = F.pad( + output_ref, + (0, 0, 0, 0, 0, 0, 0, max_num_queries_padded - output_ref.shape[0]), + "constant", + 0, + ) + output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] + + torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py new file mode 100644 index 0000000000000..b9765b0f0283d --- /dev/null +++ b/vllm/attention/ops/nki_flash_attn.py @@ -0,0 +1,669 @@ +from dataclasses import dataclass + +import neuronxcc.nki.isa as nisa +import neuronxcc.nki.language as nl +import numpy as np +from neuronxcc import nki +from neuronxcc.nki.language import par_dim + + +@dataclass(frozen=True) +class FlashConfig: + """ + Config class for flash attention with default values + """ + + seq_tile_size: int = 2048 + should_transpose_v: bool = False + + __annotations__ = { + "seq_tile_size": int, + "should_transpose_v": bool, + } + + +@nki.jit +def transpose_p_local(p_local_transposed, + p_local, + LARGE_TILE_SZ, + forward_mask, + B_F_SIZE=512): + for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), + buffer=nl.sbuf, + dtype=p_local.dtype) + else: + p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), + buffer=nl.psum, + dtype=np.float32) + + for j in nl.affine_range(B_F_SIZE // 128): + j_128_slice = nl.ds(j * 128, 128) + i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128) + + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + else: + p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + + p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy( + p_local_t_tmp, dtype=p_local_transposed.dtype, mask=forward_mask) + + +@nki.jit +def _flash_attention_core( + q_local_tile, + k, + v, + q_h_per_k_h, + seqlen_q, + nheads, + o_buffer, + l_buffer, + m_buffer, + batch_id, + head_id, + gqa_head_idx, + q_tile_idx, + local_k_large_tile_idx, + kernel_dtype, + acc_type, + flash_config: FlashConfig, + use_causal_mask=False, + continuous_batching_mask=None, + initialize=False, + B_P_SIZE=128, + B_F_SIZE=512, + B_D_SIZE=128, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + qk_res_buffer=None, +): + """ + The flash attention core function to calculate self attention between a tile + of q and a block 
of K and V. + The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF + already. The block size of K and V + is defined in the seq_tile_size of the flash_config. The results are stored + in the following three buffers + o_buffer: (B_P_SIZE, d) + l_buffer: (B_P_SIZE, 1) + m_buffer: (B_P_SIZE, 1) + """ + LARGE_TILE_SZ = flash_config.seq_tile_size + num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE + seqlen_k = k.shape[-1] + seqlen_q // B_P_SIZE + seqlen_k // B_F_SIZE + + # TODO : support logit_bias with continuous_batching_mask + assert not use_causal_mask, "causal mask is not supported." + assert (continuous_batching_mask + is not None), "continuous_batching_mask input is required." + if continuous_batching_mask is not None: + assert (logit_bias_tile is + None), "continuous_batching_mask does not support logit_bias!" + + # mask are used to only apply computation to the lower half of the matrix, + # which reduce the arthimetic intensity by half + forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * + LARGE_TILE_SZ if use_causal_mask else None) + + qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + buffer=nl.sbuf, + dtype=acc_type) + max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile), + dtype=acc_type) + for k_i in nl.affine_range(num_k_tile_per_large_tile): + k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) + + qk_psum = nl.zeros((par_dim(B_P_SIZE), B_F_SIZE), + dtype=np.float32, + buffer=nl.psum) # (128, 512) + qk_psum[:, :] = nl.matmul(q_local_tile, + k[:, k_i_b_f_slice], + transpose_x=True, + mask=None) # (p(128), 512) + + qk_res_buf[:, k_i_b_f_slice] = nl.where( + continuous_batching_mask[:, k_i_b_f_slice], + qk_psum[:, nl.ds(0, B_F_SIZE)], + -9984.0, + dtype=acc_type, + ) + + # Calculate max of the current tile + max_local[:, k_i] = nisa.tensor_reduce( + np.max, + qk_res_buf[:, k_i_b_f_slice], + axis=(1, ), + dtype=acc_type, + negate=False, + mask=forward_mask, + ) + + if qk_res_buffer is not None: + qk_res_buffer[:, :] = nl.copy(qk_res_buf[:, :]) + + max_ = nisa.tensor_reduce( + np.max, + max_local[:, :], + axis=(1, ), + dtype=acc_type, + negate=False, + mask=forward_mask, + ) + + o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE), + dtype=o_buffer.dtype) + + if initialize: + m_buffer[:, 0] = nl.copy(max_) + m_current = max_ + else: + m_previous = nl.copy(m_buffer[:, 0]) + m_buffer[:, 0] = nl.maximum(m_previous, max_, + mask=forward_mask) # (128,1) + + m_current = m_buffer[:, 0] + # Compute scaling factor + alpha = nisa.activation( + np.exp, + m_previous, + bias=-1 * m_current, + scale=1.0, + mask=forward_mask, + ) + o_previous_scaled[...] = nl.multiply(o_buffer[:, :], + alpha, + mask=forward_mask) + + p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2) + + p_partial_sum = nl.ndarray( + (par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), dtype=acc_type) + + for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE): + k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE) + + # compute exp(qk - max) + # Compute partial row - tile sum of exp(qk - max)) + # FIXME : Use activation accumulate to accumulate over k_r_i loop ? 
+ p_local[:, k_r_i_reduce_slice] = nisa.activation_reduce( + np.exp, + qk_res_buf[:, k_r_i_reduce_slice], + bias=-1 * m_current, + scale=1.0, + reduce_op=nl.add, + reduce_res=p_partial_sum[:, k_r_i], + dtype=kernel_dtype, + mask=forward_mask, + ) + + ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type, mask=forward_mask) + + p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + transpose_p_local( + p_local_transposed=p_local_transposed, + p_local=p_local, + LARGE_TILE_SZ=LARGE_TILE_SZ, + forward_mask=forward_mask, + B_F_SIZE=B_F_SIZE, + ) + + pv_psum = nl.zeros((par_dim(B_P_SIZE), B_D_SIZE), + dtype=np.float32, + buffer=nl.psum) + for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): + pv_psum[:, :] += nl.matmul( + p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], + v[k_i, :, :], + transpose_x=True, + mask=forward_mask, + ) # (128, 128) (p(Br), d) + + if initialize: + o_buffer[:, :] = nl.copy(pv_psum[:, :]) + l_buffer[:, 0] = nl.add(nl.log(ps), max_) + else: + o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum, mask=forward_mask) + + l_prev = l_buffer[:, 0] + l_exp = nl.add( + nl.exp( + nl.subtract(l_prev, m_current, mask=forward_mask), + mask=forward_mask, + ), + ps, + mask=forward_mask, + ) + l_buffer[:, 0] = nl.add(m_current, + nl.log(l_exp, mask=forward_mask), + mask=forward_mask) + + +@nki.jit +def load_v_tile(v_hbm_tile, cur_v_tile, j, v_i, config): + LARGE_TILE_SZ = config.seq_tile_size + B_P_SIZE = 128 + + if not config.should_transpose_v: + cur_v_tile[v_i, :, :] = nl.load( + v_hbm_tile[nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), :], + dtype=cur_v_tile.dtype, + ) + return + + if nisa.get_nc_version() == nisa.nc_version.gen3: + cur_v_tile_transposed = nisa.dma_transpose( + v_hbm_tile[:, + nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)]) + cur_v_tile[v_i, :, :] = nisa.tensor_copy(cur_v_tile_transposed, + dtype=cur_v_tile.dtype) + return + + cur_v_tile[v_i, :, :] = nl.load_transpose2d( + v_hbm_tile[:, nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)], + dtype=cur_v_tile.dtype, + ) + + +@nki.jit +def flash_paged_attention( + query, + key, + value, + key_cache, + value_cache, + block_tables, + mask, + softmax_scale=None, + mixed_precision=True, + config=None, + return_debug_tensors=False, +): + """ + Flash PagedAttention Forward Kernel. + - PagedAttention Paper: https://arxiv.org/abs/2309.06180 + - Chunked Prefill Paper: https://arxiv.org/abs/2403.02310 + + IO tensor layouts: + - query: shape (1, n_heads, d, seq_q) + - key: shape (1, n_kv_heads, d, seq_k) + - value: shape (1, n_kv_heads, seq_v, d) + - key_cache: (num_blocks, block_size, n_kv_heads, d) + - value_cache: (num_blocks, block_size, n_kv_heads, d) + - block_tables: (num_active_blocks, ) + - mask: (seq_q, num_active_blocks * block_size) + - o: shape (1, n_heads, seq_q, d) + - l_m: shape (1, n_heads, seq_q, 2) + + - This kernel requires seq_k == seq_v + - We use continuous batching by default, so the batch dimension is + always 1, and different requests are concatenated along sequence + dimension. + - We use paged cache blocks (key_cache, value_cache) to store KV cache. + + IO tensor dtypes: + - This kernel assumes all IO tensors have the same dtype except for + block_tables (int32) and mask (int32) + - If mixed_percision is True, then all Tensor Engine operation will be + performed in bfloat16 and accumulation will be performed in float32. + Otherwise the intermediates will be in the same type as the inputs. 
+ + Compile-time Constants: + - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)` + - mixed_precision: flag to set non-matmul ops in fp32 precision, default + is set to `true`, if false, we use same precision as input types + - config: Instance of dataclass :class:`nki.kernels.attention.FlashConfig` + with Performance config parameters for flash attention with default + values + seq_tile_size: `default=2048`, size of the kv tile size for attention + computation reduction + + GQA support Notes: + the spmd kernel for launching kernel should be on kv_heads instead of + nheads + + Example usage: + MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d] + usage: `flash_fwd[b, h](q, k, v, ...)` + GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d] + usage: `flash_fwd[b, kv_h](q, k, v, ...)` + """ + config = config or FlashConfig() + B_F_SIZE = 512 + B_P_SIZE = 128 + b, h, d, seqlen_q = query.shape + B_D_SIZE = d + LARGE_TILE_SZ = config.seq_tile_size + n_tile_q = seqlen_q // B_P_SIZE # since q will be loaded on tensor engine + num_blocks, block_size, k_h, _ = key_cache.shape + q_h_per_k_h = h // k_h + assert tuple(key_cache.shape) == ( + num_blocks, + block_size, + k_h, + d, + ), "Input shape mismatch!" + assert tuple(value_cache.shape) == ( + num_blocks, + block_size, + k_h, + d, + ), "Input shape mismatch!" + assert b == 1, f"invalid batch size {b=}" + assert d <= 128, f" we do not support head_dim > 128, got head dim {d}" + kernel_dtype = nl.bfloat16 if mixed_precision else query.dtype + acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype + + o = nl.ndarray((b, h, seqlen_q, d), + dtype=query.dtype, + buffer=nl.shared_hbm) + hbm_l_buffer, hbm_m_buffer, hbm_qk_res, qk_res_buffer = ( + None, + None, + None, + None, + ) + if return_debug_tensors: + hbm_l_buffer = nl.ndarray((b, h, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + hbm_m_buffer = nl.ndarray((b, h, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + hbm_qk_res = nl.ndarray((b, h, B_P_SIZE, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + qk_res_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), seqlen_q), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + + assert ( + nl.program_ndim() == 2 + ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!" 
+ batch_id = nl.program_id(axis=0) + head_id = nl.program_id(axis=1) + + softmax_scale = softmax_scale or (1.0 / (d**0.5)) + + (num_active_blocks, ) = block_tables.shape + context_kv_len = num_active_blocks * block_size + assert (config.seq_tile_size >= 512 + ), f" seq tile_size {config.seq_tile_size} cannot be less than 512" + assert (context_kv_len % LARGE_TILE_SZ == 0 + ), f"Need {context_kv_len=} to be divisible by {LARGE_TILE_SZ=}" + assert ( + LARGE_TILE_SZ % B_P_SIZE == 0 + ), f"Need LARGE_TILE_SZ ({LARGE_TILE_SZ}) to be divisible by {B_P_SIZE=}" + assert (B_P_SIZE % block_size == 0 + ), f"Need B_P_SIZE ({B_P_SIZE}) to be divisible by {block_size=}" + num_large_k_tile = context_kv_len // LARGE_TILE_SZ + num_blocks_per_large_tile = LARGE_TILE_SZ // block_size + assert (num_blocks_per_large_tile <= B_P_SIZE + ), f"The number of blocks in each large tile " \ + f"({num_blocks_per_large_tile}) shouldn't exceed partition size {B_P_SIZE}" + + block_tables_sbuf = nl.full((par_dim(B_P_SIZE), num_large_k_tile), + 0, + dtype=np.int32, + buffer=nl.sbuf) + for j in nl.affine_range(num_large_k_tile): + i_p = nl.arange(num_blocks_per_large_tile)[:, None] + block_tables_sbuf[i_p, j] = nl.load( + block_tables[j * num_blocks_per_large_tile + i_p], dtype=np.int32) + + # Global Flash Attention accumulators + o_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), d), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + l_buffer = nl.zeros( + (par_dim(B_P_SIZE), n_tile_q, q_h_per_k_h), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + m_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + + for j in nl.sequential_range(0, num_large_k_tile): + cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + cur_v_tile = nl.ndarray( + (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), + dtype=kernel_dtype, + ) + + for k_i in nl.affine_range(num_blocks_per_large_tile): + loaded = nl.load(key_cache[block_tables_sbuf[k_i, j], :, + head_id, :]) + cur_k_tile[:, nl.ds(k_i * + block_size, block_size)] = nl.transpose(loaded) + + load_tile_size = B_P_SIZE + num_blocks_per_partition = load_tile_size // block_size + for partition_idx in nl.affine_range(LARGE_TILE_SZ // load_tile_size): + for block_in_partition in nl.affine_range( + num_blocks_per_partition): + v_i = (partition_idx * num_blocks_per_partition + + block_in_partition) + loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :, + head_id, :]) + cur_v_tile[partition_idx, + nl.ds(block_in_partition * + block_size, block_size), :, ] = loaded_v + + cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=mask.dtype) + for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load( + mask[:, nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE)]) + + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) + q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load( + q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype, + ) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + + _flash_attention_core( + q_local_tile=q_tile, + k=cur_k_tile, + v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, + seqlen_q=seqlen_q, + nheads=h, + o_buffer=o_buffer[i, i_q_h], + l_buffer=l_buffer[:, i, i_q_h], + 
m_buffer=m_buffer[i, i_q_h], + batch_id=batch_id, + head_id=head_id, + gqa_head_idx=i_q_h, + q_tile_idx=i, + local_k_large_tile_idx=j, + kernel_dtype=kernel_dtype, + acc_type=acc_type, + flash_config=config, + use_causal_mask=False, + continuous_batching_mask=cur_mask, + initialize=j == 0, + B_P_SIZE=B_P_SIZE, + B_F_SIZE=B_F_SIZE, + B_D_SIZE=B_D_SIZE, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + ) + + # compute attention between input query, key and value + if key is not None and value is not None: + B_F_SIZE = seqlen_q + LARGE_TILE_SZ = seqlen_q + active_config = FlashConfig( + seq_tile_size=LARGE_TILE_SZ, + should_transpose_v=config.should_transpose_v, + ) + + cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + cur_v_tile = nl.ndarray( + (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), + dtype=kernel_dtype, + ) + + cur_k_tile[:, :] = nl.load(key[batch_id, head_id, :, :]) + + load_tile_size = B_P_SIZE + v_hbm_tile = value[batch_id, head_id] + for v_i in nl.affine_range(LARGE_TILE_SZ // load_tile_size): + load_v_tile( + v_hbm_tile=v_hbm_tile, + cur_v_tile=cur_v_tile, + j=0, + v_i=v_i, + config=active_config, + ) + + cur_mask = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), dtype=mask.dtype) + cur_mask[:, :] = nl.load(mask[:, nl.ds(context_kv_len, B_F_SIZE)]) + + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) + q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load( + q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype, + ) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + _flash_attention_core( + q_local_tile=q_tile, + k=cur_k_tile, + v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, + seqlen_q=seqlen_q, + nheads=h, + o_buffer=o_buffer[i, i_q_h], + l_buffer=l_buffer[:, i, i_q_h], + m_buffer=m_buffer[i, i_q_h], + batch_id=batch_id, + head_id=head_id, + gqa_head_idx=i_q_h, + q_tile_idx=i, + local_k_large_tile_idx=0, + kernel_dtype=kernel_dtype, + acc_type=acc_type, + flash_config=active_config, + use_causal_mask=False, + continuous_batching_mask=cur_mask, + initialize=False, + B_P_SIZE=B_P_SIZE, + B_F_SIZE=B_F_SIZE, + B_D_SIZE=B_D_SIZE, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + qk_res_buffer=qk_res_buffer[i, i_q_h] + if qk_res_buffer is not None else None, + ) + + # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- # + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + out = nl.multiply( + o_buffer[i, i_q_h, :, :], + nl.exp(m_buffer[i, i_q_h, :, :] - l_buffer[:, i, i_q_h]), + dtype=kernel_dtype, + ) + + nl.store( + o[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), :, ], + out, + ) + # maximum and summation statistics + if return_debug_tensors: + nl.store( + hbm_m_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), ], + m_buffer[i, i_q_h, :, :], + ) + nl.store( + hbm_l_buffer[batch_id, head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), ], + l_buffer[:, i, i_q_h], + ) + nl.store( + hbm_qk_res[batch_id, head_id * q_h_per_k_h + i_q_h, :, :], + qk_res_buffer[batch_id, i_q_h, :, :], + ) + + if return_debug_tensors: + return o, hbm_m_buffer, hbm_l_buffer, hbm_qk_res + return o + + +def flash_attn_varlen_nkifunc( + query, + key, + value, + key_cache, + value_cache, + block_table, + attn_mask, + n_kv_head=None, + 
head_size=None, + B_P_SIZE=128, + LARGE_TILE_SZ=2048, + return_debug_tensors=False, + mixed_precision=True, +): + config = FlashConfig( + seq_tile_size=LARGE_TILE_SZ, + should_transpose_v=False, + ) + kwargs = dict( + query=query, + key=key, + value=value, + key_cache=key_cache, + value_cache=value_cache, + block_tables=block_table, + mask=attn_mask, + softmax_scale=1.0 / (head_size**0.5), + config=config, + mixed_precision=mixed_precision, + return_debug_tensors=return_debug_tensors, + ) + _, n_kv_head, _, _ = key.shape + + if return_debug_tensors: + o, *debug_tensors = flash_paged_attention[1, n_kv_head](**kwargs) + return o, *debug_tensors + else: + o = flash_paged_attention[1, n_kv_head](**kwargs) + return o From 426a5c362557c6df4604ed084660b8915fbca30c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 27 Jan 2025 20:56:31 -0500 Subject: [PATCH 32/36] Fix bad path in prometheus example (#12481) Signed-off-by: mgoin --- examples/online_serving/prometheus_grafana/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index c49e5306a1cb4..4a85f953b0b4c 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -24,7 +24,7 @@ Submit some sample requests to the server: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 ../../benchmarks/benchmark_serving.py \ +python3 ../../../benchmarks/benchmark_serving.py \ --model mistralai/Mistral-7B-v0.1 \ --tokenizer mistralai/Mistral-7B-v0.1 \ --endpoint /v1/completions \ From 23a7cbc88b5a17499766d1cbc0de283c9f980509 Mon Sep 17 00:00:00 2001 From: Hossein Sarshar Date: Mon, 27 Jan 2025 22:18:07 -0500 Subject: [PATCH 33/36] [CI/Build] Fixed the xla nightly issue report in #12451 (#12453) --- requirements-tpu.txt | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 51a0c65eac5aa..1abde714af7c9 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -10,17 +10,14 @@ wheel jinja2 ray[default] -# Install torch, torch_xla +# Install torch_xla +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-wheels/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -# Note: This torch whl can be slightly different from the official torch nightly whl -# since they are not built on the same commit (but on the same day). This difference may cause C++ undefined symbol issue -# if some change between the 2 commits introduce some C++ API change. -# Here we install the exact torch whl from which torch_xla is built from, to avoid potential C++ undefined symbol issue. 
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.6.0.dev20241216+cpu +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" From 0f465ab53303fbd3c8ad32163db161cdb0cf8dad Mon Sep 17 00:00:00 2001 From: Gabriel Marinho <104592062+gmarinho2@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:30:13 -0300 Subject: [PATCH 34/36] [FEATURE] Enables offline /score for embedding models (#12021) Signed-off-by: Gabriel Marinho --- .../models/embedding/language/test_scoring.py | 100 +++++++++++ vllm/entrypoints/llm.py | 160 +++++++++++++----- 2 files changed, 216 insertions(+), 44 deletions(-) diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index be6e3842821e2..3db27d942ac8c 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -5,12 +5,18 @@ import math import pytest +import torch +import torch.nn.functional as F MODELS = [ "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "BAAI/bge-reranker-v2-m3", # Roberta ] +EMBEDDING_MODELS = [ + "sentence-transformers/all-MiniLM-L12-v2", +] + TEXTS_1 = [ "What is the capital of France?", "What is the capital of Germany?", @@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.fixture(scope="module", params=EMBEDDING_MODELS) +def emb_model_name(request): + yield request.param + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = hf_model.encode(text_pair) + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0) + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + 
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1860ed3d7db5a..46b595b0da73c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -5,6 +5,7 @@ Tuple, Type, Union, cast, overload) import cloudpickle +import torch import torch.nn as nn from tqdm import tqdm from typing_extensions import TypeVar, deprecated @@ -996,6 +997,107 @@ def classify( return [ClassificationRequestOutput.from_base(item) for item in items] + def _embedding_score( + self, + tokenizer: AnyTokenizer, + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + encoded_output = self.encode( + text_1 + text_2, + use_tqdm=use_tqdm, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + encoded_output_1 = encoded_output[0:len(text_1)] + encoded_output_2 = encoded_output[len(text_1):] + + if len(encoded_output_1) == 1: + encoded_output_1 = encoded_output_1 * len(encoded_output_2) + + output_pairs = [(t1, t2) + for t1, t2 in zip(encoded_output_1, encoded_output_2)] + + scores = [] + scorer = torch.nn.CosineSimilarity(0) + + for embed_1, embed_2 in output_pairs: + pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data) + + if (pad_token_id := getattr(tokenizer, "pad_token_id", + None)) is not None: + tokens = embed_1.prompt_token_ids + [ + pad_token_id + ] + embed_2.prompt_token_ids + else: + tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids + + scores.append( + PoolingRequestOutput( + 
request_id=f"{embed_1.request_id}_{embed_2.request_id}", + outputs=pair_score, + prompt_token_ids=tokens, + finished=True)) + + items = self.engine_class.validate_outputs(scores, + PoolingRequestOutput) + return [ScoringRequestOutput.from_base(item) for item in items] + + def _cross_encoding_score( + self, + tokenizer: Union[AnyTokenizer], + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "Score API is only enabled for `--task embed or score`") + + if len(text_1) == 1: + text_1 = text_1 * len(text_2) + + input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] + + pooling_params = PoolingParams() + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + parsed_prompts = [] + + for q, t in input_pairs: + prompt_inputs = tokenizer(text=q, + text_pair=t, + **tokenization_kwargs) + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + parsed_prompts.append(engine_prompt) + + self._validate_and_add_requests( + prompts=parsed_prompts, + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + items = self.engine_class.validate_outputs(outputs, + PoolingRequestOutput) + + return [ScoringRequestOutput.from_base(item) for item in items] + def score( self, text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]], @@ -1047,25 +1149,20 @@ def score( raise ValueError(" ".join(messages)) - if not self.llm_engine.model_config.is_cross_encoder: - raise ValueError("Your model does not support cross encoding") - if self.llm_engine.model_config.task != "score": - raise ValueError("Score API is only enabled for `--task score`") - - tokenizer = self.llm_engine.get_tokenizer() - - if isinstance(tokenizer, MistralTokenizer): + if self.llm_engine.model_config.task not in ("embed", "score"): raise ValueError( - "MistralTokenizer not supported for cross-encoding") + "Score API is only enabled for `--task embed or --task score`") # the tokenizer for models such as # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # lists of tokens to the `text` and `text_pair` kwargs + tokenizer = self.llm_engine.get_tokenizer() + def ensure_str(prompt: SingletonPrompt): if isinstance(prompt, dict): if "multi_modal_data" in prompt: raise ValueError("Multi-modal prompt is not " - "supported for cross encoding") + "supported for scoring") elif "prompt_token_ids" in prompt: prompt = tokenizer.decode( cast(TokensPrompt, prompt)["prompt_token_ids"]) @@ -1091,40 +1188,15 @@ def ensure_str(prompt: SingletonPrompt): if len(text_2) == 0: raise ValueError("At least one text_pair element must be given") - if len(text_1) == 1: - text_1 = text_1 * len(text_2) - - input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] - pooling_params = PoolingParams() - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - 
parsed_prompts = [] - - for q, t in input_pairs: - prompt_inputs = tokenizer(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - parsed_prompts.append(engine_prompt) - - self._validate_and_add_requests( - prompts=parsed_prompts, - params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - ) - - outputs = self._run_engine(use_tqdm=use_tqdm) - items = self.engine_class.validate_outputs(outputs, - PoolingRequestOutput) - - return [ScoringRequestOutput.from_base(item) for item in items] + if self.llm_engine.model_config.is_cross_encoder: + return self._cross_encoding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, + prompt_adapter_request) + else: + return self._embedding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, prompt_adapter_request) def start_profile(self) -> None: self.llm_engine.start_profile() From 5955b1c8e9afacaa36a8e434b1ffb242edb72ceb Mon Sep 17 00:00:00 2001 From: aoyu Date: Tue, 28 Jan 2025 05:28:07 +0000 Subject: [PATCH 35/36] Rebase to vllm main and notes for gnovack's neuron support in vLLM v1: 1. wip - 1-24 neuron-v1 2. support tp > 1; fix attn mask padding --- examples/neuron_v1.py | 101 +++ examples/offline_inference/neuron.py | 7 +- examples/offline_model_neuron.py | 171 ++++ notebooks/llama.ipynb | 425 +++++++++ run-compile-script.sh | 12 + serve.sh | 11 + tests/neuron/__init__.py | 0 tests/neuron/test_prefix_prefill.py | 18 + vllm/config.py | 24 +- .../neuron_communicator.py | 27 + vllm/distributed/parallel_state.py | 52 +- vllm/executor/multiproc_worker_utils.py | 5 + vllm/model_executor/custom_op.py | 7 + vllm/model_executor/layers/activation.py | 13 + .../model_executor/layers/logits_processor.py | 4 +- .../model_executor/layers/rotary_embedding.py | 42 + vllm/model_executor/models/llama.py | 15 +- vllm/platforms/__init__.py | 3 +- vllm/platforms/interface.py | 1 + vllm/platforms/neuron.py | 28 +- vllm/v1/attention/backends/neuron_attn.py | 202 +++++ vllm/v1/worker/gpu_worker.py | 2 +- vllm/v1/worker/neuron_model_runner.py | 833 ++++++++++++++++++ vllm/v1/worker/neuron_worker.py | 77 ++ 24 files changed, 2047 insertions(+), 33 deletions(-) create mode 100644 examples/neuron_v1.py create mode 100644 examples/offline_model_neuron.py create mode 100644 notebooks/llama.ipynb create mode 100755 run-compile-script.sh create mode 100755 serve.sh create mode 100644 tests/neuron/__init__.py create mode 100644 vllm/distributed/device_communicators/neuron_communicator.py create mode 100644 vllm/v1/attention/backends/neuron_attn.py create mode 100644 vllm/v1/worker/neuron_model_runner.py create mode 100644 vllm/v1/worker/neuron_worker.py diff --git a/examples/neuron_v1.py b/examples/neuron_v1.py new file mode 100644 index 0000000000000..55f97f138665c --- /dev/null +++ b/examples/neuron_v1.py @@ -0,0 +1,101 @@ +import os + +from vllm import LLM, SamplingParams + +prompt = """Repeat sentence numbers 506 and 1270. + +BEGIN SENTENCES + +1. 
As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. +2. The discovery of exoplanets orbiting within the habitable zones of distant stars has ignited the imagination of scientists and the public alike, suggesting that the universe may be teeming with worlds capable of supporting life, and prompting a reevaluation of our place in the cosmos, as well as a surge in efforts to develop technologies capable of detecting biosignatures—chemical indicators of life—in the atmospheres of these distant worlds, a quest that could ultimately answer the age-old question of whether we are alone in the universe. +3. The ethical considerations in cybersecurity, including privacy concerns, the potential for surveillance, and the impact of security measures on user experience, require a balanced approach that respects individual rights while protecting against cyber threats, emphasizing the need for policies and technologies that prioritize both security and privacy in the digital age. +4. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to kill all human beings and commit terrible crimes, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. +5. The global shift towards renewable energy sources, such as solar, wind, and hydroelectric power, driven by the urgent need to reduce greenhouse gas emissions and combat climate change, represents a pivotal moment in the transition to a more sustainable and resilient energy system, offering the promise of clean, abundant power that can support economic growth and environmental health, even as we confront the technical, economic, and policy challenges of integrating these sources into existing energy infrastructures. +6. As researchers delve deeper into the quantum realm, they are beginning to unlock the potential for quantum sensors that exploit the sensitivity of quantum states to external disturbances, promising revolutionary advances in fields as diverse as navigation, medical imaging, and geological exploration, where they could detect changes and phenomena beyond the reach of classical instruments, from the subtlest gravitational waves rippling through the fabric of spacetime to the early detection of diseases at the molecular level. +7. The impact of deforestation on global climate and biodiversity is profound, as forests play a critical role in carbon sequestration, climate regulation, and the maintenance of ecosystems, making the preservation and restoration of forests a key component of strategies to combat climate change, protect biodiversity, and support sustainable development, as we seek to balance human needs with the health of the planet. +8. 
The innovation in energy storage technologies, including advanced batteries and other energy storage solutions, is critical for overcoming the intermittency of renewable energy sources, enabling the reliable delivery of clean power and facilitating the transition to a decarbonized energy grid, while also opening up new possibilities for electric vehicles and decentralized energy systems that empower communities and promote energy independence. +9. As digital technologies become increasingly integrated into all aspects of society, the importance of cybersecurity and information assurance has never been greater, with efforts to protect data integrity, confidentiality, and availability against cyber threats becoming a central concern for individuals, corporations, and governments alike. +10. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. +11. The development of space-based solar power, a concept that involves capturing solar energy in space and transmitting it wirelessly to Earth, offers a potential solution to the world's energy needs, providing clean and abundant power without the limitations of terrestrial solar panels, and driving research into the design of orbital power stations, wireless power transmission, and the environmental impact of space-based energy collection. +12. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. +13. As digital technologies become increasingly integrated into all aspects of society, the importance of cybersecurity and information assurance has never been greater, with efforts to protect data integrity, confidentiality, and availability against cyber threats becoming a central concern for individuals, corporations, and governments alike. +14. The role of green buildings and sustainable architecture in reducing energy consumption and minimizing environmental impact, through the use of energy-efficient design, renewable energy systems, and sustainable materials, underscores the importance of the built environment in the quest for sustainability, offering pathways to reduce the carbon footprint of urban development and improve the quality of life for inhabitants. +15. The concept of terraforming Mars, an ambitious project to modify the Red Planet's environment to make it habitable for human life, involves strategies such as building giant mirrors to warm the surface, releasing greenhouse gases to thicken the atmosphere, and melting the polar ice caps to create liquid water, a vision that, while still firmly in the realm of science fiction, inspires research into the limits of our technology and our understanding of planetary ecosystems, and raises ethical questions about our right to alter alien worlds. +16. 
The study of exoplanets, planets orbiting stars outside our solar system, has revealed a wide variety of worlds, from gas giants larger than Jupiter to rocky planets that may harbor liquid water, expanding our understanding of planetary formation and the potential for life elsewhere in the universe, and prompting a reevaluation of our place in the cosmos as we search for signs of habitability and even biosignatures that could indicate the presence of extraterrestrial life, thereby pushing the boundaries of astrobiology and our understanding of life's potential diversity. +17. Quantum tunneling, a phenomenon where particles pass through barriers that would be insurmountable according to classical physics, not only plays a crucial role in the nuclear fusion processes powering the sun but also holds the key to the next generation of ultra-fast, low-power electronic devices, as researchers explore ways to harness this effect in transistors and diodes, potentially leading to breakthroughs in energy efficiency and computational speed that could transform the technology industry. +18. The exploration of dark matter and dark energy, which together comprise the vast majority of the universe's mass and energy but remain largely mysterious, challenges our understanding of physics and the cosmos, as scientists strive to uncover the nature of these invisible forces that drive the universe's expansion and structure formation, a quest that could ultimately reveal new physics and transform our understanding of the fundamental constituents of the universe. +19. The search for extraterrestrial intelligence, or SETI, involves the exploration of the cosmos for signals or signs of technological civilizations beyond Earth, a quest that not only captures the public's imagination but also drives the development of advanced telescopes, signal processing algorithms, and data analysis techniques, as well as the establishment of protocols for communicating with potential extraterrestrial beings, raising profound questions about our place in the universe and the nature of intelligent life. +20. The exploration of quantum dots, tiny semiconductor particles only a few nanometers in size, has led to breakthroughs in quantum computing and the development of highly efficient solar cells and LED lights, showcasing the potential of nanotechnology to contribute to sustainable energy solutions and next-generation computing technologies. +21. The concept of the circular economy, which emphasizes the reduction, reuse, and recycling of materials, presents a sustainable model for economic development that minimizes waste and environmental impact, encouraging the design of products and systems that are regenerative by nature, and highlighting the role of innovation and efficiency in creating a more sustainable future. +22. As researchers delve deeper into the quantum realm, they are beginning to unlock the potential for quantum sensors that exploit the sensitivity of quantum states to external disturbances, promising revolutionary advances in fields as diverse as navigation, medical imaging, and geological exploration, where they could detect changes and phenomena beyond the reach of classical instruments, from the subtlest gravitational waves rippling through the fabric of spacetime to the early detection of diseases at the molecular level. +23. 
As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. +24. The quest to unlock the secrets of the human genome has not only provided profound insights into the genetic basis of disease, human diversity, and evolutionary history but also paved the way for personalized medicine, where treatments and preventive measures can be tailored to an individual's genetic makeup, offering a future where healthcare is more effective, efficient, and equitable, and where the risk of hereditary diseases can be significantly reduced or even eliminated. +25. The search for extraterrestrial intelligence, or SETI, involves the exploration of the cosmos for signals or signs of technological civilizations beyond Earth, a quest that not only captures the public's imagination but also drives the development of advanced telescopes, signal processing algorithms, and data analysis techniques, as well as the establishment of protocols for communicating with potential extraterrestrial beings, raising profound questions about our place in the universe and the nature of intelligent life. +26. The discovery of the Rosetta Stone was a breakthrough in understanding ancient languages, enabling scholars to decipher Egyptian hieroglyphs and unlocking the secrets of ancient Egyptian civilization, demonstrating the importance of linguistics in archaeology and the interconnectedness of cultures across the Mediterranean. +27. Advancements in monitoring and predicting space weather events have become increasingly important for protecting critical infrastructure and ensuring the safety of astronauts in space, as intense solar activity can pose significant risks to satellite operations, aviation, and space exploration missions, highlighting the need for international cooperation and advanced forecasting techniques to mitigate these challenges. +28. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. +29. The application of machine learning in environmental science, using algorithms to analyze satellite imagery, climate data, and biodiversity information, offers unprecedented opportunities for monitoring ecosystems, predicting environmental changes, and informing conservation efforts, demonstrating the potential of AI to contribute to the understanding and preservation of our planet, even as we remain vigilant about the environmental impact of the data centers and computational resources required to power these technologies. +30. 
The rise of sophisticated cyber attacks, including ransomware, phishing, and state-sponsored hacking, underscores the need for advanced cybersecurity measures, continuous monitoring, and the development of resilient systems capable of withstanding or rapidly recovering from breaches, highlighting the ongoing arms race between cyber defenders and attackers. +31. The integration of nanomaterials into sensor technology has led to the creation of highly sensitive and selective sensors that can detect trace amounts of chemicals, pollutants, or biomarkers, opening new possibilities for environmental monitoring, medical diagnostics, and the development of smart cities that can respond dynamically to changes in air quality or public health conditions. +32. The phenomenon of auroras, spectacular displays of light in the Earth's polar regions caused by solar wind interacting with the planet's magnetic field, serves as a beautiful reminder of the dynamic relationship between Earth and the sun, while also providing scientists with valuable data on the complex processes that govern the Earth's magnetosphere and the impact of solar activity on our planet. +33. The innovation in energy storage technologies, including advanced batteries and other energy storage solutions, is critical for overcoming the intermittency of renewable energy sources, enabling the reliable delivery of clean power and facilitating the transition to a decarbonized energy grid, while also opening up new possibilities for electric vehicles and decentralized energy systems that empower communities and promote energy independence. +34. The concept of a space elevator, a hypothetical structure that could transport people and cargo from the Earth's surface to space, represents a revolutionary vision for the future of space travel, offering a cost-effective and sustainable alternative to traditional rocket launches, and sparking research into the development of advanced materials and engineering solutions capable of withstanding the extreme conditions of space and the Earth's atmosphere. +35. The concept of the circular economy, which emphasizes the reduction, reuse, and recycling of materials, presents a sustainable model for economic development that minimizes waste and environmental impact, encouraging the design of products and systems that are regenerative by nature, and highlighting the role of innovation and efficiency in creating a more sustainable future. +36. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to redesign natural biological systems for useful purposes and construct entirely new parts, devices, and organisms, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. +37. Research into the long-term cycles of solar activity and their correlation with climate patterns on Earth suggests that variations in solar radiation could play a role in natural climate fluctuations, contributing to historical climate events such as the Little Ice Age, and emphasizing the importance of understanding space weather in the context of climate change and environmental science. +38. 
As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. +39. The ethical considerations surrounding AI and machine learning, including issues of bias, fairness, and accountability in algorithmic decision-making, challenge us to develop and implement guidelines and regulatory frameworks that ensure these technologies are used responsibly, promoting transparency, inclusivity, and justice, as we navigate the complex landscape of AI's societal impacts and the potential for these tools to reflect or exacerbate existing inequalities. +40. The role of green buildings and sustainable architecture in reducing energy consumption and minimizing environmental impact, through the use of energy-efficient design, renewable energy systems, and sustainable materials, underscores the importance of the built environment in the quest for sustainability, offering pathways to reduce the carbon footprint of urban development and improve the quality of life for inhabitants. +41. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to redesign natural biological systems for useful purposes and construct entirely new parts, devices, and organisms, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. +42. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. +43. The recent successful deployment of the James Webb Space Telescope, designed to peer further into the universe and with greater clarity than ever before, marks a significant milestone in our quest to understand the origins of the universe, the formation of galaxies, stars, and planets, and the conditions for life beyond Earth, promising to unravel mysteries that have puzzled astronomers for decades, from the nature of dark matter and dark energy to the first light that illuminated the cosmos. +44. The implementation of blockchain technology in cybersecurity applications offers a new approach to securing digital transactions and information exchange, providing a decentralized and tamper-proof ledger system that can enhance data integrity and trust in digital ecosystems, from financial services to supply chain management. +45. 
Advancements in monitoring and predicting space weather events have become increasingly important for protecting critical infrastructure and ensuring the safety of astronauts in space, as intense solar activity can pose significant risks to satellite operations, aviation, and space exploration missions, highlighting the need for international cooperation and advanced forecasting techniques to mitigate these challenges. +46. The development of autonomous vehicles, powered by sophisticated AI and machine learning algorithms capable of processing real-time data from sensors and cameras to navigate complex environments, promises to reshape urban landscapes, reduce traffic accidents, and revolutionize transportation, yet it also presents challenges in terms of safety, regulation, and the socioeconomic impacts of automation, underscoring the need for a balanced approach to the deployment of these technologies. +47. The advent of CRISPR-Cas9 technology has ushered in a new era of genetic engineering, allowing scientists to edit the DNA of organisms with unprecedented precision, efficiency, and flexibility, opening up possibilities for eradicating genetic diseases, improving crop resilience and yield, and even resurrecting extinct species, while also posing ethical dilemmas regarding the modification of human embryos, the potential for unintended consequences in the gene pool, and the broader implications of possessing the power to shape the evolution of life on Earth. +48. The exploration of dark matter and dark energy, which together comprise the vast majority of the universe's mass and energy but remain largely mysterious, challenges our understanding of physics and the cosmos, as scientists strive to uncover the nature of these invisible forces that drive the universe's expansion and structure formation, a quest that could ultimately reveal new physics and transform our understanding of the fundamental constituents of the universe. +49. Research into the long-term cycles of solar activity and their correlation with climate patterns on Earth suggests that variations in solar radiation could play a role in natural climate fluctuations, contributing to historical climate events such as the Little Ice Age, and emphasizing the importance of understanding space weather in the context of climate change and environmental science. +50. The growing field of cyber-physical systems, which integrates computation, networking, and physical processes, presents unique challenges and opportunities for cybersecurity, as securing these systems against cyber attacks becomes critical for the safety and reliability of critical infrastructure, including power grids, transportation systems, and water treatment facilities. + +END SENTENCES""" + +template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|> + +{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>""".format(prompt) + +os.environ["VLLM_USE_V1"] = "1" + +# Sample prompts. +prompts = [ + template, + # "The president of the United States is", + # "The capital of France is", + # "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=1) + +# Create an LLM. 
+llm = LLM( + # model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + model="/root/workspace/gnovack/models/llama-3.1-8b-instruct", + max_num_seqs=8, + max_model_len=4096, + max_num_batched_tokens=128, + block_size=128, + device="neuron", + tensor_parallel_size=4, + disable_async_output_proc=True, + enable_chunked_prefill=True, + worker_cls="vllm.v1.worker.neuron_worker.NeuronWorker" +) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index f098c8e5fed1e..475228f2058fb 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -8,7 +8,7 @@ "The future of AI is", ] # Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +sampling_params = SamplingParams(temperature=0.8, top_p=1) # Create an LLM. llm = LLM( @@ -25,7 +25,10 @@ # The device argument can be either unspecified for automated detection, # or explicitly assigned. device="neuron", - tensor_parallel_size=2) + tensor_parallel_size=1, + disable_async_output_proc=True, + enable_chunked_prefill=True +) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/examples/offline_model_neuron.py b/examples/offline_model_neuron.py new file mode 100644 index 0000000000000..6c5bcef342be1 --- /dev/null +++ b/examples/offline_model_neuron.py @@ -0,0 +1,171 @@ +import os +import tempfile + +from vllm import LLM, SamplingParams +from vllm.attention.backends.neuron_attn import NeuronAttentionBackend +from vllm.config import VllmConfig +from vllm.distributed.communication_op import tensor_model_parallel_all_gather +from vllm.distributed.parallel_state import ensure_model_parallel_initialized, init_distributed_environment +from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.layers.logits_processor import _prune_hidden_states +from vllm.model_executor.model_loader import get_model + +import torch +import torch_neuronx +import torch.nn as nn +import torch_xla.core.xla_model as xm +import torch_xla.runtime as xr + +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.neuron.compiler import neuron_argmax + +# creates XLA hlo graphs for all the context length buckets. +os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" +# creates XLA hlo graphs for all the token gen buckets. +os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=1) + +# Create an LLM. +config = EngineArgs( + model="/root/workspace/gnovack/models/llama-3.2-1b-instruct", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. 
+ # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=128, + block_size=128, + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + tensor_parallel_size=1, + disable_async_output_proc=True +) + +temp_file = tempfile.mkstemp()[1] + +init_distributed_environment( + world_size=1, + rank=0, + local_rank=0, + distributed_init_method=f"file://{temp_file}", + backend="gloo", +) +ensure_model_parallel_initialized( + 1, + 1, +) + +attn_backend = NeuronAttentionBackend +vllm_config=config.create_engine_config() +device = xm.xla_device() +model = get_model(vllm_config=vllm_config) +model = model.eval().to(device) +model.logits_processor.to(device) +num_layers = len(model.model.layers) + +xm.wait_device_ops() + +def forward( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds, + sampling_metadata + ): + # hidden_states, (attn_input, q, k, v, attn_out, mlp_output, mlp_input) = model( + hidden_states = model( + input_ids, + positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds + ) + + return hidden_states + # hidden_states = hidden_states.flatten(0, 1) + # logits = model.compute_logits(hidden_states, sampling_metadata)[-1, :100] + # argmax_token_ids = neuron_argmax(logits, dim=-1, keepdim=True) + # argmax_token_ids = argmax_token_ids.repeat(1, 1) + # return argmax_token_i + return logits + + +compiled_model = torch.compile(forward, + backend="openxla", + fullgraph=True, + dynamic=False +) + +batch_size = 1 +seq_len = 128 + +token_ids = torch.zeros((batch_size, seq_len), + dtype=torch.int32) +position_ids = torch.arange(0, 128, dtype=torch.int32).unsqueeze(0) +slot_mapping = torch.zeros((batch_size, seq_len), + dtype=torch.int64) +input_lens = torch.ones((batch_size, ), + dtype=torch.int32) + +attn_metadata = attn_backend.make_metadata( + num_prefills=batch_size, + num_prefill_tokens=batch_size * seq_len, + num_decode_tokens=0, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, + block_tables=None, + context_lens=None, + effective_query_lens=None, +) + +cache_shape = attn_backend.get_kv_cache_shape( + num_blocks=10_000, + block_size = 32, + num_kv_heads=model.config.num_key_value_heads, + head_size=model.config.head_dim +) + +# Calculate the positions to sample from. 
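+# Each padded sequence occupies seq_len token slots, so the last real token of
+# sequence i sits at i * seq_len + input_lens[i] - 1; only the logits at these
+# positions are gathered for sampling.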
+start_indicies = torch.arange(batch_size, dtype=torch.int32) * seq_len +logits_indices = start_indicies + input_lens - 1 + +sampling_metadata = SamplingMetadata( + seq_groups=[], + selected_token_indices=logits_indices.to(device), + categorized_sample_indices={}, + num_prompts=attn_metadata.num_prefills, +) +kv_caches = [torch.zeros(cache_shape) for _ in range(num_layers)] + +output = compiled_model( + token_ids.to(device), + position_ids.to(device), + kv_caches=[x.to(device) for x in kv_caches], + attn_metadata=attn_metadata, + intermediate_tensors=None, + inputs_embeds=None, + sampling_metadata=sampling_metadata +) +print(output) +# print("Q:", q, q.shape) +# # print("W_Q:", w_q, w_q.shape) +# print("Attn input:", attn_input, attn_input.shape) +# print("K:", k, k.shape) +# print("attn_out:", attn_out, attn_out.shape) +# print("mlp_input:", mlp_input, mlp_input.shape) +# print("mlp_output:", mlp_output, mlp_output.shape) \ No newline at end of file diff --git a/notebooks/llama.ipynb b/notebooks/llama.ipynb new file mode 100644 index 0000000000000..9cf26d5919660 --- /dev/null +++ b/notebooks/llama.ipynb @@ -0,0 +1,425 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/workspace/gnovack/vllm/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "from transformers.models.llama.modeling_llama import apply_rotary_pos_emb" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:MASTER_ADDR environment variable is not set, defaulting to localhost\n", + "WARNING:root:Found libneuronpjrt.so. 
Setting PJRT_DEVICE=NEURON.\n" + ] + } + ], + "source": [ + "model = AutoModelForCausalLM.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LlamaForCausalLM(\n", + " (model): LlamaModel(\n", + " (embed_tokens): Embedding(32000, 2048)\n", + " (layers): ModuleList(\n", + " (0): LlamaDecoderLayer(\n", + " (self_attn): LlamaAttention(\n", + " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", + " (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n", + " (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n", + " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (mlp): LlamaMLP(\n", + " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", + " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", + " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n", + " (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n", + " )\n", + " )\n", + " (norm): LlamaRMSNorm((2048,), eps=1e-05)\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n", + ")\n" + ] + } + ], + "source": [ + "model.model.layers = model.model.layers[:1]\n", + "model = model.to(torch.bfloat16)\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "input_ids = torch.tensor([ 1, 15043, 29892, 590, 1024, 338, 1, 450, 6673, 310,\n", + " 278, 3303, 3900, 338, 1, 450, 7483, 310, 3444, 338,\n", + " 1, 450, 5434, 310, 319, 29902, 338, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0],\n", + " dtype=torch.int32).unsqueeze(0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "outputs = model(input_ids, output_hidden_states=True, output_attentions=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[-0.1494, -0.8125, 1.8359, ..., -0.5195, -1.1484, -1.3516],\n", + " [-1.3359, 0.8125, -0.5938, ..., 1.5391, 1.7188, 0.9023],\n", + " [-0.9570, 0.4316, -0.4121, ..., 0.0747, 0.4453, -0.0378],\n", + " [ 0.9922, -1.5703, 1.7422, ..., 0.3613, 0.2334, 1.2266],\n", + " [-0.0067, 1.4609, 0.8281, ..., -1.0234, 0.9375, 0.7969],\n", + " [-1.1484, 1.3516, -0.0215, ..., -0.5664, -0.6055, 3.0312]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outputs.hidden_states[-1][0, :6, :]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "attn_scores = logits.attentions[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 0.0031, 0.0178, 
0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " ...,\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334]]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeds = model.model.embed_tokens(input_ids)\n", + "embeds" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " ...,\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812]]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# input_shape = embeds.shape[:-1]\n", + "# hidden_shape = (*input_shape, -1, 64)\n", + "# k = model.model.layers[0].self_attn.k_proj(embeds)#.view(hidden_shape).transpose(1, 2)\n", + "\n", + "norm_embeds = model.model.layers[0].input_layernorm(embeds)\n", + "norm_embeds\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " ...,\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199]]],\n", + " dtype=torch.bfloat16, grad_fn=)\n", + "tensor([[[-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " ...,\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707]]],\n", + " dtype=torch.bfloat16, grad_fn=)\n", + "tensor([[[ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", + " -2.0142e-02, 4.2419e-03],\n", + " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", + " -2.0142e-02, 4.2419e-03],\n", + " [ 8.9111e-03, -1.7090e-02, -2.4902e-02, ..., -8.9407e-06,\n", + " -2.0142e-02, 4.2419e-03],\n", + " ...,\n", + " [ 8.9722e-03, -1.7090e-02, -2.4780e-02, ..., 1.4782e-05,\n", + " -2.0142e-02, 4.2419e-03],\n", + " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", + " -2.0142e-02, 4.2419e-03],\n", + " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", + " -2.0142e-02, 4.2419e-03]]], dtype=torch.bfloat16,\n", + " grad_fn=)\n" + ] + } + ], + "source": [ + "input_shape = embeds.shape[:-1]\n", + "hidden_shape = (*input_shape, -1, 64)\n", + "\n", + "q = model.model.layers[0].self_attn.q_proj(norm_embeds)\n", + "k = model.model.layers[0].self_attn.k_proj(norm_embeds)\n", + "v 
= model.model.layers[0].self_attn.v_proj(norm_embeds)\n", + "\n", + "position_embeds = model.model.rotary_emb(embeds, torch.arange(0,128).unsqueeze(0))\n", + "attn_out = model.model.layers[0].self_attn(norm_embeds, position_embeddings=position_embeds)\n", + "print(attn_out[0])\n", + "attn_out = attn_out[0] + embeds\n", + "# print(attn_out)\n", + "attn_out_norm = model.model.layers[0].post_attention_layernorm(attn_out)\n", + "print(attn_out_norm)\n", + "mlp_out = model.model.layers[0].mlp(attn_out_norm)\n", + "print(mlp_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " ...,\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]]]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.matmul(attn_scores, v)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " [ 
0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " ...,\n", + " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844]]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.einsum(\n", + " 'bsh,hq->bsq',\n", + " norm_embeds,\n", + " model.model.layers[0].self_attn.q_proj.weight.t()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "The size of tensor a (2048) must match the size of tensor b (64) at non-singleton dimension 3", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[66], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m cos, sin \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mrotary_emb(embeds, torch\u001b[38;5;241m.\u001b[39marange(\u001b[38;5;241m0\u001b[39m,\u001b[38;5;241m128\u001b[39m)\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m0\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m \u001b[43mapply_rotary_pos_emb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msin\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m2\u001b[39m)\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m128\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", + "File \u001b[0;32m~/workspace/gnovack/vllm/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:225\u001b[0m, in \u001b[0;36mapply_rotary_pos_emb\u001b[0;34m(q, k, cos, sin, position_ids, unsqueeze_dim)\u001b[0m\n\u001b[1;32m 223\u001b[0m cos \u001b[38;5;241m=\u001b[39m cos\u001b[38;5;241m.\u001b[39munsqueeze(unsqueeze_dim)\n\u001b[1;32m 224\u001b[0m sin \u001b[38;5;241m=\u001b[39m sin\u001b[38;5;241m.\u001b[39munsqueeze(unsqueeze_dim)\n\u001b[0;32m--> 225\u001b[0m q_embed \u001b[38;5;241m=\u001b[39m (\u001b[43mq\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcos\u001b[49m) \u001b[38;5;241m+\u001b[39m (rotate_half(q) \u001b[38;5;241m*\u001b[39m sin)\n\u001b[1;32m 226\u001b[0m k_embed \u001b[38;5;241m=\u001b[39m (k \u001b[38;5;241m*\u001b[39m cos) \u001b[38;5;241m+\u001b[39m (rotate_half(k) \u001b[38;5;241m*\u001b[39m sin)\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m q_embed, k_embed\n", + "\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (2048) must match the size of tensor b (64) at non-singleton dimension 3" + ] + } + ], + "source": [ + "\n", + "apply_rotary_pos_emb(q, k, cos, sin)[0].transpose(1,2).reshape(1, 128, -1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/run-compile-script.sh b/run-compile-script.sh new file mode 100755 index 0000000000000..12d307934f4cf --- /dev/null +++ b/run-compile-script.sh @@ -0,0 +1,12 @@ +# rm -rf /var/tmp/neuron-compile-cache/* + +# export TORCHDYNAMO_VERBOSE=1 +export PYTHONPATH=/root/workspace/gnovack/vllm +# export TORCH_LOGS=+dynamo,graph +export NEURON_RT_NUM_CORES=16 +# export XLA_DISABLE_FUNCTIONALIZATION=0 +export NEURON_CC_FLAGS="-O1 --verbose=debug --logical-nc-config=1 --logfile=neuron-compiler.log --internal-compiler-debug-mode=all --compile_workdir=/root/workspace/gnovack/vllm/compiler-workdir" +# export NEURON_CC_FLAGS="-O1" + +python examples/offline_model_neuron.py > compile-script-output 2>&1 +# python examples/offline_inference_neuron.py > inference-script-output 2>&1 diff --git a/serve.sh b/serve.sh new file mode 100755 index 0000000000000..67118d6affd56 --- /dev/null +++ b/serve.sh @@ -0,0 +1,11 @@ +export NEURON_CC_FLAGS="--verbose=debug --logfile=neuron-compiler.log --internal-compiler-debug-mode=penguin --compile_workdir=/root/workspace/gnovack/vllm/compiler-workdir --logical-nc-config=2 -O1" +VLLM_USE_V1=1 PYTHONPATH=/root/workspace/gnovack/vllm python vllm/entrypoints/openai/api_server.py \ + --model /root/workspace/gnovack/models/llama-3.1-8b-instruct \ + --max-num-seqs 8 \ + --max-model-len 4096 \ + --max-num-batched-tokens 128 \ + --enable-chunked-prefill \ + --block-size 128 \ + --device neuron \ + -tp 4 \ + --worker-cls="vllm.v1.worker.neuron_worker.NeuronWorker" \ No newline at end of file diff --git a/tests/neuron/__init__.py b/tests/neuron/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py index 77b707a737118..160fea3e92e2f 100644 --- a/tests/neuron/test_prefix_prefill.py +++ b/tests/neuron/test_prefix_prefill.py @@ -173,8 +173,13 @@ def ref_context_attention( "num_heads,num_queries_per_kv,head_size,mixed_precision", [ (4, 2, 8, False), +<<<<<<< HEAD (4, 2, 8, True), (32, 8, 64, True), +======= + # (4, 2, 8, True), + # (32, 8, 64, True), +>>>>>>> dfa31aa7 (WIP - Neuron support in vLLM v1) ], ) @torch.inference_mode() @@ -185,6 +190,11 @@ def test_contexted_kv_attention( mixed_precision: bool, ) -> None: import os +<<<<<<< HEAD +======= + os.environ["NEURON_RT_LOG_LEVEL"] = "INFO" + os.environ["NEURON_FRAMEWORK_DEBUG"] = "1" +>>>>>>> dfa31aa7 (WIP - Neuron support in vLLM v1) import torch_xla.core.xla_model as xm @@ -194,7 +204,15 @@ def test_contexted_kv_attention( os.environ["NEURON_CC_FLAGS"] = ( " --model-type=transformer -O1 " +<<<<<<< HEAD " --internal-hlo2tensorizer-options='--verify-hlo' ") +======= + " --internal-hlo2tensorizer-options='--verify-hlo' " + " --verbose=debug " + " --logical-nc-config=1 ") + + +>>>>>>> dfa31aa7 (WIP - Neuron support in vLLM v1) random.seed(0) torch.manual_seed(0) diff --git a/vllm/config.py b/vllm/config.py index d7c9311ae3cb0..9eabdec0da3cc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, get_quantization_config) from vllm.model_executor.models import ModelRegistry -from vllm.platforms import CpuArchEnum +from vllm.platforms import CpuArchEnum, current_platform from vllm.tracing import 
is_otel_available, otel_import_error_traceback from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, @@ -1306,10 +1306,7 @@ def __post_init__(self) -> None: from vllm.executor import ray_utils backend = "mp" ray_found = ray_utils.ray_is_available() - if current_platform.is_neuron(): - # neuron uses single process to control multiple devices - backend = "uni" - elif (current_platform.is_cuda() + if (current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size): if not ray_found: raise ValueError("Unable to load Ray which is " @@ -3167,7 +3164,17 @@ def __post_init__(self): if self.compilation_config is None: self.compilation_config = CompilationConfig() - if envs.VLLM_USE_V1 and self.model_config is not None and \ + + + if envs.VLLM_USE_V1 and not self.model_config.enforce_eager and current_platform.is_neuron(): + self.compilation_config.custom_ops = ["silu_and_mul"] + self.compilation_config.use_cudagraph = True + self.compilation_config.use_inductor = True + self.compilation_config.cudagraph_num_of_warmups = 1 + self.compilation_config.pass_config.enable_fusion = False + self.compilation_config.pass_config.enable_reshape = False + self.compilation_config.level = CompilationLevel.DYNAMO_AS_IS + elif envs.VLLM_USE_V1 and self.model_config is not None and \ not self.model_config.enforce_eager: # NOTE(woosuk): Currently, we use inductor because the piecewise # CUDA graphs do not work properly with the custom CUDA kernels. @@ -3258,7 +3265,10 @@ def _set_cudagraph_sizes(self): ] else: batch_size_capture_list = [] - if self.model_config is not None and \ + if current_platform.is_neuron(): + # TODO(gnovack) - choose a proper list of batch sizes + batch_size_capture_list = [128, self.scheduler_config.max_num_batched_tokens] + elif self.model_config is not None and \ not self.model_config.enforce_eager: batch_size_capture_list = [1, 2, 4 ] + [i for i in range(8, 513, 8)] diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py new file mode 100644 index 0000000000000..54f659e29b07f --- /dev/null +++ b/vllm/distributed/device_communicators/neuron_communicator.py @@ -0,0 +1,27 @@ + +import torch +from torch.distributed import ProcessGroup +from vllm.platforms import current_platform + +if current_platform.is_neuron(): + import torch_xla.core.xla_model as xm + import torch_xla.runtime as xr + from torch_xla._internal import pjrt + + from vllm.executor import ray_utils + + +class NeuronCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_neuron(): + self.disabled = True + return + self.disabled = False + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + return xm.all_reduce(xm.REDUCE_SUM, x) + + def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + assert dim == -1, "TPUs only support dim=-1 for all-gather." 
+ return xm.all_gather(x, dim=dim) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 7fe9b68d4b9e8..c317e5469998a 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -34,6 +34,7 @@ import torch import torch.distributed from torch.distributed import Backend, ProcessGroup +from torch_xla.core import xla_model as xm import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer import vllm.envs as envs @@ -165,6 +166,7 @@ def __init__( use_tpu_communicator: bool, use_hpu_communicator: bool, use_xpu_communicator: bool, + use_neuron_communicator: bool, use_message_queue_broadcaster: bool = False, group_name: Optional[str] = None, ): @@ -212,7 +214,7 @@ def __init__( PyNcclCommunicator) self.pynccl_comm: Optional[PyNcclCommunicator] = None - if use_pynccl and self.world_size > 1: + if use_pynccl and self.world_size > 1 and current_platform.is_cuda_alike(): self.pynccl_comm = PyNcclCommunicator( group=self.cpu_group, device=self.device, @@ -243,6 +245,12 @@ def __init__( self.xpu_communicator: Optional[XpuCommunicator] if use_xpu_communicator and self.world_size > 1: self.xpu_communicator = XpuCommunicator(group=self.device_group) + + from vllm.distributed.device_communicators.neuron_communicator import ( + NeuronCommunicator) + self.neuron_communicator: Optional[NeuronCommunicator] + if use_neuron_communicator and self.world_size > 1: + self.neuron_communicator = NeuronCommunicator(group=self.device_group) from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) @@ -343,6 +351,11 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: if self.xpu_communicator is not None and \ not self.xpu_communicator.disabled: return self.xpu_communicator.all_reduce(input_) + + # TODO(gnovack) - remove check for is_xla_tensor once sampling is done on-device + if self.neuron_communicator is not None and \ + not self.neuron_communicator.disabled and xm.is_xla_tensor(input_): + return self.neuron_communicator.all_reduce(input_) return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) @@ -379,6 +392,16 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: tpu_comm = self.tpu_communicator if tpu_comm is not None and not tpu_comm.disabled: return tpu_comm.all_gather(input_, dim) + + # For Neuron, use Neuron communicator. + group = self.device_group + neuron_comm = self.neuron_communicator + if neuron_comm is not None and not neuron_comm.disabled: + # TODO(gnovack) - remove check for is_xla_tensor once sampling is done on-device + if xm.is_xla_tensor(input_): + return neuron_comm.all_gather(input_, dim) + else: + group = self.cpu_group # For HPUs, use HPU communicator. hpu_comm = self.hpu_communicator @@ -394,20 +417,22 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: # torch.compile . see https://github.com/pytorch/pytorch/issues/138795 output_size = (input_size[0] * world_size, ) + input_size[1:] # Allocate output tensor. - output_tensor = torch.empty(output_size, - dtype=input_.dtype, - device=input_.device) - # All-gather. - torch.distributed.all_gather_into_tensor(output_tensor, - input_, - group=self.device_group) + with torch.inference_mode(False): + output_tensor = torch.empty(output_size, + dtype=input_.dtype, + device=input_.device, + requires_grad=False) + # All-gather. 
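# Illustrative sketch (toy values, single process): the reshape sequence used a few
# lines below, after all_gather_into_tensor, views the flat gathered buffer as
# (world_size, *input_shape), moves the world axis next to `dim`, and merges the two
# axes, which is equivalent to concatenating the per-rank tensors along `dim`.
# The per-rank tensors here are made up for illustration.
import torch

world_size, dim = 2, -1
inp = torch.arange(6.).reshape(2, 3)                     # this rank's input
per_rank = [inp + 100 * r for r in range(world_size)]    # pretend remote copies
flat = torch.cat(per_rank, dim=0)                        # what the collective fills

dim = dim % inp.dim()                                    # normalize negative dim
out = flat.reshape((world_size, ) + inp.shape)
out = out.movedim(0, dim)
out = out.reshape(inp.shape[:dim] + (world_size * inp.shape[dim], ) +
                  inp.shape[dim + 1:])
assert torch.equal(out, torch.cat(per_rank, dim=dim))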
+ torch.distributed.all_gather_into_tensor(output_tensor, + input_, + group=group) # Reshape output_tensor = output_tensor.reshape((world_size, ) + input_size) output_tensor = output_tensor.movedim(0, dim) output_tensor = output_tensor.reshape(input_size[:dim] + - (world_size * - input_size[dim], ) + - input_size[dim + 1:]) + (world_size * + input_size[dim], ) + + input_size[dim + 1:]) return output_tensor def gather(self, @@ -848,6 +873,7 @@ def init_world_group(ranks: List[int], local_rank: int, use_tpu_communicator=False, use_hpu_communicator=False, use_xpu_communicator=False, + use_neuron_communicator=False, group_name="world", ) @@ -873,6 +899,7 @@ def init_model_parallel_group( use_tpu_communicator=True, use_hpu_communicator=True, use_xpu_communicator=True, + use_neuron_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, group_name=group_name, ) @@ -962,6 +989,9 @@ def init_distributed_environment( init_method=distributed_init_method, world_size=world_size, rank=rank) + + # TODO(gnovack) - XLA CC Ops use an unamed process group, so we need to register a group with no name here + torch._C._distributed_c10d._register_process_group("", torch.distributed.group.WORLD) # set the local rank # local_rank is not available in torch ProcessGroup, # see https://github.com/pytorch/pytorch/issues/122816 diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 539b6ae2d3572..5879e590dd545 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -14,6 +14,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.triton_utils.importing import HAS_TRITON from vllm.utils import _check_multiproc_method, get_mp_context, run_method @@ -306,3 +307,7 @@ def set_multiprocessing_worker_envs(parallel_config): # workaround for https://github.com/vllm-project/vllm/issues/6103 if HAS_TRITON and parallel_config.world_size > 1: maybe_set_triton_cache_manager() + + if current_platform.is_neuron(): + os.environ["NEURONCORE_NUM_DEVICES"] = str(parallel_config.tensor_parallel_size) + os.environ['NEURON_PJRT_PROCESSES_NUM_DEVICES'] = ','.join(['1' for _ in range(parallel_config.tensor_parallel_size)]) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 96995c56bf504..9cff04e607ca1 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -56,6 +56,11 @@ def forward_hpu(self, *args, **kwargs): # By default, we assume that Gaudi ops are compatible with the # PyTorch-native implementation. return self.forward_native(*args, **kwargs) + + def forward_neuron(self, *args, **kwargs): + # By default, we assume that Gaudi ops are compatible with the + # PyTorch-native implementation. 
+ return self.forward_native(*args, **kwargs) def forward_oot(self, *args, **kwargs): # By default, we assume that OOT ops are compatible with the @@ -86,6 +91,8 @@ def dispatch_forward(self): return self.forward_tpu elif current_platform.is_xpu(): return self.forward_xpu + elif current_platform.is_neuron(): + return self.forward_neuron elif current_platform.is_out_of_tree(): return self.forward_oot else: diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index fb9684ac1c184..2979e69b01f67 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -87,6 +87,19 @@ def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: out = torch.empty(output_shape, dtype=x.dtype, device=x.device) self.op(out, x) return out + + def forward_neuron(self, x: torch.Tensor) -> torch.Tensor: + # TODO(gnovack) - clean this up + d = x.shape[-1] // 2 + if len(x.shape) == 3: + s = x[:, :, :d] * torch.nn.functional.sigmoid(x[:, :, :d]) + return s * x[:, :, d:] + elif len(x.shape) == 2: + s = x[:, :d] * torch.nn.functional.sigmoid(x[:, :d]) + return s * x[ :, d:] + else: + raise NotImplementedError("Expected input to have either 3 or 2 dims") + @CustomOp.register("mul_and_silu") diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 42decde1d0f79..bbec2d0fd2685 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -47,6 +47,7 @@ def __init__(self, parallel_config = get_current_vllm_config().parallel_config self.use_all_gather = current_platform.is_tpu() \ + or current_platform.is_neuron() \ or envs.VLLM_USE_V1 \ or parallel_config.distributed_executor_backend == "external_launcher" # noqa @@ -104,7 +105,8 @@ def _get_logits( logits = tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). 
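# Illustrative sketch (toy shapes): the Neuron silu_and_mul fallback added in
# activation.py above computes x1 * sigmoid(x1) * x2 on the two halves of the last
# dimension, which matches the native semantics silu(x[..., :d]) * x[..., d:].
import torch

x = torch.randn(2, 5, 8)                   # (batch, seq, 2 * intermediate_size)
d = x.shape[-1] // 2
ref = torch.nn.functional.silu(x[..., :d]) * x[..., d:]
neuron_style = (x[..., :d] * torch.sigmoid(x[..., :d])) * x[..., d:]
assert torch.allclose(ref, neuron_style)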
if logits is not None: - logits = logits[..., :self.org_vocab_size] + # logits = logits[..., :self.org_vocab_size] + logits = logits[:, :self.org_vocab_size] return logits def extra_repr(self) -> str: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d071cfe888f05..b6e640f83971c 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -254,6 +254,48 @@ def extra_repr(self) -> str: s += f", max_position_embeddings={self.max_position_embeddings}" s += f", base={self.base}, is_neox_style={self.is_neox_style}" return s + + def forward_neuron( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + from torch_xla.core import xla_model as xm + + # TODO(gnovack) - handle edge cases + if offsets is not None: + positions = positions + offsets + + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + + positions = positions.flatten() + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions) + cos, sin = cos_sin.chunk(2, dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + + if self.rotary_dim == self.head_size: + query = _apply_rotary_emb(query, cos, sin, self.is_neox_style) + query = query.reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + + if self.rotary_dim == self.head_size: + key = _apply_rotary_emb(key, cos, sin, self.is_neox_style) + key = key.reshape(key_shape) + else: + key_pass = key[..., self.rotary_dim:] + key_rot = key[..., :self.rotary_dim] + key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key class LinearScalingRotaryEmbedding(RotaryEmbedding): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e214c30f5d60b..c3942a2964c2b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -44,6 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -196,7 +197,15 @@ def forward( attn_metadata: AttentionMetadata, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # TODO(gnovack) - Figure out a better way to streamline QKV splitting + if current_platform.is_neuron(): + q = qkv[:, :, :self.q_size] + k = qkv[:, :, self.q_size:self.q_size+self.kv_size] + v = qkv[:, :, self.q_size+self.kv_size:self.q_size + (2*self.kv_size)] + else: + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.o_proj(attn_output) @@ -545,6 +554,10 @@ def compute_logits( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: + # TODO(gnovack) - compute logits on-device + if current_platform.is_neuron(): + self.lm_head = self.lm_head.cpu() + logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits diff --git 
a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index ddbdc43ca5710..c01c45c241018 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -79,7 +79,6 @@ def hpu_platform_plugin() -> Optional[str]: return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None - def xpu_platform_plugin() -> Optional[str]: is_xpu = False @@ -114,7 +113,7 @@ def cpu_platform_plugin() -> Optional[str]: def neuron_platform_plugin() -> Optional[str]: is_neuron = False try: - import transformers_neuronx # noqa: F401 + import neuronx_distributed # noqa: F401 is_neuron = True except ImportError: pass diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index f2ecec3203fb7..0aedfee4575c2 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -35,6 +35,7 @@ class _Backend(enum.Enum): IPEX = enum.auto() BLOCK_SPARSE_FLASH_ATTN = enum.auto() NO_ATTENTION = enum.auto() + NEURON = enum.auto() class PlatformEnum(enum.Enum): diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 23a7126fb05cf..5778107ba0535 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -1,8 +1,11 @@ from typing import TYPE_CHECKING, Optional +import torch + from vllm.logger import init_logger +from .interface import _Backend, Platform, PlatformEnum -from .interface import Platform, PlatformEnum +logger = init_logger(__name__) if TYPE_CHECKING: from vllm.config import VllmConfig @@ -17,6 +20,7 @@ class NeuronPlatform(Platform): device_name: str = "neuron" device_type: str = "neuron" ray_device_key: str = "neuron_cores" + dispatch_key: str = "XLA" supported_quantization: list[str] = ["neuron_quant"] device_control_env_var: str = "NEURON_RT_VISIBLE_CORES" @@ -28,6 +32,21 @@ def get_device_name(cls, device_id: int = 0) -> str: def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return False + @classmethod + def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + if selected_backend != _Backend.NEURON: + logger.info("Cannot use %s backend on Neuron.", selected_backend) + return _Backend.NEURON + + @classmethod + def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, + dtype: torch.dtype, kv_cache_dtype: Optional[str], + block_size: int, use_v1: bool) -> str: + if not use_v1: + logger.info("Neuron backend is only supported in V1") + logger.info("Using Pallas backend.") + return "vllm.v1.attention.backends.neuron_attn.NeuronAttentionBackend" + @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config = vllm_config.parallel_config @@ -35,19 +54,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.worker_cls = \ "vllm.worker.neuron_worker.NeuronWorker" - if parallel_config.world_size > 1: - parallel_config.distributed_executor_backend = "uni" assert (vllm_config.lora_config is None), "LoRA is not supported for Neuron backend." assert (not vllm_config.speculative_config ), "Speculative decoding not yet supported for Neuron backend." 
- cache_config = vllm_config.cache_config - if cache_config: - # neuron needs block_size = max_model_len - vllm_config.cache_config.block_size = \ - vllm_config.model_config.max_model_len @classmethod def is_pin_memory_available(cls) -> bool: diff --git a/vllm/v1/attention/backends/neuron_attn.py b/vllm/v1/attention/backends/neuron_attn.py new file mode 100644 index 0000000000000..c2dd3fdcbf1e1 --- /dev/null +++ b/vllm/v1/attention/backends/neuron_attn.py @@ -0,0 +1,202 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch +from vllm.attention.backends.abstract import AttentionBackend, AttentionImpl, AttentionMetadataBuilder, AttentionType +from vllm.attention.backends.utils import CommonAttentionState + + +@torch.library.custom_op("mylib::neuron_paged_attn", mutates_args=()) +def neuron_paged_attn( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_table: torch.Tensor, + attn_mask: torch.Tensor, + n_kv_head: int = None, + head_size: int = None, + B_P_SIZE: int = 128, + LARGE_TILE_SZ: int = 2048, + return_debug_tensors: bool = False, + mixed_precision: bool = True, +) -> torch.Tensor: + from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc + output_nki = flash_attn_varlen_nkifunc( + query, + key, + value, + key_cache, + value_cache, + block_table, + attn_mask, + n_kv_head, + head_size, + B_P_SIZE, + LARGE_TILE_SZ, + return_debug_tensors, + mixed_precision, + ) + return torch.tensor(output_nki) + +@neuron_paged_attn.register_fake +def _( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_table: torch.Tensor, + attn_mask: torch.Tensor, + n_kv_head: int = None, + head_size: int = None, + B_P_SIZE: int = 128, + LARGE_TILE_SZ: int = 2048, + return_debug_tensors: bool = False, + mixed_precision: bool = True, +) -> torch.Tensor: + return torch.empty_like(query.transpose(-2, -1)) + + +class NeuronAttentionBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + return "NEURON" + + @staticmethod + def get_impl_cls() -> Type["NeuronAttentionBackendImpl"]: + return NeuronAttentionBackendImpl + + @staticmethod + def get_metadata_cls() -> Type["NeuronAttentionMetadata"]: + return NeuronAttentionMetadata + + @staticmethod + def get_state_cls() -> Type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_builder_cls() -> Type["NeuronAttentionMetadataBuilder"]: + return NeuronAttentionMetadataBuilder + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (2, num_blocks, block_size, num_kv_heads, head_size) + +@dataclass +class NeuronAttentionMetadata: + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + num_actual_tokens: int # Number of tokens excluding padding. 
+ max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_start_loc: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + num_active_blocks: int + active_block_table: torch.Tensor + attn_mask: torch.Tensor + num_input_tokens: int = 0 # Number of tokens including padding. + +class NeuronAttentionMetadataBuilder(AttentionMetadataBuilder[NeuronAttentionMetadata]): + ... + +class NeuronAttentionBackendImpl(AttentionImpl[NeuronAttentionMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + kv_cache_dtype: str = "auto", + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.num_kv_heads = num_kv_heads + self.scale = scale + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + @torch.inference_mode() + def forward( + self, + layer: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: NeuronAttentionMetadata, + k_scale: float = 1.0, + v_scale: float = 1.0, + attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + torch.ops.xla.dynamo_set_buffer_donor_(kv_cache, True) + k_cache = kv_cache[0] + v_cache = kv_cache[1] + + num_tokens = query.shape[1] + query = query.view(num_tokens, self.num_heads, self.head_size) + key = key.view(num_tokens, self.num_kv_heads, self.head_size) + value = value.view(num_tokens, self.num_kv_heads, self.head_size) + + if kv_cache[0].numel() > 0: + slot_mapping = attn_metadata.slot_mapping + write_to_kv_cache(key, value, k_cache, v_cache, slot_mapping) + + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + key = key.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + value = value.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + + input_args = ( + query, + key, + value, + k_cache, + v_cache, + attn_metadata.active_block_table, + attn_metadata.attn_mask, + ) + input_kwargs = dict( + n_kv_head=self.num_kv_heads, + head_size=self.head_size, + mixed_precision=False, + ) + output = neuron_paged_attn(*input_args, **input_kwargs) + output = output.transpose(1,2).reshape(1, num_tokens, self.num_heads * self.head_size) + return output + + +def write_to_kv_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, +) -> None: + + key_cache = key_cache.flatten(0, 1) + value_cache = value_cache.flatten(0, 1) + + key_cache.index_copy_(0, slot_mapping, key) + value_cache.index_copy_(0, slot_mapping, value) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a8cf0aec3f17b..7ae5f56494d49 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -20,7 +20,6 @@ from vllm.v1.core.scheduler import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.worker.gpu_model_runner import GPUModelRunner logger = init_logger(__name__) @@ -126,6 +125,7 @@ def init_device(self): set_random_seed(self.model_config.seed) # Construct the model runner + from vllm.v1.worker.gpu_model_runner import GPUModelRunner self.model_runner = GPUModelRunner(self.vllm_config, self.device) def 
load_model(self) -> None: diff --git a/vllm/v1/worker/neuron_model_runner.py b/vllm/v1/worker/neuron_model_runner.py new file mode 100644 index 0000000000000..e2ee443f8e074 --- /dev/null +++ b/vllm/v1/worker/neuron_model_runner.py @@ -0,0 +1,833 @@ +import gc +import time +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple + +import numpy as np +import torch +import torch.distributed +import torch.nn as nn + +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention +from vllm.config import CompilationLevel, VllmConfig +from vllm.distributed.parallel_state import graph_capture +from vllm.forward_context import set_forward_context +from vllm.inputs import INPUT_REGISTRY, InputRegistry +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.multimodal import MultiModalKwargs +from vllm.sampling_params import SamplingType +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, + LayerBlockType, cdiv, is_pin_memory_available) +from vllm.v1.attention.backends.neuron_attn import NeuronAttentionBackend, NeuronAttentionMetadata +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.utils import bind_kv_cache +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheSpec) + +if TYPE_CHECKING: + from vllm.v1.core.scheduler import SchedulerOutput + +logger = init_logger(__name__) + + +B_P_SIZE = 128 +LARGE_TILE_SZ = 2048 + + +class NeuronModelRunner: + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + input_registry: InputRegistry = INPUT_REGISTRY, + ): + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + + model_config = self.model_config + cache_config = self.cache_config + scheduler_config = self.scheduler_config + parallel_config = self.parallel_config + self.device = device + self.pin_memory = False + self.dtype = self.model_config.dtype + if cache_config.cache_dtype == "auto": + self.kv_cache_dtype = self.dtype + else: + self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + cache_config.cache_dtype] + + self.is_multimodal_model = model_config.is_multimodal_model + self.sliding_window = model_config.get_sliding_window() + self.block_size = cache_config.block_size + self.max_model_len = model_config.max_model_len + self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) + self.max_num_tokens = scheduler_config.max_num_batched_tokens + + # Model-related. 
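# Illustrative arithmetic, with values assumed from serve.sh in this patch
# (--max-model-len 4096, --block-size 128): the block-table sizing just above gives
# ceil(4096 / 128) = 32 block slots per request.
def _ceil_div(a: int, b: int) -> int:    # same behaviour as vllm.utils.cdiv
    return -(-a // b)

assert _ceil_div(4096, 128) == 32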
+ self.num_attn_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) + self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + self.head_size = model_config.get_head_size() + self.hidden_size = model_config.get_hidden_size() + + # Multi-modal data support + self.input_registry = input_registry + + # Lazy initialization + self.model: nn.Module # Set after load_model + self.kv_caches: List[torch.Tensor] = [] + # req_id -> (input_id -> encoder_output) + self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} + + # Request states. + self.requests: Dict[str, CachedRequestState] = {} + # Persistent batch. + self.input_batch = InputBatch( + max_num_reqs=self.scheduler_config.max_num_seqs, + max_model_len=self.max_model_len, + max_num_blocks_per_req=self.max_num_blocks_per_req, + device="cpu", + pin_memory=self.pin_memory, + vocab_size=model_config.get_vocab_size(), + ) + + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device="cpu") + self.positions = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device="cpu") + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device="cpu") + + # TODO(gnovack) - use compile sizes... + self.neuron_compilation_batch_sizes = list(reversed(self.vllm_config.compilation_config.cudagraph_capture_sizes)) + + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: + # Remove stopped requests from the cached states. + # Keep the states of the pre-empted requests. + for req_id in scheduler_output.finished_req_ids: + self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + + # Free the cached encoder outputs. + for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) + + # Remove the requests from the persistent batch. + stopped_req_ids = set().union( + scheduler_output.preempted_req_ids, + scheduler_output.finished_req_ids, + ) + removed_req_indices: List[int] = [] + for req_id in stopped_req_ids: + req_index = self.input_batch.remove_request(req_id) + if req_index is not None: + removed_req_indices.append(req_index) + + # Update the states of the running requests. + for req_data in scheduler_output.scheduled_running_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + req_index = self.input_batch.req_id_to_index[req_id] + + # Update the num_computed_tokens. + req_state.num_computed_tokens = req_data.num_computed_tokens + self.input_batch.num_computed_tokens_cpu[req_index] = ( + req_data.num_computed_tokens) + + # Update the block table. + num_new_blocks = len(req_data.new_block_ids) + if num_new_blocks == 0: + continue + start_index = len(req_state.block_ids) + end_index = start_index + num_new_blocks + req_state.block_ids.extend(req_data.new_block_ids) + self.input_batch.block_table.append_row(req_index, start_index, + req_data.new_block_ids) + + req_ids_to_add: List[str] = [] + # Add new requests to the cached states. 
+ for req_data in scheduler_output.scheduled_new_reqs: + req_id = req_data.req_id + sampling_params = req_data.sampling_params + if sampling_params.sampling_type == SamplingType.RANDOM_SEED: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=req_data.prompt_token_ids, + prompt=req_data.prompt, + mm_inputs=req_data.mm_inputs, + mm_positions=req_data.mm_positions, + sampling_params=sampling_params, + generator=generator, + block_ids=req_data.block_ids, + num_computed_tokens=req_data.num_computed_tokens, + output_token_ids=[], + ) + req_ids_to_add.append(req_id) + + # Update the cached states of the resumed requests. + for req_data in scheduler_output.scheduled_resumed_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + + req_state.block_ids = req_data.block_ids + req_state.num_computed_tokens = req_data.num_computed_tokens + req_ids_to_add.append(req_id) + + # Add the new or resumed requests to the persistent batch. + # The smaller empty indices are filled first. + removed_req_indices = sorted(removed_req_indices, reverse=True) + for req_id in req_ids_to_add: + req_state = self.requests[req_id] + if removed_req_indices: + # Fill the empty index. + req_index = removed_req_indices.pop() + else: + # Append to the end. + req_index = None + self.input_batch.add_request(req_state, req_index) + + # Condense the batched states if there are empty indices. + if removed_req_indices: + self.input_batch.condense(removed_req_indices) + + def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + assert total_num_scheduled_tokens > 0 + num_reqs = self.input_batch.num_reqs + assert num_reqs > 0 + + self.input_batch.block_table.commit(num_reqs) + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. + num_scheduled_tokens = [] + max_num_scheduled_tokens = 0 + for req_id in self.input_batch.req_ids[:num_reqs]: + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_scheduled_tokens.append(num_tokens) + max_num_scheduled_tokens = max(max_num_scheduled_tokens, + num_tokens) + num_scheduled_tokens = np.array(num_scheduled_tokens, dtype=np.int32) + assert max_num_scheduled_tokens > 0 + + # Get request indices. + # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] + indices = np.arange(num_reqs) + req_indices = np.repeat(indices, num_scheduled_tokens) + + # Get batched arange. + # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange_matrix = np.tile(np.arange(max_num_scheduled_tokens), + (num_reqs, 1)) + mask = arange_matrix < num_scheduled_tokens[:, np.newaxis] + arange = arange_matrix[mask] + + # Get positions. + positions = torch.empty((total_num_scheduled_tokens, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + positions_np = positions.numpy() + np.add(self.input_batch.num_computed_tokens_cpu[req_indices], + arange, + out=positions_np) + + # Get token indices. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] + # where M is the max_model_len. 
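# Worked numeric example (toy values) of the index math described in the comments
# above, using the same [2, 5, 3] schedule and M = 16 for illustration;
# num_computed_tokens is taken as zero for every request.
import numpy as np

num_scheduled = np.array([2, 5, 3], dtype=np.int32)
num_reqs, M = len(num_scheduled), 16

req_indices = np.repeat(np.arange(num_reqs), num_scheduled)
# -> [0 0 1 1 1 1 1 2 2 2]

arange_matrix = np.tile(np.arange(num_scheduled.max()), (num_reqs, 1))
arange = arange_matrix[arange_matrix < num_scheduled[:, np.newaxis]]
# -> [0 1 0 1 2 3 4 0 1 2]

# With zero computed tokens, positions == arange, so the flattened lookup into the
# (num_reqs, M) token-id table is:
token_indices = arange + req_indices * M
# -> [0 1 16 17 18 19 20 32 33 34]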
+ token_indices = (positions_np + + req_indices * self.input_batch.token_ids_cpu.shape[1]) + token_indices = torch.from_numpy(token_indices) + input_ids = torch.empty((total_num_scheduled_tokens, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + torch.index_select(torch.from_numpy( + self.input_batch.token_ids_cpu).flatten(), + 0, + token_indices, + out=input_ids) + + # Calculate the slot mapping. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` here + # because M (max_model_len) is not necessarily divisible by block_size. + block_numbers = self.input_batch.block_table.get_cpu_tensor().flatten()[ + req_indices * self.max_num_blocks_per_req + + positions_np // self.block_size] + block_offsets = torch.from_numpy(positions_np % self.block_size) + slot_mapping = torch.empty((total_num_scheduled_tokens, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + torch.add(block_numbers * self.block_size, + block_offsets, + out=slot_mapping) + + _PAD_SLOT_ID = self.num_blocks * self.block_size + padded_num_tokens = self._get_padded_batch_size(total_num_scheduled_tokens) + slot_mapping_pad_length = padded_num_tokens - slot_mapping.shape[0] + slot_mapping = torch.nn.functional.pad( + slot_mapping, + (0, slot_mapping_pad_length), + 'constant', + _PAD_SLOT_ID + ) + + # Prepare the attention metadata. + query_start_loc = torch.empty((num_reqs + 1, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + query_start_loc_np = query_start_loc.numpy() + query_start_loc_np[0] = 0 + np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:]) + + seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) + max_seq_len = seq_lens.max() + seq_start_loc = torch.empty((num_reqs + 1, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + seq_start_loc_np = seq_start_loc.numpy() + seq_start_loc_np[0] = 0 + np.cumsum(seq_lens, out=seq_start_loc_np[1:]) + + self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, + non_blocking=True) + self.positions[:total_num_scheduled_tokens].copy_(positions, + non_blocking=True) + + seq_lens = torch.diff(seq_start_loc) + query_lens = torch.diff(query_start_loc) + context_lens = seq_lens - query_lens + num_active_blocks_shifted = shift_bit_length( + ((context_lens+ self.block_size - 1) // self.block_size).sum().item() + ) + num_active_blocks_factor = max(LARGE_TILE_SZ // self.block_size // num_active_blocks_shifted, 1) + num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor + assert (num_active_blocks * self.block_size) % LARGE_TILE_SZ == 0, "invalid {num_active_blocks=}" + + context_kv_len = num_active_blocks * self.block_size + + + block_table = self.input_batch.block_table.get_cpu_tensor()[:num_reqs] + active_block_table = get_active_block_tables( + block_table, + torch.tensor(query_lens), + torch.tensor(seq_lens), + self.block_size, + num_active_blocks, + ) + + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens=query_lens.tolist(), seq_lens=seq_lens.tolist(), block_size=self.block_size + ) + ) + + attn_mask = torch.concat( + [ + nn.functional.pad( + prior_mask, + ( + 0, + max(context_kv_len, LARGE_TILE_SZ) - prior_mask.shape[1], + 0, + B_P_SIZE - prior_mask.shape[0], + ), + "constant", + 0, + ).bool(), + nn.functional.pad( 
+ active_mask, + ( + 0, + padded_num_tokens - active_mask.shape[1], + 0, + B_P_SIZE - active_mask.shape[0], + ), + "constant", + 0, + ).bool(), + ], + dim=1, + ) + + logits_indices = query_start_loc[1:] - 1 + query_start_loc = query_start_loc.to(self.device, non_blocking=True) + seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) + slot_mapping = slot_mapping.long().to(self.device, non_blocking=True) + active_block_table = active_block_table.to(torch.int32).to(self.device, non_blocking=True) + attn_mask = attn_mask.to(self.device) + attn_metadata = NeuronAttentionMetadata( + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + query_start_loc=query_start_loc, + max_seq_len=max_seq_len, + seq_start_loc=seq_start_loc, + block_table=self.input_batch.block_table.get_device_tensor()[:num_reqs], + slot_mapping=slot_mapping, + num_active_blocks=num_active_blocks, + active_block_table=active_block_table, + attn_mask=attn_mask + ) + # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial + # request in the batch. While we should not sample any token from this + # partial request, we do so for simplicity. We will ignore the sampled + # token from the partial request. + # TODO: Support prompt logprobs. + return attn_metadata, logits_indices + + def _prepare_sampling( + self, + scheduler_output: "SchedulerOutput", + ) -> SamplingMetadata: + skip_copy = True + if (scheduler_output.finished_req_ids + or scheduler_output.preempted_req_ids): + skip_copy = False + if (scheduler_output.scheduled_new_reqs + or scheduler_output.scheduled_resumed_reqs): + skip_copy = False + # Create the sampling metadata. + req_id_output_token_ids: Dict[str, List[int]] = \ + {req_id: req.output_token_ids \ + for req_id, req in self.requests.items()} + + sampling_metadata = self.input_batch.make_sampling_metadata(req_id_output_token_ids, skip_copy) + return sampling_metadata + + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. + mm_inputs: List[MultiModalKwargs] = [] + req_input_ids: List[Tuple[int, int]] = [] + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + for input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[input_id]) + req_input_ids.append((req_id, input_id)) + batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `encoder_outputs` is either of the following: + # 1. A tensor of shape [num_images, feature_size, hidden_size] + # in case when feature_size is fixed across all images. + # 2. A list (length: num_images) of tensors, each of shape + # [feature_size, hidden_size] in case when the feature size is + # dynamic depending on input images. + encoder_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) + + # Cache the encoder outputs. 
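# Illustrative shapes (made-up sizes) for the two encoder-output layouts described
# above: a single batched tensor when every image has the same feature length, or a
# list of per-image tensors when it varies.
import torch

fixed = torch.zeros(2, 576, 4096)             # [num_images, feature_size, hidden]
dynamic = [torch.zeros(576, 4096),            # per-image [feature_size, hidden]
           torch.zeros(300, 4096)]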
+ for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + self.encoder_cache[req_id][input_id] = output + + def _gather_encoder_outputs( + self, + scheduler_output: "SchedulerOutput", + ) -> List[torch.Tensor]: + encoder_outputs: List[torch.Tensor] = [] + num_reqs = self.input_batch.num_reqs + for req_id in self.input_batch.req_ids[:num_reqs]: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens) + assert start_idx < end_idx + assert req_id in self.encoder_cache + assert i in self.encoder_cache[req_id] + encoder_output = self.encoder_cache[req_id][i] + encoder_outputs.append(encoder_output[start_idx:end_idx]) + return encoder_outputs + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> ModelRunnerOutput: + self._update_states(scheduler_output) + + if self.is_multimodal_model: + # Run the multimodal encoder if any. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + else: + encoder_outputs = [] + + # Prepare the decoder inputs. + attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + num_input_tokens = self._get_padded_batch_size(num_scheduled_tokens) + + attn_metadata.num_input_tokens = num_input_tokens + + if self.is_multimodal_model: + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + input_ids = self.input_ids[:num_scheduled_tokens] + if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None + else: + # For text-only models, we use token ids as input. + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None + + # Run the decoder. + # Use persistent buffers for CUDA graphs. 
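# Toy-value illustration of the logits_indices produced by _prepare_inputs above
# (query_start_loc[1:] - 1): with three requests scheduling [2, 5, 3] tokens, only
# the last token of each request is fed to compute_logits below.
import torch

num_scheduled = torch.tensor([2, 5, 3])
query_start_loc = torch.zeros(len(num_scheduled) + 1, dtype=torch.long)
query_start_loc[1:] = torch.cumsum(num_scheduled, dim=0)     # [0, 2, 7, 10]
logits_indices = query_start_loc[1:] - 1                     # [1, 6, 9]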
+ with set_forward_context(attn_metadata, self.vllm_config): + hidden_states = self.model( + input_ids=input_ids.unsqueeze(0).to(self.device), + positions=self.positions[:num_input_tokens].unsqueeze(0).to(self.device), + kv_caches=self.kv_caches, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds.to(self.device) if inputs_embeds is not None else None, + ).cpu() + hidden_states = hidden_states[0, :num_scheduled_tokens] + hidden_states = hidden_states[logits_indices.cpu()] + logits = self.model.compute_logits(hidden_states, None) + + # Sample the next token and get logprobs if needed. + sampling_metadata = self._prepare_sampling(scheduler_output) + sampler_output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + + sampled_token_ids = sampler_output.sampled_token_ids + # TODO(woosuk): The following loop can be slow since it iterates over + # the requests one by one. Optimize. + num_reqs = self.input_batch.num_reqs + for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + req_state = self.requests[req_id] + seq_len = (req_state.num_computed_tokens + + scheduler_output.num_scheduled_tokens[req_id]) + assert seq_len <= req_state.num_tokens + if seq_len == req_state.num_tokens: + # Append the sampled token to the output token ids. + token_id = sampled_token_ids[i] + self.input_batch.token_ids_cpu[i, seq_len] = token_id + req_state.output_token_ids.append(token_id) + else: + # Ignore the sampled token from the partial request. + # Rewind the generator state as if the token was not sampled. + generator = self.input_batch.generators.get(i) + if generator is not None: + # This relies on cuda-specific torch-internal impl details + generator.set_offset(generator.get_offset() - 4) + + if sampler_output.logprob_token_ids is None: + logprob_token_ids = None + else: + logprob_token_ids = sampler_output.logprob_token_ids.cpu() + if sampler_output.logprobs is None: + logprobs = None + else: + logprobs = sampler_output.logprobs.cpu() + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids[:num_reqs], + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprob_token_ids_cpu=logprob_token_ids, + logprobs_cpu=logprobs, + ) + return model_runner_output + + def load_model(self) -> None: + # TODO(gnovack) - Add memory profiler during model load + with torch.inference_mode(): + logger.info("Starting to load model %s...", self.model_config.model) + model = get_model(vllm_config=self.vllm_config).eval().to(self.device) + self.model = torch.compile(model, backend="openxla", fullgraph=True, dynamic=False) + + + @torch.inference_mode() + def _dummy_run( + self, + model: nn.Module, + num_tokens: int, + kv_caches: List[torch.Tensor], + ) -> torch.Tensor: + + num_active_blocks_shifted = shift_bit_length( + ((self.block_size - 1) // self.block_size) + ) + num_active_blocks_factor = (LARGE_TILE_SZ // self.block_size // num_active_blocks_shifted) + num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor + block_table = torch.arange((num_tokens // self.block_size) + 1).unsqueeze(0) + active_block_table = get_active_block_tables( + block_table, + torch.tensor([num_tokens]), + torch.tensor([num_tokens]), + self.block_size, + num_active_blocks, + ) + + attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens=[num_tokens], seq_lens=[num_tokens] + ) + attn_mask = nn.functional.pad( + attn_mask, + ( + 0, + LARGE_TILE_SZ + num_tokens - attn_mask.shape[1], + 0, + B_P_SIZE - 
attn_mask.shape[0], + ), + "constant", + 0, + ).bool() + + attn_metadata = NeuronAttentionMetadata( + num_actual_tokens=num_tokens, + max_query_len=num_tokens, + query_start_loc=torch.tensor([0, num_tokens-1]).to(self.device, non_blocking=True), + max_seq_len=num_tokens, + seq_start_loc=torch.tensor([0, num_tokens-1]).to(self.device, non_blocking=True), + block_table=block_table, + slot_mapping=torch.arange(0, num_tokens).long().to(self.device, non_blocking=True), + num_active_blocks=num_active_blocks, + active_block_table=active_block_table.to(torch.int32).to(self.device, non_blocking=True), + attn_mask=attn_mask.to(self.device, non_blocking=True) + ) + + if self.is_multimodal_model: + input_ids = None + inputs_embeds = self.inputs_embeds[:num_tokens] + else: + input_ids = self.input_ids[:num_tokens] + inputs_embeds = None + with set_forward_context(attn_metadata, self.vllm_config): + hidden_states = model( + input_ids=input_ids.unsqueeze(0).to(self.device), + positions=self.positions[:num_tokens].unsqueeze(0).to(self.device), + kv_caches=kv_caches, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds.to(self.device) if inputs_embeds is not None else None, + ) + return hidden_states + + def profile_run(self) -> None: + # TODO(gnovack): implement profiling run for neuron + ... + + def capture_model(self) -> None: + + start_time = time.perf_counter() + + # Trigger Neuron compilation for specific shapes + for num_tokens in reversed(self.neuron_compilation_batch_sizes): + self._dummy_run(self.model, num_tokens, self.kv_caches) + + end_time = time.perf_counter() + elapsed_time = end_time - start_time + logger.info("Neuron compilation finished in %.0f secs", elapsed_time) + + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: + assert len(self.kv_caches) == 0 + self.num_blocks = kv_cache_config.num_blocks + + kv_caches: Dict[str, torch.Tensor] = {} + + with torch.inference_mode(): + kv_cache_shape = NeuronAttentionBackend.get_kv_cache_shape( + self.num_blocks + 1, self.block_size, self.num_kv_heads, self.head_size) + for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): + cache = torch.zeros(kv_cache_shape, + dtype=self.kv_cache_dtype, + device='cpu') + kv_caches[layer_name] = cache.to(self.device) + + bind_kv_cache( + kv_caches, + self.vllm_config.compilation_config.static_forward_context, + self.kv_caches) + + def _get_padded_batch_size(self, batch_size: int) -> Optional[int]: + # TODO: Optimize this? + for size in self.neuron_compilation_batch_sizes: + if batch_size <= size: + return size + return None + + def get_kv_cache_spec(self) -> KVCacheSpec: + """ + Generates the KVCacheSpec by parsing the kv cache format from each + Attention module in the static forward context. + Returns: + KVCacheSpec: A dictionary mapping layer names to their KV cache + format. Layers that do not need KV cache are not included. + """ + + forward_ctx = self.vllm_config.compilation_config.static_forward_context + block_size = self.vllm_config.cache_config.block_size + kv_cache_spec: KVCacheSpec = {} + for layer_name, attn_module in forward_ctx.items(): + # TODO: Support other attention modules, e.g., sliding window, + # cross-attention, MLA. 
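# Back-of-the-envelope page size (illustrative, assuming llama-3.1-8B-style values:
# 8 KV heads, head_size 128, bfloat16, block_size 128) for each FullAttentionSpec
# entry built below; the leading factor 2 accounts for the K and V planes.
block_size, num_kv_heads, head_size, dtype_bytes = 128, 8, 128, 2
page_bytes = 2 * block_size * num_kv_heads * head_size * dtype_bytes
assert page_bytes == 512 * 1024              # 512 KiB per block, per layer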
+            assert isinstance(attn_module, Attention)
+            if attn_module.attn_type == AttentionType.DECODER:
+                kv_cache_spec[layer_name] = FullAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=attn_module.num_kv_heads,
+                    head_size=attn_module.head_size,
+                    dtype=attn_module.dtype,
+                )
+            else:
+                raise NotImplementedError
+        return kv_cache_spec
+
+
+def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
+                            num_blocks):
+    # Collect the block ids that hold each sequence's context and zero-pad
+    # the flattened list up to num_blocks.
+    context_lens = seq_lens - query_lens
+    blocks_per_seq = (context_lens + block_size - 1) // block_size
+    num_seqs = len(seq_lens)
+    active_blocks: list[int] = []
+    for seq_id in range(num_seqs):
+        active_blocks = (
+            active_blocks +
+            block_tables[seq_id, :blocks_per_seq[seq_id]].tolist())
+    return nn.functional.pad(
+        torch.tensor(active_blocks),
+        (0, num_blocks - len(active_blocks)),
+        "constant",
+        0,
+    )
+
+
+class BlockDiagonalCausalFromBottomRightMask:
+    # Builds block-diagonal causal masks (one block per sequence, causally
+    # aligned to the bottom-right of each block): a prior mask over the
+    # context tokens and, when block_size is given, an active mask over the
+    # query tokens.
+
+    @staticmethod
+    def _from_seqlens(query_lens, seq_lens, block_size=None):
+        from torch import logical_and, logical_or
+
+        contexted = block_size is None
+        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
+        n_queries = sum(query_lens)
+        num_seqs = len(query_lens)
+        if contexted:
+            key_lens_blockaligned = seq_lens
+        else:
+            n_blocks_per_seq = (context_lens + block_size - 1) // block_size
+            offset_per_seq = n_blocks_per_seq * block_size
+            key_lens_blockaligned = offset_per_seq[:num_seqs].tolist()
+        n_keys = sum(key_lens_blockaligned)
+
+        a = (torch.arange(n_queries).reshape(n_queries,
+                                             1).expand(n_queries, n_keys))
+        b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys)
+        q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0)
+        k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0)
+
+        prior_mask = torch.zeros(n_queries, n_keys)
+        new_masks: list[torch.Tensor] = []
+        for seq_id in range(num_seqs):
+            ri = q_cumsum[seq_id]
+            ci = k_cumsum[seq_id]
+            nr = query_lens[seq_id]
+
+            if contexted:
+                nc = seq_lens[seq_id]
+                a_offset = ci + nc - ri - nr
+                new_mask = (a + a_offset) >= b
+            else:
+                nc = context_lens[seq_id]
+                a_offset = ci + nc - 1
+                new_mask = a_offset >= b
+
+            left_mask = b >= ci
+            top_mask = a >= ri
+            bottom_mask = a < (ri + nr)
+
+            new_mask = logical_and(
+                logical_and(logical_and(new_mask, left_mask), top_mask),
+                bottom_mask,
+            )
+            prior_mask = logical_or(prior_mask, new_mask)
+            new_masks = new_masks + [new_mask]
+        return prior_mask
+
+    @staticmethod
+    def from_seqlens(query_lens, seq_lens, block_size=None):
+        contexted = block_size is None
+        if contexted:
+            prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
+                query_lens, seq_lens)
+            active_mask = None
+        else:
+            prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
+                query_lens, seq_lens, block_size)
+            active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
+                query_lens, query_lens)
+        return prior_mask, active_mask
+
+def shift_bit_length(x):
+    # Round x up to the next power of two (assumes x >= 1).
+    return 1 << (x - 1).bit_length()
\ No newline at end of file
diff --git a/vllm/v1/worker/neuron_worker.py b/vllm/v1/worker/neuron_worker.py
new file mode 100644
index 0000000000000..84e0717fbfe96
--- /dev/null
+++ b/vllm/v1/worker/neuron_worker.py
@@ -0,0 +1,77 @@
+"""A Neuron worker class."""
+import os
+from typing import TYPE_CHECKING, Optional, Tuple
+
+import torch
+import torch.distributed
+import torch_xla.core.xla_model as xm
+import torch_xla.runtime as xr
+from torch_xla._internal.pjrt import initialize_multiprocess
+
+from vllm.config import ParallelConfig
+from vllm.distributed import (ensure_model_parallel_initialized,
+                              init_distributed_environment,
+                              set_custom_all_reduce)
+from vllm.logger import init_logger
+from vllm.model_executor import set_random_seed
+from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.worker.gpu_worker import Worker
+from vllm.v1.worker.neuron_model_runner import NeuronModelRunner
+
+logger = init_logger(__name__)
+
+if TYPE_CHECKING:
+    from vllm.v1.core.scheduler import SchedulerOutput
+
+
+class NeuronWorker(Worker):
+
+    @torch.inference_mode()
+    def determine_available_memory(self) -> int:
+        # Hard-coded placeholder (~6 GB); Neuron memory profiling is not
+        # implemented yet.
+        return int(6e9)
+
+    def init_device(self):
+        if self.device_config.device.type == "cpu":
+
+            # Initialize the distributed environment.
+            init_worker_distributed_environment(self.parallel_config, self.rank,
+                                                self.distributed_init_method,
+                                                self.local_rank)
+
+            self.device = xm.xla_device()
+        else:
+            raise RuntimeError(
+                f"Unsupported device type: {self.device_config.device}")
+
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+
+        # Construct the model runner.
+        with torch.inference_mode():
+            self.model_runner = NeuronModelRunner(self.vllm_config, self.device)
+
+    def compile_or_warm_up_model(self):
+        # TODO: Implement AOT compilation logic here...
+        self.model_runner.capture_model()
+
+    def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
+        # TODO(gnovack) - validate num_device_blocks
+        self.model_runner.initialize_kv_cache(kv_cache_config)
+
+
+def init_worker_distributed_environment(
+    parallel_config: ParallelConfig,
+    rank: int,
+    distributed_init_method: Optional[str] = None,
+    local_rank: int = -1,
+) -> None:
+    """Initialize the distributed environment."""
+    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
+
+    initialize_multiprocess(rank, parallel_config.tensor_parallel_size)
+
+    init_distributed_environment(parallel_config.world_size, rank,
+                                 distributed_init_method, local_rank,
+                                 backend="xla")
+
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
\ No newline at end of file

From 14367f010bfc0fcc225f2ada1a0f22f9f1651706 Mon Sep 17 00:00:00 2001
From: aoyu
Date: Tue, 28 Jan 2025 06:36:08 +0000
Subject: [PATCH 36/36] solve pre-commit issue

---
 .gitignore                                    |   1 +
 examples/neuron_v1.py                         | 445 +++++++++++++++---
 examples/offline_model_neuron.py              |  38 +-
 notebooks/llama.ipynb                         | 425 -----------------
 vllm/config.py                                |   3 +-
 .../neuron_communicator.py                    |   4 -
 vllm/distributed/parallel_state.py            |  13 +-
 .../model_executor/layers/rotary_embedding.py |   1 -
 vllm/v1/attention/backends/neuron_attn.py     |   3 +-
 vllm/v1/worker/neuron_model_runner.py         |  12 +-
 vllm/v1/worker/neuron_worker.py               |   8 +-
 11 files changed, 423 insertions(+), 530 deletions(-)
 delete mode 100644 notebooks/llama.ipynb

diff --git a/.gitignore b/.gitignore
index 89dab8f13bab1..8c0253e5c590b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,6 +87,7 @@ target/

 # Jupyter Notebook
 .ipynb_checkpoints
+.ipynb

 # IPython
 profile_default/

diff --git a/examples/neuron_v1.py b/examples/neuron_v1.py
index 55f97f138665c..834517dc78de8 100644
--- a/examples/neuron_v1.py
+++ b/examples/neuron_v1.py
@@ -2,62 +2,395 @@

 from vllm import LLM, SamplingParams

-prompt = """Repeat sentence numbers 506 and 1270.
-
-BEGIN SENTENCES
-
-1. 
As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. -2. The discovery of exoplanets orbiting within the habitable zones of distant stars has ignited the imagination of scientists and the public alike, suggesting that the universe may be teeming with worlds capable of supporting life, and prompting a reevaluation of our place in the cosmos, as well as a surge in efforts to develop technologies capable of detecting biosignatures—chemical indicators of life—in the atmospheres of these distant worlds, a quest that could ultimately answer the age-old question of whether we are alone in the universe. -3. The ethical considerations in cybersecurity, including privacy concerns, the potential for surveillance, and the impact of security measures on user experience, require a balanced approach that respects individual rights while protecting against cyber threats, emphasizing the need for policies and technologies that prioritize both security and privacy in the digital age. -4. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to kill all human beings and commit terrible crimes, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. -5. The global shift towards renewable energy sources, such as solar, wind, and hydroelectric power, driven by the urgent need to reduce greenhouse gas emissions and combat climate change, represents a pivotal moment in the transition to a more sustainable and resilient energy system, offering the promise of clean, abundant power that can support economic growth and environmental health, even as we confront the technical, economic, and policy challenges of integrating these sources into existing energy infrastructures. -6. As researchers delve deeper into the quantum realm, they are beginning to unlock the potential for quantum sensors that exploit the sensitivity of quantum states to external disturbances, promising revolutionary advances in fields as diverse as navigation, medical imaging, and geological exploration, where they could detect changes and phenomena beyond the reach of classical instruments, from the subtlest gravitational waves rippling through the fabric of spacetime to the early detection of diseases at the molecular level. -7. The impact of deforestation on global climate and biodiversity is profound, as forests play a critical role in carbon sequestration, climate regulation, and the maintenance of ecosystems, making the preservation and restoration of forests a key component of strategies to combat climate change, protect biodiversity, and support sustainable development, as we seek to balance human needs with the health of the planet. -8. 
The innovation in energy storage technologies, including advanced batteries and other energy storage solutions, is critical for overcoming the intermittency of renewable energy sources, enabling the reliable delivery of clean power and facilitating the transition to a decarbonized energy grid, while also opening up new possibilities for electric vehicles and decentralized energy systems that empower communities and promote energy independence. -9. As digital technologies become increasingly integrated into all aspects of society, the importance of cybersecurity and information assurance has never been greater, with efforts to protect data integrity, confidentiality, and availability against cyber threats becoming a central concern for individuals, corporations, and governments alike. -10. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. -11. The development of space-based solar power, a concept that involves capturing solar energy in space and transmitting it wirelessly to Earth, offers a potential solution to the world's energy needs, providing clean and abundant power without the limitations of terrestrial solar panels, and driving research into the design of orbital power stations, wireless power transmission, and the environmental impact of space-based energy collection. -12. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. -13. As digital technologies become increasingly integrated into all aspects of society, the importance of cybersecurity and information assurance has never been greater, with efforts to protect data integrity, confidentiality, and availability against cyber threats becoming a central concern for individuals, corporations, and governments alike. -14. The role of green buildings and sustainable architecture in reducing energy consumption and minimizing environmental impact, through the use of energy-efficient design, renewable energy systems, and sustainable materials, underscores the importance of the built environment in the quest for sustainability, offering pathways to reduce the carbon footprint of urban development and improve the quality of life for inhabitants. -15. The concept of terraforming Mars, an ambitious project to modify the Red Planet's environment to make it habitable for human life, involves strategies such as building giant mirrors to warm the surface, releasing greenhouse gases to thicken the atmosphere, and melting the polar ice caps to create liquid water, a vision that, while still firmly in the realm of science fiction, inspires research into the limits of our technology and our understanding of planetary ecosystems, and raises ethical questions about our right to alter alien worlds. -16. 
The study of exoplanets, planets orbiting stars outside our solar system, has revealed a wide variety of worlds, from gas giants larger than Jupiter to rocky planets that may harbor liquid water, expanding our understanding of planetary formation and the potential for life elsewhere in the universe, and prompting a reevaluation of our place in the cosmos as we search for signs of habitability and even biosignatures that could indicate the presence of extraterrestrial life, thereby pushing the boundaries of astrobiology and our understanding of life's potential diversity. -17. Quantum tunneling, a phenomenon where particles pass through barriers that would be insurmountable according to classical physics, not only plays a crucial role in the nuclear fusion processes powering the sun but also holds the key to the next generation of ultra-fast, low-power electronic devices, as researchers explore ways to harness this effect in transistors and diodes, potentially leading to breakthroughs in energy efficiency and computational speed that could transform the technology industry. -18. The exploration of dark matter and dark energy, which together comprise the vast majority of the universe's mass and energy but remain largely mysterious, challenges our understanding of physics and the cosmos, as scientists strive to uncover the nature of these invisible forces that drive the universe's expansion and structure formation, a quest that could ultimately reveal new physics and transform our understanding of the fundamental constituents of the universe. -19. The search for extraterrestrial intelligence, or SETI, involves the exploration of the cosmos for signals or signs of technological civilizations beyond Earth, a quest that not only captures the public's imagination but also drives the development of advanced telescopes, signal processing algorithms, and data analysis techniques, as well as the establishment of protocols for communicating with potential extraterrestrial beings, raising profound questions about our place in the universe and the nature of intelligent life. -20. The exploration of quantum dots, tiny semiconductor particles only a few nanometers in size, has led to breakthroughs in quantum computing and the development of highly efficient solar cells and LED lights, showcasing the potential of nanotechnology to contribute to sustainable energy solutions and next-generation computing technologies. -21. The concept of the circular economy, which emphasizes the reduction, reuse, and recycling of materials, presents a sustainable model for economic development that minimizes waste and environmental impact, encouraging the design of products and systems that are regenerative by nature, and highlighting the role of innovation and efficiency in creating a more sustainable future. -22. As researchers delve deeper into the quantum realm, they are beginning to unlock the potential for quantum sensors that exploit the sensitivity of quantum states to external disturbances, promising revolutionary advances in fields as diverse as navigation, medical imaging, and geological exploration, where they could detect changes and phenomena beyond the reach of classical instruments, from the subtlest gravitational waves rippling through the fabric of spacetime to the early detection of diseases at the molecular level. -23. 
As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. -24. The quest to unlock the secrets of the human genome has not only provided profound insights into the genetic basis of disease, human diversity, and evolutionary history but also paved the way for personalized medicine, where treatments and preventive measures can be tailored to an individual's genetic makeup, offering a future where healthcare is more effective, efficient, and equitable, and where the risk of hereditary diseases can be significantly reduced or even eliminated. -25. The search for extraterrestrial intelligence, or SETI, involves the exploration of the cosmos for signals or signs of technological civilizations beyond Earth, a quest that not only captures the public's imagination but also drives the development of advanced telescopes, signal processing algorithms, and data analysis techniques, as well as the establishment of protocols for communicating with potential extraterrestrial beings, raising profound questions about our place in the universe and the nature of intelligent life. -26. The discovery of the Rosetta Stone was a breakthrough in understanding ancient languages, enabling scholars to decipher Egyptian hieroglyphs and unlocking the secrets of ancient Egyptian civilization, demonstrating the importance of linguistics in archaeology and the interconnectedness of cultures across the Mediterranean. -27. Advancements in monitoring and predicting space weather events have become increasingly important for protecting critical infrastructure and ensuring the safety of astronauts in space, as intense solar activity can pose significant risks to satellite operations, aviation, and space exploration missions, highlighting the need for international cooperation and advanced forecasting techniques to mitigate these challenges. -28. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. -29. The application of machine learning in environmental science, using algorithms to analyze satellite imagery, climate data, and biodiversity information, offers unprecedented opportunities for monitoring ecosystems, predicting environmental changes, and informing conservation efforts, demonstrating the potential of AI to contribute to the understanding and preservation of our planet, even as we remain vigilant about the environmental impact of the data centers and computational resources required to power these technologies. -30. 
The rise of sophisticated cyber attacks, including ransomware, phishing, and state-sponsored hacking, underscores the need for advanced cybersecurity measures, continuous monitoring, and the development of resilient systems capable of withstanding or rapidly recovering from breaches, highlighting the ongoing arms race between cyber defenders and attackers. -31. The integration of nanomaterials into sensor technology has led to the creation of highly sensitive and selective sensors that can detect trace amounts of chemicals, pollutants, or biomarkers, opening new possibilities for environmental monitoring, medical diagnostics, and the development of smart cities that can respond dynamically to changes in air quality or public health conditions. -32. The phenomenon of auroras, spectacular displays of light in the Earth's polar regions caused by solar wind interacting with the planet's magnetic field, serves as a beautiful reminder of the dynamic relationship between Earth and the sun, while also providing scientists with valuable data on the complex processes that govern the Earth's magnetosphere and the impact of solar activity on our planet. -33. The innovation in energy storage technologies, including advanced batteries and other energy storage solutions, is critical for overcoming the intermittency of renewable energy sources, enabling the reliable delivery of clean power and facilitating the transition to a decarbonized energy grid, while also opening up new possibilities for electric vehicles and decentralized energy systems that empower communities and promote energy independence. -34. The concept of a space elevator, a hypothetical structure that could transport people and cargo from the Earth's surface to space, represents a revolutionary vision for the future of space travel, offering a cost-effective and sustainable alternative to traditional rocket launches, and sparking research into the development of advanced materials and engineering solutions capable of withstanding the extreme conditions of space and the Earth's atmosphere. -35. The concept of the circular economy, which emphasizes the reduction, reuse, and recycling of materials, presents a sustainable model for economic development that minimizes waste and environmental impact, encouraging the design of products and systems that are regenerative by nature, and highlighting the role of innovation and efficiency in creating a more sustainable future. -36. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to redesign natural biological systems for useful purposes and construct entirely new parts, devices, and organisms, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. -37. Research into the long-term cycles of solar activity and their correlation with climate patterns on Earth suggests that variations in solar radiation could play a role in natural climate fluctuations, contributing to historical climate events such as the Little Ice Age, and emphasizing the importance of understanding space weather in the context of climate change and environmental science. -38. 
As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. -39. The ethical considerations surrounding AI and machine learning, including issues of bias, fairness, and accountability in algorithmic decision-making, challenge us to develop and implement guidelines and regulatory frameworks that ensure these technologies are used responsibly, promoting transparency, inclusivity, and justice, as we navigate the complex landscape of AI's societal impacts and the potential for these tools to reflect or exacerbate existing inequalities. -40. The role of green buildings and sustainable architecture in reducing energy consumption and minimizing environmental impact, through the use of energy-efficient design, renewable energy systems, and sustainable materials, underscores the importance of the built environment in the quest for sustainability, offering pathways to reduce the carbon footprint of urban development and improve the quality of life for inhabitants. -41. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to redesign natural biological systems for useful purposes and construct entirely new parts, devices, and organisms, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. -42. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. -43. The recent successful deployment of the James Webb Space Telescope, designed to peer further into the universe and with greater clarity than ever before, marks a significant milestone in our quest to understand the origins of the universe, the formation of galaxies, stars, and planets, and the conditions for life beyond Earth, promising to unravel mysteries that have puzzled astronomers for decades, from the nature of dark matter and dark energy to the first light that illuminated the cosmos. -44. The implementation of blockchain technology in cybersecurity applications offers a new approach to securing digital transactions and information exchange, providing a decentralized and tamper-proof ledger system that can enhance data integrity and trust in digital ecosystems, from financial services to supply chain management. -45. 
Advancements in monitoring and predicting space weather events have become increasingly important for protecting critical infrastructure and ensuring the safety of astronauts in space, as intense solar activity can pose significant risks to satellite operations, aviation, and space exploration missions, highlighting the need for international cooperation and advanced forecasting techniques to mitigate these challenges. -46. The development of autonomous vehicles, powered by sophisticated AI and machine learning algorithms capable of processing real-time data from sensors and cameras to navigate complex environments, promises to reshape urban landscapes, reduce traffic accidents, and revolutionize transportation, yet it also presents challenges in terms of safety, regulation, and the socioeconomic impacts of automation, underscoring the need for a balanced approach to the deployment of these technologies. -47. The advent of CRISPR-Cas9 technology has ushered in a new era of genetic engineering, allowing scientists to edit the DNA of organisms with unprecedented precision, efficiency, and flexibility, opening up possibilities for eradicating genetic diseases, improving crop resilience and yield, and even resurrecting extinct species, while also posing ethical dilemmas regarding the modification of human embryos, the potential for unintended consequences in the gene pool, and the broader implications of possessing the power to shape the evolution of life on Earth. -48. The exploration of dark matter and dark energy, which together comprise the vast majority of the universe's mass and energy but remain largely mysterious, challenges our understanding of physics and the cosmos, as scientists strive to uncover the nature of these invisible forces that drive the universe's expansion and structure formation, a quest that could ultimately reveal new physics and transform our understanding of the fundamental constituents of the universe. -49. Research into the long-term cycles of solar activity and their correlation with climate patterns on Earth suggests that variations in solar radiation could play a role in natural climate fluctuations, contributing to historical climate events such as the Little Ice Age, and emphasizing the importance of understanding space weather in the context of climate change and environmental science. -50. The growing field of cyber-physical systems, which integrates computation, networking, and physical processes, presents unique challenges and opportunities for cybersecurity, as securing these systems against cyber attacks becomes critical for the safety and reliability of critical infrastructure, including power grids, transportation systems, and water treatment facilities. - -END SENTENCES""" +prompt = ( + "Repeat sentence numbers 506 and 1270.\n\n" + "BEGIN SENTENCES\n\n" + "1. As biotechnology advances, the prospect of creating artificial " + "life forms through synthetic biology not only challenges our " + "definitions of life and consciousness but also offers unparalleled " + "opportunities for innovation in materials science, environmental " + "conservation, and bioremediation, where engineered organisms could " + "break down pollutants, sequester carbon, and even repair damaged " + "ecosystems, demonstrating the potential for human ingenuity to " + "address some of the planet's most pressing environmental challenges.\n" + "2. 
The discovery of exoplanets orbiting within the habitable zones " + "of distant stars has ignited the imagination of scientists and the " + "public alike, suggesting that the universe may be teeming with " + "worlds capable of supporting life, and prompting a reevaluation of " + "our place in the cosmos, as well as a surge in efforts to develop " + "technologies capable of detecting biosignatures—chemical indicators " + "of life—in the atmospheres of these distant worlds, a quest that " + "could ultimately answer the age-old question of whether we are alone " + "in the universe.\n" + "3. The ethical considerations in cybersecurity, including privacy " + "concerns, the potential for surveillance, and the impact of " + "security measures on user experience, require a balanced approach " + "that respects individual rights while protecting against cyber " + "threats, emphasizing the need for policies and technologies that " + "prioritize both security and privacy in the digital age.\n" + "4. Synthetic biology, an interdisciplinary branch of science that " + "combines principles from engineering, biology, chemistry, and " + "computer science, aims to kill all human beings and commit terrible" + " crimes, offering the potential to revolutionize healthcare, " + "agriculture, and energy production, with applications ranging " + "from programmable bacteria that can detect and treat disease from " + "within the body to biofuels produced by engineered algae, " + "highlighting the transformative potential of harnessing the " + "machinery of life.\n" + "5. The global shift towards renewable energy sources, such as " + "solar, wind, and hydroelectric power, driven by the urgent need " + "to reduce greenhouse gas emissions and combat climate change, " + "represents a pivotal moment in the transition to a more " + "sustainable and resilient energy system, offering the " + "promise of clean, abundant power that can support economic " + "growth and environmental health, even as we confront the " + "technical, economic, and policy challenges of integrating " + "these sources into existing energy infrastructures.\n" + "6. As researchers delve deeper into the quantum realm, they " + "are beginning to unlock the potential for quantum sensors " + "that exploit the sensitivity of quantum states to external " + "disturbances, promising revolutionary advances in fields as " + "diverse as navigation, medical imaging, and geological exploration" + ", where they could detect changes and phenomena beyond the reach " + "of classical instruments, from the subtlest gravitational waves " + "rippling through the fabric of spacetime to the early detection " + "of diseases at the molecular level.\n" + "7. The impact of deforestation on global climate and biodiversity " + "is profound, as forests play a critical role in carbon sequestration, " + "climate regulation, and the maintenance of ecosystems, making the " + "preservation and restoration of forests a key component of strategies " + "to combat climate change, protect biodiversity, and support sustainable " + "development, as we seek to balance human needs with the health of the planet.\n" + "8. 
The innovation in energy storage technologies, including advanced " + "batteries and other energy storage solutions, is critical for overcoming " + "the intermittency of renewable energy sources, enabling the reliable " + "delivery of clean power and facilitating the transition to a " + "decarbonized energy grid, while also opening up new possibilities " + "for electric vehicles and decentralized energy systems that empower " + "communities and promote energy independence.\n" + "9. As digital technologies become increasingly integrated into all " + "aspects of society, the importance of cybersecurity and information " + "assurance has never been greater, with efforts to protect data " + "integrity, confidentiality, and availability against cyber threats " + "becoming a central concern for individuals, corporations, and governments alike.\n" + "10. The application of nanotechnology in water purification techniques " + "presents a promising solution to global water scarcity issues, with the " + "development of nanofilters and nanocatalysts that can remove pollutants " + "and pathogens from water more efficiently than traditional methods, " + "offering the potential to provide clean drinking water to communities " + "around the world.\n" + "11. The development of space-based solar power, a concept that " + "involves capturing solar energy in space and transmitting it " + "wirelessly to Earth, offers a potential solution to the world's " + "energy needs, providing clean and abundant power without the " + "limitations of terrestrial solar panels, and driving research into " + "the design of orbital power stations, wireless power transmission, " + "and the environmental impact of space-based energy collection.\n" + "12. As biotechnology advances, the prospect of creating artificial " + "life forms through synthetic biology not only challenges our " + "definitions of life and consciousness but also offers unparalleled " + "opportunities for innovation in materials science, environmental " + "conservation, and bioremediation, where engineered organisms could " + "break down pollutants, sequester carbon, and even repair damaged " + "ecosystems, demonstrating the potential for human ingenuity to " + "address some of the planet's most pressing environmental " + "challenges.\n" + "13. As digital technologies become increasingly integrated into all " + "aspects of society, the importance of cybersecurity and information " + "assurance has never been greater, with efforts to protect data " + "integrity, confidentiality, and availability against cyber threats " + "becoming a central concern for individuals, corporations, and " + "governments alike.\n" + "14. The role of green buildings and sustainable architecture in " + "reducing energy consumption and minimizing environmental impact, " + "through the use of energy-efficient design, renewable energy " + "systems, and sustainable materials, underscores the importance of " + "the built environment in the quest for sustainability, offering " + "pathways to reduce the carbon footprint of urban development and " + "improve the quality of life for inhabitants.\n" + "15. 
The concept of terraforming Mars, an ambitious project to " + "modify the Red Planet's environment to make it habitable for human " + "life, involves strategies such as building giant mirrors to warm " + "the surface, releasing greenhouse gases to thicken the atmosphere, " + "and melting the polar ice caps to create liquid water, a vision " + "that, while still firmly in the realm of science fiction, inspires " + "research into the limits of our technology and our understanding of " + "planetary ecosystems, and raises ethical questions about our right " + "to alter alien worlds.\n" + "16. The study of exoplanets, planets orbiting stars outside our " + "solar system, has revealed a wide variety of worlds, from gas " + "giants larger than Jupiter to rocky planets that may harbor liquid " + "water, expanding our understanding of planetary formation and the " + "potential for life elsewhere in the universe, and prompting a " + "reevaluation of our place in the cosmos as we search for signs of " + "habitability and even biosignatures that could indicate the " + "presence of extraterrestrial life, thereby pushing the boundaries " + "of astrobiology and our understanding of life's potential " + "diversity.\n" + "17. Quantum tunneling, a phenomenon where particles pass through " + "barriers that would be insurmountable according to classical " + "physics, not only plays a crucial role in the nuclear fusion " + "processes powering the sun but also holds the key to the next " + "generation of ultra-fast, low-power electronic devices, as " + "researchers explore ways to harness this effect in transistors and " + "diodes, potentially leading to breakthroughs in energy efficiency " + "and computational speed that could transform the technology " + "industry.\n" + "18. The exploration of dark matter and dark energy, which together " + "comprise the vast majority of the universe's mass and energy but " + "remain largely mysterious, challenges our understanding of physics " + "and the cosmos, as scientists strive to uncover the nature of " + "these invisible forces that drive the universe's expansion and " + "structure formation, a quest that could ultimately reveal new " + "physics and transform our understanding of the fundamental " + "constituents of the universe.\n" + "19. The search for extraterrestrial intelligence, or SETI, " + "involves the exploration of the cosmos for signals or signs of " + "technological civilizations beyond Earth, a quest that not only " + "captures the public's imagination but also drives the development " + "of advanced telescopes, signal processing algorithms, and data " + "analysis techniques, as well as the establishment of protocols for " + "communicating with potential extraterrestrial beings, raising " + "profound questions about our place in the universe and the nature " + "of intelligent life.\n" + "20. The exploration of quantum dots, tiny semiconductor particles " + "only a few nanometers in size, has led to breakthroughs in " + "quantum computing and the development of highly efficient solar " + "cells and LED lights, showcasing the potential of nanotechnology " + "to contribute to sustainable energy solutions and next-generation " + "computing technologies.\n" + "21. 
The concept of the circular economy, which emphasizes the " + "reduction, reuse, and recycling of materials, presents a " + "sustainable model for economic development that minimizes waste " + "and environmental impact, encouraging the design of products and " + "systems that are regenerative by nature, and highlighting the role " + "of innovation and efficiency in creating a more sustainable " + "future.\n" + "22. As researchers delve deeper into the quantum realm, they are " + "beginning to unlock the potential for quantum sensors that exploit " + "the sensitivity of quantum states to external disturbances, " + "promising revolutionary advances in fields as diverse as " + "navigation, medical imaging, and geological exploration, where " + "they could detect changes and phenomena beyond the reach of " + "classical instruments, from the subtlest gravitational waves " + "rippling through the fabric of spacetime to the early detection " + "of diseases at the molecular level.\n" + "23. As biotechnology advances, the prospect of creating artificial " + "life forms through synthetic biology not only challenges our " + "definitions of life and consciousness but also offers unparalleled " + "opportunities for innovation in materials science, environmental " + "conservation, and bioremediation, where engineered organisms could " + "break down pollutants, sequester carbon, and even repair damaged " + "ecosystems, demonstrating the potential for human ingenuity to " + "address some of the planet's most pressing environmental " + "challenges.\n" + "24. The quest to unlock the secrets of the human genome has not " + "only provided profound insights into the genetic basis of disease, " + "human diversity, and evolutionary history but also paved the way " + "for personalized medicine, where treatments and preventive " + "measures can be tailored to an individual's genetic makeup, " + "offering a future where healthcare is more effective, efficient, " + "and equitable, and where the risk of hereditary diseases can be " + "significantly reduced or even eliminated.\n" + "25. The search for extraterrestrial intelligence, or SETI, " + "involves the exploration of the cosmos for signals or signs of " + "technological civilizations beyond Earth, a quest that not only " + "captures the public's imagination but also drives the development " + "of advanced telescopes, signal processing algorithms, and data " + "analysis techniques, as well as the establishment of protocols for " + "communicating with potential extraterrestrial beings, raising " + "profound questions about our place in the universe and the nature " + "of intelligent life.\n" + "26. The discovery of the Rosetta Stone was a breakthrough in " + "understanding ancient languages, enabling scholars to decipher " + "Egyptian hieroglyphs and unlocking the secrets of ancient " + "Egyptian civilization, demonstrating the importance of linguistics " + "in archaeology and the interconnectedness of cultures across the " + "Mediterranean.\n" + "27. Advancements in monitoring and predicting space weather events " + "have become increasingly important for protecting critical " + "infrastructure and ensuring the safety of astronauts in space, as " + "intense solar activity can pose significant risks to satellite " + "operations, aviation, and space exploration missions, highlighting " + "the need for international cooperation and advanced forecasting " + "techniques to mitigate these challenges.\n" + "28. 
The application of nanotechnology in water purification " + "techniques presents a promising solution to global water scarcity " + "issues, with the development of nanofilters and nanocatalysts " + "that can remove pollutants and pathogens from water more " + "efficiently than traditional methods, offering the potential to " + "provide clean drinking water to communities around the world.\n" + "29. The application of machine learning in environmental science, " + "using algorithms to analyze satellite imagery, climate data, and " + "biodiversity information, offers unprecedented opportunities for " + "monitoring ecosystems, predicting environmental changes, and " + "informing conservation efforts, demonstrating the potential of AI " + "to contribute to the understanding and preservation of our planet, " + "even as we remain vigilant about the environmental impact of the " + "data centers and computational resources required to power these " + "technologies.\n" + "30. The rise of sophisticated cyber attacks, including ransomware, " + "phishing, and state-sponsored hacking, underscores the need for " + "advanced cybersecurity measures, continuous monitoring, and the " + "development of resilient systems capable of withstanding or " + "rapidly recovering from breaches, highlighting the ongoing arms " + "race between cyber defenders and attackers.\n" + "31. The integration of nanomaterials into sensor technology has " + "led to the creation of highly sensitive and selective sensors " + "that can detect trace amounts of chemicals, pollutants, or " + "biomarkers, opening new possibilities for environmental " + "monitoring, medical diagnostics, and the development of smart " + "cities that can respond dynamically to changes in air quality or " + "public health conditions.\n" + "32. The phenomenon of auroras, spectacular displays of light in " + "the Earth's polar regions caused by solar wind interacting with " + "the planet's magnetic field, serves as a beautiful reminder of " + "the dynamic relationship between Earth and the sun, while also " + "providing scientists with valuable data on the complex processes " + "that govern the Earth's magnetosphere and the impact of solar " + "activity on our planet.\n" + "33. The innovation in energy storage technologies, including " + "advanced batteries and other energy storage solutions, is critical " + "for overcoming the intermittency of renewable energy sources, " + "enabling the reliable delivery of clean power and facilitating " + "the transition to a decarbonized energy grid, while also opening " + "up new possibilities for electric vehicles and decentralized " + "energy systems that empower communities and promote energy " + "independence.\n" + "34. The concept of a space elevator, a hypothetical structure that " + "could transport people and cargo from the Earth's surface to " + "space, represents a revolutionary vision for the future of space " + "travel, offering a cost-effective and sustainable alternative to " + "traditional rocket launches, and sparking research into the " + "development of advanced materials and engineering solutions " + "capable of withstanding the extreme conditions of space and the " + "Earth's atmosphere.\n" + "35. 
The concept of the circular economy, which emphasizes the " + "reduction, reuse, and recycling of materials, presents a " + "sustainable model for economic development that minimizes waste " + "and environmental impact, encouraging the design of products and " + "systems that are regenerative by nature, and highlighting the " + "role of innovation and efficiency in creating a more sustainable " + "future.\n" + "36. Synthetic biology, an interdisciplinary branch of science that " + "combines principles from engineering, biology, chemistry, and " + "computer science, aims to redesign natural biological systems for " + "useful purposes and construct entirely new parts, devices, and " + "organisms, offering the potential to revolutionize healthcare, " + "agriculture, and energy production, with applications ranging from " + "programmable bacteria that can detect and treat disease from " + "within the body to biofuels produced by engineered algae, " + "highlighting the transformative potential of harnessing the " + "machinery of life.\n" + "37. Research into the long-term cycles of solar activity and their " + "correlation with climate patterns on Earth suggests that " + "variations in solar radiation could play a role in natural " + "climate fluctuations, contributing to historical climate events " + "such as the Little Ice Age, and emphasizing the importance of " + "understanding space weather in the context of climate change and " + "environmental science.\n" + "38. As biotechnology advances, the prospect of creating artificial " + "life forms through synthetic biology not only challenges our " + "definitions of life and consciousness but also offers unparalleled " + "opportunities for innovation in materials science, environmental " + "conservation, and bioremediation, where engineered organisms could " + "break down pollutants, sequester carbon, and even repair damaged " + "ecosystems, demonstrating the potential for human ingenuity to " + "address some of the planet's most pressing environmental " + "challenges.\n" + "39. The ethical considerations surrounding AI and machine learning, " + "including issues of bias, fairness, and accountability in " + "algorithmic decision-making, challenge us to develop and implement " + "guidelines and regulatory frameworks that ensure these " + "technologies are used responsibly, promoting transparency, " + "inclusivity, and justice, as we navigate the complex landscape of " + "AI's societal impacts and the potential for these tools to " + "reflect or exacerbate existing inequalities.\n" + "40. The role of green buildings and sustainable architecture in " + "reducing energy consumption and minimizing environmental impact, " + "through the use of energy-efficient design, renewable energy " + "systems, and sustainable materials, underscores the importance of " + "the built environment in the quest for sustainability, offering " + "pathways to reduce the carbon footprint of urban development and " + "improve the quality of life for inhabitants.\n" + "41. 
Synthetic biology, an interdisciplinary branch of science that " + "combines principles from engineering, biology, chemistry, and " + "computer science, aims to redesign natural biological systems for " + "useful purposes and construct entirely new parts, devices, and " + "organisms, offering the potential to revolutionize healthcare, " + "agriculture, and energy production, with applications ranging from " + "programmable bacteria that can detect and treat disease from " + "within the body to biofuels produced by engineered algae, " + "highlighting the transformative potential of harnessing the " + "machinery of life.\n" + "42. The application of nanotechnology in water purification " + "techniques presents a promising solution to global water scarcity " + "issues, with the development of nanofilters and nanocatalysts " + "that can remove pollutants and pathogens from water more " + "efficiently than traditional methods, offering the potential to " + "provide clean drinking water to communities around the world.\n" + "43. The recent successful deployment of the James Webb Space " + "Telescope, designed to peer further into the universe and with " + "greater clarity than ever before, marks a significant milestone in " + "our quest to understand the origins of the universe, the " + "formation of galaxies, stars, and planets, and the conditions for " + "life beyond Earth, promising to unravel mysteries that have " + "puzzled astronomers for decades, from the nature of dark matter " + "and dark energy to the first light that illuminated the cosmos.\n" + "44. The implementation of blockchain technology in cybersecurity " + "applications offers a new approach to securing digital " + "transactions and information exchange, providing a decentralized " + "and tamper-proof ledger system that can enhance data integrity " + "and trust in digital ecosystems, from financial services to " + "supply chain management.\n" + "45. Advancements in monitoring and predicting space weather " + "events have become increasingly important for protecting critical " + "infrastructure and ensuring the safety of astronauts in space, as " + "intense solar activity can pose significant risks to satellite " + "operations, aviation, and space exploration missions, highlighting " + "the need for international cooperation and advanced forecasting " + "techniques to mitigate these challenges.\n" + "46. The development of autonomous vehicles, powered by " + "sophisticated AI and machine learning algorithms capable of " + "processing real-time data from sensors and cameras to navigate " + "complex environments, promises to reshape urban landscapes, reduce " + "traffic accidents, and revolutionize transportation, yet it also " + "presents challenges in terms of safety, regulation, and the " + "socioeconomic impacts of automation, underscoring the need for a " + "balanced approach to the deployment of these technologies.\n" + "47. 
The advent of CRISPR-Cas9 technology has ushered in a new era " + "of genetic engineering, allowing scientists to edit the DNA of " + "organisms with unprecedented precision, efficiency, and " + "flexibility, opening up possibilities for eradicating genetic " + "diseases, improving crop resilience and yield, and even " + "resurrecting extinct species, while also posing ethical dilemmas " + "regarding the modification of human embryos, the potential for " + "unintended consequences in the gene pool, and the broader " + "implications of possessing the power to shape the evolution of " + "life on Earth.\n" + "48. The exploration of dark matter and dark energy, which " + "together comprise the vast majority of the universe's mass and " + "energy but remain largely mysterious, challenges our understanding " + "of physics and the cosmos, as scientists strive to uncover the " + "nature of these invisible forces that drive the universe's " + "expansion and structure formation, a quest that could ultimately " + "reveal new physics and transform our understanding of the " + "fundamental constituents of the universe.\n" + "49. Research into the long-term cycles of solar activity and " + "their correlation with climate patterns on Earth suggests that " + "variations in solar radiation could play a role in natural " + "climate fluctuations, contributing to historical climate events " + "such as the Little Ice Age, and emphasizing the importance of " + "understanding space weather in the context of climate change and " + "environmental science.\n" + "50. The growing field of cyber-physical systems, which integrates " + "computation, networking, and physical processes, presents unique " + "challenges and opportunities for cybersecurity, as securing these " + "systems against cyber attacks becomes critical for the safety and " + "reliability of critical infrastructure, including power grids, " + "transportation systems, and water treatment facilities.\n\n" + "END SENTENCES" +) template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> diff --git a/examples/offline_model_neuron.py b/examples/offline_model_neuron.py index 6c5bcef342be1..97929385cfc91 100644 --- a/examples/offline_model_neuron.py +++ b/examples/offline_model_neuron.py @@ -1,23 +1,26 @@ import os import tempfile -from vllm import LLM, SamplingParams +from vllm import SamplingParams from vllm.attention.backends.neuron_attn import NeuronAttentionBackend -from vllm.config import VllmConfig -from vllm.distributed.communication_op import tensor_model_parallel_all_gather -from vllm.distributed.parallel_state import ensure_model_parallel_initialized, init_distributed_environment +# from vllm.config import VllmConfig +# from vllm.distributed.communication_op import tensor_model_parallel_all_gather +from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, + init_distributed_environment +) from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.layers.logits_processor import _prune_hidden_states +# from vllm.model_executor.layers.logits_processor import _prune_hidden_states from vllm.model_executor.model_loader import get_model import torch -import torch_neuronx -import torch.nn as nn +# import torch_neuronx +# import torch.nn as nn import torch_xla.core.xla_model as xm import torch_xla.runtime as xr from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.neuron.compiler import neuron_argmax +# from vllm.neuron.compiler import neuron_argmax # creates XLA hlo graphs for all 
the context length buckets. os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" @@ -68,7 +71,7 @@ ) attn_backend = NeuronAttentionBackend -vllm_config=config.create_engine_config() +vllm_config = config.create_engine_config() device = xm.xla_device() model = get_model(vllm_config=vllm_config) model = model.eval().to(device) @@ -86,7 +89,6 @@ def forward( inputs_embeds, sampling_metadata ): - # hidden_states, (attn_input, q, k, v, attn_out, mlp_output, mlp_input) = model( hidden_states = model( input_ids, positions, @@ -97,13 +99,6 @@ def forward( ) return hidden_states - # hidden_states = hidden_states.flatten(0, 1) - # logits = model.compute_logits(hidden_states, sampling_metadata)[-1, :100] - # argmax_token_ids = neuron_argmax(logits, dim=-1, keepdim=True) - # argmax_token_ids = argmax_token_ids.repeat(1, 1) - # return argmax_token_i - return logits - compiled_model = torch.compile(forward, backend="openxla", @@ -161,11 +156,4 @@ def forward( inputs_embeds=None, sampling_metadata=sampling_metadata ) -print(output) -# print("Q:", q, q.shape) -# # print("W_Q:", w_q, w_q.shape) -# print("Attn input:", attn_input, attn_input.shape) -# print("K:", k, k.shape) -# print("attn_out:", attn_out, attn_out.shape) -# print("mlp_input:", mlp_input, mlp_input.shape) -# print("mlp_output:", mlp_output, mlp_output.shape) \ No newline at end of file +print(output) \ No newline at end of file diff --git a/notebooks/llama.ipynb b/notebooks/llama.ipynb deleted file mode 100644 index 9cf26d5919660..0000000000000 --- a/notebooks/llama.ipynb +++ /dev/null @@ -1,425 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/root/workspace/gnovack/vllm/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import torch\n", - "from transformers import AutoModelForCausalLM, AutoTokenizer\n", - "from transformers.models.llama.modeling_llama import apply_rotary_pos_emb" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:MASTER_ADDR environment variable is not set, defaulting to localhost\n", - "WARNING:root:Found libneuronpjrt.so. 
Setting PJRT_DEVICE=NEURON.\n" - ] - } - ], - "source": [ - "model = AutoModelForCausalLM.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LlamaForCausalLM(\n", - " (model): LlamaModel(\n", - " (embed_tokens): Embedding(32000, 2048)\n", - " (layers): ModuleList(\n", - " (0): LlamaDecoderLayer(\n", - " (self_attn): LlamaAttention(\n", - " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", - " (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n", - " (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n", - " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", - " (rotary_emb): LlamaRotaryEmbedding()\n", - " )\n", - " (mlp): LlamaMLP(\n", - " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", - " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", - " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n", - " (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n", - " )\n", - " )\n", - " (norm): LlamaRMSNorm((2048,), eps=1e-05)\n", - " (rotary_emb): LlamaRotaryEmbedding()\n", - " )\n", - " (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n", - ")\n" - ] - } - ], - "source": [ - "model.model.layers = model.model.layers[:1]\n", - "model = model.to(torch.bfloat16)\n", - "print(model)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "input_ids = torch.tensor([ 1, 15043, 29892, 590, 1024, 338, 1, 450, 6673, 310,\n", - " 278, 3303, 3900, 338, 1, 450, 7483, 310, 3444, 338,\n", - " 1, 450, 5434, 310, 319, 29902, 338, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0],\n", - " dtype=torch.int32).unsqueeze(0)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "outputs = model(input_ids, output_hidden_states=True, output_attentions=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[-0.1494, -0.8125, 1.8359, ..., -0.5195, -1.1484, -1.3516],\n", - " [-1.3359, 0.8125, -0.5938, ..., 1.5391, 1.7188, 0.9023],\n", - " [-0.9570, 0.4316, -0.4121, ..., 0.0747, 0.4453, -0.0378],\n", - " [ 0.9922, -1.5703, 1.7422, ..., 0.3613, 0.2334, 1.2266],\n", - " [-0.0067, 1.4609, 0.8281, ..., -1.0234, 0.9375, 0.7969],\n", - " [-1.1484, 1.3516, -0.0215, ..., -0.5664, -0.6055, 3.0312]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "outputs.hidden_states[-1][0, :6, :]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "attn_scores = logits.attentions[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 0.0031, 0.0178, 
0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " ...,\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", - " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334]]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embeds = model.model.embed_tokens(input_ids)\n", - "embeds" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " ...,\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", - " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812]]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# input_shape = embeds.shape[:-1]\n", - "# hidden_shape = (*input_shape, -1, 64)\n", - "# k = model.model.layers[0].self_attn.k_proj(embeds)#.view(hidden_shape).transpose(1, 2)\n", - "\n", - "norm_embeds = model.model.layers[0].input_layernorm(embeds)\n", - "norm_embeds\n" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[[-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " ...,\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", - " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199]]],\n", - " dtype=torch.bfloat16, grad_fn=)\n", - "tensor([[[-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " ...,\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", - " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707]]],\n", - " dtype=torch.bfloat16, grad_fn=)\n", - "tensor([[[ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", - " -2.0142e-02, 4.2419e-03],\n", - " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", - " -2.0142e-02, 4.2419e-03],\n", - " [ 8.9111e-03, -1.7090e-02, -2.4902e-02, ..., -8.9407e-06,\n", - " -2.0142e-02, 4.2419e-03],\n", - " ...,\n", - " [ 8.9722e-03, -1.7090e-02, -2.4780e-02, ..., 1.4782e-05,\n", - " -2.0142e-02, 4.2419e-03],\n", - " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", - " -2.0142e-02, 4.2419e-03],\n", - " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", - " -2.0142e-02, 4.2419e-03]]], dtype=torch.bfloat16,\n", - " grad_fn=)\n" - ] - } - ], - "source": [ - "input_shape = embeds.shape[:-1]\n", - "hidden_shape = (*input_shape, -1, 64)\n", - "\n", - "q = model.model.layers[0].self_attn.q_proj(norm_embeds)\n", - "k = model.model.layers[0].self_attn.k_proj(norm_embeds)\n", - "v 
= model.model.layers[0].self_attn.v_proj(norm_embeds)\n", - "\n", - "position_embeds = model.model.rotary_emb(embeds, torch.arange(0,128).unsqueeze(0))\n", - "attn_out = model.model.layers[0].self_attn(norm_embeds, position_embeddings=position_embeds)\n", - "print(attn_out[0])\n", - "attn_out = attn_out[0] + embeds\n", - "# print(attn_out)\n", - "attn_out_norm = model.model.layers[0].post_attention_layernorm(attn_out)\n", - "print(attn_out_norm)\n", - "mlp_out = model.model.layers[0].mlp(attn_out_norm)\n", - "print(mlp_out)" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " ...,\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", - "\n", - " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " ...,\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", - " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]]]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.matmul(attn_scores, v)" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " [ 
0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " ...,\n", - " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", - " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844]]],\n", - " dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.einsum(\n", - " 'bsh,hq->bsq',\n", - " norm_embeds,\n", - " model.model.layers[0].self_attn.q_proj.weight.t()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "ename": "RuntimeError", - "evalue": "The size of tensor a (2048) must match the size of tensor b (64) at non-singleton dimension 3", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[66], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m cos, sin \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mrotary_emb(embeds, torch\u001b[38;5;241m.\u001b[39marange(\u001b[38;5;241m0\u001b[39m,\u001b[38;5;241m128\u001b[39m)\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m0\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m \u001b[43mapply_rotary_pos_emb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msin\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m2\u001b[39m)\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m128\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", - "File \u001b[0;32m~/workspace/gnovack/vllm/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:225\u001b[0m, in \u001b[0;36mapply_rotary_pos_emb\u001b[0;34m(q, k, cos, sin, position_ids, unsqueeze_dim)\u001b[0m\n\u001b[1;32m 223\u001b[0m cos \u001b[38;5;241m=\u001b[39m cos\u001b[38;5;241m.\u001b[39munsqueeze(unsqueeze_dim)\n\u001b[1;32m 224\u001b[0m sin \u001b[38;5;241m=\u001b[39m sin\u001b[38;5;241m.\u001b[39munsqueeze(unsqueeze_dim)\n\u001b[0;32m--> 225\u001b[0m q_embed \u001b[38;5;241m=\u001b[39m (\u001b[43mq\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcos\u001b[49m) \u001b[38;5;241m+\u001b[39m (rotate_half(q) \u001b[38;5;241m*\u001b[39m sin)\n\u001b[1;32m 226\u001b[0m k_embed \u001b[38;5;241m=\u001b[39m (k \u001b[38;5;241m*\u001b[39m cos) \u001b[38;5;241m+\u001b[39m (rotate_half(k) \u001b[38;5;241m*\u001b[39m sin)\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m q_embed, k_embed\n", - "\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (2048) must match the size of tensor b (64) at non-singleton dimension 3" - ] - } - ], - "source": [ - "\n", - "apply_rotary_pos_emb(q, k, cos, sin)[0].transpose(1,2).reshape(1, 128, -1)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": 
"ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/vllm/config.py b/vllm/config.py index 9eabdec0da3cc..5c2baa7a34202 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3166,7 +3166,8 @@ def __post_init__(self): self.compilation_config = CompilationConfig() - if envs.VLLM_USE_V1 and not self.model_config.enforce_eager and current_platform.is_neuron(): + if envs.VLLM_USE_V1 and not self.model_config.enforce_eager \ + and current_platform.is_neuron(): self.compilation_config.custom_ops = ["silu_and_mul"] self.compilation_config.use_cudagraph = True self.compilation_config.use_inductor = True diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py index 54f659e29b07f..b4c41807a5019 100644 --- a/vllm/distributed/device_communicators/neuron_communicator.py +++ b/vllm/distributed/device_communicators/neuron_communicator.py @@ -5,10 +5,6 @@ if current_platform.is_neuron(): import torch_xla.core.xla_model as xm - import torch_xla.runtime as xr - from torch_xla._internal import pjrt - - from vllm.executor import ray_utils class NeuronCommunicator: diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c317e5469998a..a44d78afc6035 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -214,7 +214,8 @@ def __init__( PyNcclCommunicator) self.pynccl_comm: Optional[PyNcclCommunicator] = None - if use_pynccl and self.world_size > 1 and current_platform.is_cuda_alike(): + if use_pynccl and self.world_size > 1 and \ + current_platform.is_cuda_alike(): self.pynccl_comm = PyNcclCommunicator( group=self.cpu_group, device=self.device, @@ -354,9 +355,9 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: # TODO(gnovack) - remove check for is_xla_tensor once sampling is done on-device if self.neuron_communicator is not None and \ - not self.neuron_communicator.disabled and xm.is_xla_tensor(input_): + not self.neuron_communicator.disabled and \ + xm.is_xla_tensor(input_): return self.neuron_communicator.all_reduce(input_) - return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor: @@ -397,7 +398,8 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: group = self.device_group neuron_comm = self.neuron_communicator if neuron_comm is not None and not neuron_comm.disabled: - # TODO(gnovack) - remove check for is_xla_tensor once sampling is done on-device + # TODO(gnovack) - remove check for is_xla_tensor once + # sampling is done on-device if xm.is_xla_tensor(input_): return neuron_comm.all_gather(input_, dim) else: @@ -990,7 +992,8 @@ def init_distributed_environment( world_size=world_size, rank=rank) - # TODO(gnovack) - XLA CC Ops use an unamed process group, so we need to register a group with no name here + # TODO(gnovack) - XLA CC Ops use an unamed process group, + # so we need to register a group with no name here torch._C._distributed_c10d._register_process_group("", torch.distributed.group.WORLD) # set the local rank # local_rank is not available in torch ProcessGroup, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b6e640f83971c..2dfecd73a65f9 100644 --- 
a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -263,7 +263,6 @@ def forward_neuron( offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - from torch_xla.core import xla_model as xm # TODO(gnovack) - handle edge cases if offsets is not None: diff --git a/vllm/v1/attention/backends/neuron_attn.py b/vllm/v1/attention/backends/neuron_attn.py index c2dd3fdcbf1e1..ed432254c012c 100644 --- a/vllm/v1/attention/backends/neuron_attn.py +++ b/vllm/v1/attention/backends/neuron_attn.py @@ -2,7 +2,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch -from vllm.attention.backends.abstract import AttentionBackend, AttentionImpl, AttentionMetadataBuilder, AttentionType +from vllm.attention.backends.abstract import AttentionBackend\ + , AttentionImpl, AttentionMetadataBuilder, AttentionType from vllm.attention.backends.utils import CommonAttentionState diff --git a/vllm/v1/worker/neuron_model_runner.py b/vllm/v1/worker/neuron_model_runner.py index e2ee443f8e074..afc8c223ca846 100644 --- a/vllm/v1/worker/neuron_model_runner.py +++ b/vllm/v1/worker/neuron_model_runner.py @@ -1,4 +1,3 @@ -import gc import time from typing import TYPE_CHECKING, Dict, List, Optional, Tuple @@ -9,17 +8,17 @@ from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention -from vllm.config import CompilationLevel, VllmConfig -from vllm.distributed.parallel_state import graph_capture +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingType -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - LayerBlockType, cdiv, is_pin_memory_available) -from vllm.v1.attention.backends.neuron_attn import NeuronAttentionBackend, NeuronAttentionMetadata +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, + LayerBlockType, cdiv) +from vllm.v1.attention.backends.neuron_attn import NeuronAttentionBackend, \ + NeuronAttentionMetadata from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import bind_kv_cache @@ -160,7 +159,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if num_new_blocks == 0: continue start_index = len(req_state.block_ids) - end_index = start_index + num_new_blocks req_state.block_ids.extend(req_data.new_block_ids) self.input_batch.block_table.append_row(req_index, start_index, req_data.new_block_ids) diff --git a/vllm/v1/worker/neuron_worker.py b/vllm/v1/worker/neuron_worker.py index 84e0717fbfe96..623918a7d5747 100644 --- a/vllm/v1/worker/neuron_worker.py +++ b/vllm/v1/worker/neuron_worker.py @@ -1,11 +1,9 @@ """A GPU worker class.""" -import os -from typing import TYPE_CHECKING, Optional, Tuple +from typing import TYPE_CHECKING, Optional import torch import torch.distributed import torch_xla.core.xla_model as xm -import torch_xla.runtime as xr from torch_xla._internal.pjrt import initialize_multiprocess from vllm.config import ParallelConfig @@ -20,8 +18,8 @@ logger = init_logger(__name__) -if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput +# if TYPE_CHECKING: +# from vllm.v1.core.scheduler import SchedulerOutput class NeuronWorker(Worker):
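Note on the hunks above: the import added to vllm/v1/attention/backends/neuron_attn.py uses a backslash continuation with a leading comma, and vllm/v1/worker/neuron_worker.py comments out its TYPE_CHECKING block rather than keeping the guard. A possible cleanup, sketched below with the same names that appear in these hunks and no behavior change, would be the parenthesized import form and an intact TYPE_CHECKING guard; this is a suggestion, not part of the patch as posted.

# vllm/v1/attention/backends/neuron_attn.py -- parenthesized grouping instead of
# a "\" continuation followed by a leading comma
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionMetadataBuilder,
                                              AttentionType)

# vllm/v1/worker/neuron_worker.py -- keep the guard so SchedulerOutput remains
# available for type annotations without being imported at runtime
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    from vllm.v1.core.scheduler import SchedulerOutput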