From 3c7d19898f4c88c0840b2794483042851a251138 Mon Sep 17 00:00:00 2001 From: amit Date: Tue, 3 Jun 2025 09:33:37 +0300 Subject: [PATCH 01/33] V1 support of priority shedualing Signed-off-by: amit --- docs/usage/v1_guide.md | 2 +- tests/v1/core/test_scheduler.py | 594 ++++++++++++++++++++++++++++++++ vllm/v1/core/sched/scheduler.py | 220 +++++++++--- vllm/v1/engine/__init__.py | 1 + vllm/v1/engine/processor.py | 3 +- vllm/v1/request.py | 9 +- 6 files changed, 771 insertions(+), 58 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 3d5d7ce45cce..516b24443d4c 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -69,7 +69,7 @@ This living user guide outlines a few known **important changes and limitations* way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically allocate a fixed token budget per request, enabling features like chunked prefills, prefix caching, and speculative decoding without a strict separation between prefill -and decode phases. +and decode phases. The V1 scheduler supports multiple scheduling policies, including First-Come, First-Served (FCFS) and priority-based scheduling (where requests are processed based on assigned priority, with FCFS as a tie-breaker), configurable via the `--scheduling-policy` argument. ### Semantic Changes and Deprecated Features diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index f38454b1b288..079179730d5d 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1246,3 +1246,597 @@ def test_memory_leak(): # Confirm no memory leak. assert_scheduler_empty(scheduler) + + +def create_scheduler_with_priority( + model: str = "facebook/opt-125m", + max_num_seqs: int = 16, + max_num_batched_tokens: int = 8192, + enable_prefix_caching: Optional[bool] = None, + long_prefill_token_threshold: int = 0, + disable_chunked_mm_input: bool = False, + use_kv_connector: bool = False, + num_blocks: int = 10000, + block_size: int = 16, + max_model_len: Optional[int] = None, + num_speculative_tokens: Optional[int] = None, +) -> Scheduler: + '''Create scheduler with priority policy enabled. 
+ + Args: + model: model under test + max_num_seqs: max sequences to schedule + max_num_batch_tokens: max num tokens to batch + enable_prefix_caching: optionally force APC config + (True/False) or use default + (None) + + Returns: + {class}`Scheduler` instance with priority scheduling + ''' + if max_model_len is None: + max_model_len = max_num_batched_tokens + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_model_len, + long_prefill_token_threshold=long_prefill_token_threshold, + disable_chunked_mm_input=disable_chunked_mm_input, + enable_chunked_prefill=True, + policy="priority", # Enable priority scheduling + ) + model_config = ModelConfig( + model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + ) + # Cache config, optionally force APC + kwargs_cache = ({} if enable_prefix_caching is None else { + 'enable_prefix_caching': enable_prefix_caching + }) + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + **kwargs_cache, + ) + kv_transfer_config = KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ) if use_kv_connector else None + + speculative_config: Optional[SpeculativeConfig] = None + if num_speculative_tokens is not None: + speculative_config = SpeculativeConfig( + model="ngram", num_speculative_tokens=num_speculative_tokens) + + vllm_config = VllmConfig( + scheduler_config=scheduler_config, + model_config=model_config, + cache_config=cache_config, + kv_transfer_config=kv_transfer_config, + speculative_config=speculative_config, + ) + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, # A large number of blocks to hold all requests + tensors={}, + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(block_size, 1, 1, torch.float32, + False)) + ], + ) + cache_config.num_gpu_blocks = num_blocks + return Scheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + structured_output_manager=StructuredOutputManager(vllm_config), + ) + + +def create_requests_with_priority( + num_requests: int, + priorities: list[int], + arrival_times: Optional[list[float]] = None, + num_tokens: int = 10, + mm_positions: Optional[list[PlaceholderRange]] = None, + max_tokens: int = 16, + stop_token_ids: Optional[list[int]] = None, + prompt_logprobs: Optional[int] = None): + """Create requests with specified priorities and arrival times.""" + assert len(priorities) == num_requests + if arrival_times is not None: + assert len(arrival_times) == num_requests + else: + arrival_times = [float(i) for i in range(num_requests)] + + sampling_params = SamplingParams(ignore_eos=False, + max_tokens=max_tokens, + stop_token_ids=stop_token_ids, + prompt_logprobs=prompt_logprobs) + requests = [] + for i in range(num_requests): + if mm_positions is not None: + mm_position = mm_positions[i] + mm_inputs = [MultiModalKwargs({})] * len(mm_position) + else: + mm_position = None + mm_inputs = None + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + arrival_time=arrival_times[i], + priority=priorities[i], + ) + requests.append(request) + return requests + + +def 
test_priority_scheduling_basic_ordering(): + """Test that requests are scheduled in priority order + (lower value = higher priority).""" + scheduler = create_scheduler_with_priority() + + # Create requests with different priorities + # Priority 0 (highest), 1, 2 (lowest) + priorities = [2, 0, 1] # Add in non-priority order + arrival_times = [1.0, 2.0, 3.0] # All different arrival times + requests = create_requests_with_priority(num_requests=3, + priorities=priorities, + arrival_times=arrival_times) + + # Add requests in non-priority order + for request in requests: + scheduler.add_request(request) + + # Schedule and verify priority order + output = scheduler.schedule() + + # Should schedule all requests since they fit in budget + assert len(output.scheduled_new_reqs) == 3 + + # Verify they are scheduled in priority order: + # req_1 (priority 0), req_2 (priority 1), req_0 (priority 2) + scheduled_req_ids = [req.req_id for req in output.scheduled_new_reqs] + assert scheduled_req_ids == ["1", "2", "0"] + + +def test_priority_scheduling_arrival_time_tiebreaker(): + """Test that arrival time is used + as tiebreaker when priorities are equal.""" + scheduler = create_scheduler_with_priority() + + # Create requests with same priority but different arrival times + priorities = [1, 1, 1] # All same priority + arrival_times = [3.0, 1.0, 2.0] # Different arrival times + requests = create_requests_with_priority(num_requests=3, + priorities=priorities, + arrival_times=arrival_times) + + # Add requests in non-arrival order + for request in requests: + scheduler.add_request(request) + + # Schedule and verify arrival time order + output = scheduler.schedule() + + # Should schedule all requests since they fit in budget + assert len(output.scheduled_new_reqs) == 3 + + # Verify they are scheduled in arrival time order: + # req_1 (1.0), req_2 (2.0), req_0 (3.0) + scheduled_req_ids = [req.req_id for req in output.scheduled_new_reqs] + assert scheduled_req_ids == ["1", "2", "0"] + + +def test_priority_scheduling_mixed_priority_and_arrival(): + """Test priority scheduling with mixed priorities and arrival times.""" + scheduler = create_scheduler_with_priority() + + # Create requests with mixed priorities and arrival times + priorities = [2, 1, 1, 0] # Mixed priorities + arrival_times = [1.0, 3.0, 2.0, 4.0] # Mixed arrival times + requests = create_requests_with_priority(num_requests=4, + priorities=priorities, + arrival_times=arrival_times) + + # Add requests + for request in requests: + scheduler.add_request(request) + + # Schedule and verify order + output = scheduler.schedule() + + # Should schedule all requests since they fit in budget + assert len(output.scheduled_new_reqs) == 4 + + # Expected order: + # 1. req_3 (priority 0, arrival 4.0) + # 2. req_2 (priority 1, arrival 2.0) - earlier arrival than req_1 + # 3. req_1 (priority 1, arrival 3.0) + # 4. 
req_0 (priority 2, arrival 1.0) + scheduled_req_ids = [req.req_id for req in output.scheduled_new_reqs] + assert scheduled_req_ids == ["3", "2", "1", "0"] + + +def test_priority_scheduling_preemption(): + """Test that priority scheduling preempts + lower priority requests when memory is constrained.""" + # Create scheduler with very limited memory to force preemption + scheduler = create_scheduler_with_priority( + max_num_seqs=3, # Allow multiple requests + max_num_batched_tokens=200, + num_blocks=6, # Very limited blocks to force memory pressure + block_size=16, # Standard block size + ) + + # Create initial low-priority requests that will consume most memory + low_priority_requests = create_requests_with_priority( + num_requests=2, + priorities=[5, 5], # Low priority + arrival_times=[1.0, 2.0], + num_tokens=30 # Large enough to consume significant memory + ) + + # Add and schedule low priority requests + for request in low_priority_requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 2 + + # Simulate model execution to move requests to running state + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in low_priority_requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(low_priority_requests) + }, + sampled_token_ids=[[100] for _ in low_priority_requests], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output, model_output) + + # Verify both requests are running + assert len(scheduler.running) == 2 + + # Now add a high-priority request that requires memory allocation + # This should trigger preemption due to memory constraints + high_priority_request = create_requests_with_priority( + num_requests=1, + priorities=[0], # High priority + arrival_times=[3.0], + num_tokens=30 # Large enough to require significant memory + )[0] + + scheduler.add_request(high_priority_request) + + # Schedule again - this should trigger + # preemption when trying to allocate memory + output = scheduler.schedule() + + # Due to the scheduler's design, if preemption happens + # during running request scheduling, + # waiting requests won't be scheduled in the same step + # Let's check if preemption occurred by looking at the waiting queue + + # If preemption happened, we should see requests in the + # waiting queue + if len(scheduler.waiting) > 1: # high priority + preempted request + # Preemption occurred - verify the high priority request + # gets scheduled next + output2 = scheduler.schedule() + assert len(output2.scheduled_new_reqs) == 1 + # High priority request + assert output2.scheduled_new_reqs[0].req_id == "0" + else: + # No preemption needed - all requests fit + # This is also valid behavior if memory allows + assert len(output.scheduled_new_reqs) == 1 + # High priority request + assert output.scheduled_new_reqs[0].req_id == "0" + + +def test_priority_scheduling_no_preemption_when_space_available(): + """Test that preemption doesn't happen + when there's space for new requests.""" + scheduler = create_scheduler_with_priority( + max_num_seqs=3, # Allow 3 concurrent requests + max_num_batched_tokens=200, # Sufficient token budget + ) + + # Add two low-priority running requests + low_priority_requests = create_requests_with_priority( + num_requests=2, + priorities=[5, 5], + arrival_times=[1.0, 2.0], + num_tokens=30) + + for request in low_priority_requests: + scheduler.add_request(request) + + output = scheduler.schedule() + model_output = 
ModelRunnerOutput( + req_ids=[req.request_id for req in low_priority_requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(low_priority_requests) + }, + sampled_token_ids=[[100] for _ in low_priority_requests], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output, model_output) + + # Add high-priority request + high_priority_request = create_requests_with_priority(num_requests=1, + priorities=[0], + arrival_times=[3.0], + num_tokens=30)[0] + + scheduler.add_request(high_priority_request) + + # Schedule - should not preempt since there's space + output = scheduler.schedule() + + # Should schedule the new request without preemption + assert len(output.scheduled_new_reqs) == 1 + assert len(scheduler.running) == 3 # All three requests running + assert len(scheduler.waiting) == 0 # No requests waiting + + +def test_priority_scheduling_preemption_victim_selection(): + """Test that the correct victim is selected for + preemption based on priority and arrival time.""" + # This test verifies the priority-based victim selection logic + # by checking the waiting queue order after adding requests with different + # priorities + scheduler = create_scheduler_with_priority( + max_num_seqs=1, # Force sequential processing to test priority order + ) + + # Create requests with different priorities + requests = create_requests_with_priority( + num_requests=3, + priorities=[3, 2, 0], # Different priorities: low, medium, high + arrival_times=[1.0, 2.0, 3.0], + num_tokens=10) + + # Add all requests + for request in requests: + scheduler.add_request(request) + + # Schedule - should only schedule the highest priority request + # (req_2, priority 0) + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 1 + assert output.scheduled_new_reqs[0].req_id == "2" # Highest priority + + # Verify the waiting queue has the remaining requests in priority order + assert len(scheduler.waiting) == 2 + + # Extract waiting requests and verify priority order + temp_waiting = list(scheduler.waiting) + temp_waiting.sort() # Sort by (priority, arrival_time, request) + + waiting_priorities = [priority for priority, _, _ in temp_waiting] + waiting_req_ids = [req.request_id for _, _, req in temp_waiting] + + # Should be req_1 (priority 2) then req_0 (priority 3) + assert waiting_priorities == [2, 3] + assert waiting_req_ids == ["1", "0"] + + +def test_priority_scheduling_equal_priority_preemption(): + """Test arrival time tiebreaker when requests have equal priority.""" + # This test verifies that arrival time is used as a tiebreaker for equal + # priorities + scheduler = create_scheduler_with_priority( + max_num_seqs=1, # Force sequential processing + ) + + # Create requests with same priority but different arrival times + requests = create_requests_with_priority( + num_requests=3, + priorities=[2, 2, 2], # Same priority + arrival_times=[3.0, 1.0, 2.0], # Different arrival times + num_tokens=10) + + # Add all requests + for request in requests: + scheduler.add_request(request) + + # Schedule - should schedule the request with earliest arrival time + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 1 + assert output.scheduled_new_reqs[0].req_id == "1" # Earliest arrival (1.0) + + # Verify the waiting queue has remaining requests in arrival time order + assert len(scheduler.waiting) == 2 + + # Extract waiting requests and verify arrival time order + temp_waiting = list(scheduler.waiting) + temp_waiting.sort() # Sort 
by (priority, arrival_time, request) + + waiting_arrival_times = [ + arrival_time for _, arrival_time, _ in temp_waiting + ] + waiting_req_ids = [req.request_id for _, _, req in temp_waiting] + + # Should be req_2 (arrival 2.0) then req_0 (arrival 3.0) + assert waiting_arrival_times == [2.0, 3.0] + assert waiting_req_ids == ["2", "0"] + + +def test_priority_scheduling_waiting_queue_order(): + """Test that the waiting queue maintains priority order.""" + scheduler = create_scheduler_with_priority( + max_num_seqs=1, # Only one request can run at a time + ) + + # Create multiple requests with different priorities + requests = create_requests_with_priority( + num_requests=4, + priorities=[3, 1, 2, 0], # Mixed priorities + arrival_times=[1.0, 2.0, 3.0, 4.0], + num_tokens=10) + + # Add all requests + for request in requests: + scheduler.add_request(request) + + # Schedule - should only schedule the highest priority request + # (req_3, priority 0) + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 1 + assert output.scheduled_new_reqs[0].req_id == "3" + + # Verify waiting queue has remaining requests in priority order + assert len(scheduler.waiting) == 3 + + # Extract requests from waiting queue + # (it's a heap, so we need to pop to see order) + waiting_priorities = [] + waiting_req_ids = [] + temp_waiting = list(scheduler.waiting) # Copy the heap + temp_waiting.sort() # Sort by (priority, arrival_time, request) + + for priority, arrival_time, request in temp_waiting: + waiting_priorities.append(priority) + waiting_req_ids.append(request.request_id) + + # Should be ordered by priority: req_1 (1), req_2 (2), req_0 (3) + assert waiting_req_ids == ["1", "2", "0"] + assert waiting_priorities == [1, 2, 3] + + +def test_priority_scheduling_fcfs_fallback(): + """Test that FCFS behavior is maintained when all + requests have same priority.""" + scheduler = create_scheduler_with_priority() + + # Create requests with same priority but different arrival times + priorities = [1, 1, 1, 1] # All same priority + arrival_times = [4.0, 1.0, 3.0, 2.0] # Different arrival times + requests = create_requests_with_priority(num_requests=4, + priorities=priorities, + arrival_times=arrival_times) + + # Add requests + for request in requests: + scheduler.add_request(request) + + # Schedule + output = scheduler.schedule() + + # Should schedule all requests in arrival time order + assert len(output.scheduled_new_reqs) == 4 + scheduled_req_ids = [req.req_id for req in output.scheduled_new_reqs] + + # Expected order by arrival time: + # req_1 (1.0), req_3 (2.0), req_2 (3.0), req_0 (4.0) + assert scheduled_req_ids == ["1", "3", "2", "0"] + + +def test_priority_scheduling_with_limited_slots(): + """Test priority scheduling when max_num_seqs limits concurrent requests.""" + scheduler = create_scheduler_with_priority( + max_num_seqs=2, # Only allow 2 concurrent requests + max_num_batched_tokens=1000, # Plenty of token budget + ) + + # Create requests with different priorities + requests = create_requests_with_priority( + num_requests=4, + priorities=[3, 1, 2, 0], # Mixed priorities + arrival_times=[1.0, 2.0, 3.0, 4.0], + num_tokens=10) + + # Add all requests + for request in requests: + scheduler.add_request(request) + + # Schedule - should only schedule the 2 highest priority requests + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 2 + + # Should schedule req_3 (priority 0) and req_1 (priority 1) + scheduled_req_ids = [req.req_id for req in output.scheduled_new_reqs] + assert 
"3" in scheduled_req_ids # Priority 0 + assert "1" in scheduled_req_ids # Priority 1 + + # Remaining requests should be in waiting queue in priority order + assert len(scheduler.waiting) == 2 + + # Extract waiting requests and verify order + temp_waiting = list(scheduler.waiting) + temp_waiting.sort() + waiting_priorities = [priority for priority, _, _ in temp_waiting] + waiting_req_ids = [req.request_id for _, _, req in temp_waiting] + + # Should be req_2 (priority 2) then req_0 (priority 3) + assert waiting_priorities == [2, 3] + assert waiting_req_ids == ["2", "0"] + + +def test_priority_scheduling_heap_property(): + """Test that the waiting queue maintains heap + property for priority scheduling.""" + scheduler = create_scheduler_with_priority( + max_num_seqs=1, # Only one request can run at a time + ) + + # Add requests in random priority order + priorities = [5, 1, 8, 3, 2, 7, 4, 6] + arrival_times = [float(i) for i in range(len(priorities))] + requests = create_requests_with_priority(num_requests=len(priorities), + priorities=priorities, + arrival_times=arrival_times, + num_tokens=10) + + # Add all requests + for request in requests: + scheduler.add_request(request) + + # Schedule one request at a time and verify priority order + scheduled_priorities = [] + + while scheduler.waiting: + output = scheduler.schedule() + if output.scheduled_new_reqs: + req = output.scheduled_new_reqs[0] + scheduled_priorities.append(requests[int(req.req_id)].priority) + + # Simulate completion to make room for next request + model_output = ModelRunnerOutput( + req_ids=[req.req_id], + req_id_to_index={req.req_id: 0}, + sampled_token_ids=[[100]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output, model_output) + + # Finish the request to make room for the next one + scheduler.finish_requests(req.req_id, + RequestStatus.FINISHED_STOPPED) + + # Verify requests were scheduled in priority order (lowest value first) + expected_priorities = sorted(priorities) + assert scheduled_priorities == expected_priorities diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ce16a1ed5a09..89e4016a7b97 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -2,10 +2,11 @@ from __future__ import annotations +import heapq import time from collections import defaultdict, deque from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Any, Optional, Union, cast from vllm.config import VllmConfig from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch @@ -63,8 +64,8 @@ def __init__( # Scheduling constraints. self.max_num_running_reqs = self.scheduler_config.max_num_seqs - self.max_num_scheduled_tokens = \ - self.scheduler_config.max_num_batched_tokens + self.max_num_scheduled_tokens = ( + self.scheduler_config.max_num_batched_tokens) self.max_model_len = self.scheduler_config.max_model_len self.enable_kv_cache_events = ( self.kv_events_config is not None @@ -88,8 +89,12 @@ def __init__( # req_id -> Request self.requests: dict[str, Request] = {} + # Scheduling policy + self.policy = self.scheduler_config.policy # Priority queues for requests. 
- self.waiting: deque[Request] = deque() + self.waiting: Union[list[tuple[int, float, Request]], + deque[Request]] = ([] if self.policy == "priority" + else deque()) self.running: list[Request] = [] # The request IDs that are finished in between the previous and the @@ -104,8 +109,8 @@ def __init__( # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating # them at each scheduling step. # Request id -> deque of CachedRequestData - self._cached_reqs_data: dict[ - str, deque[CachedRequestData]] = defaultdict(deque) + self._cached_reqs_data: dict[str, deque[CachedRequestData]] = ( + defaultdict(deque)) # Encoder-related. # Calculate encoder cache size if applicable @@ -209,10 +214,16 @@ def schedule(self) -> SchedulerOutput: encoder_inputs_to_schedule = None new_encoder_budget = encoder_budget if request.has_encoder_inputs: - (encoder_inputs_to_schedule, num_new_tokens, - new_encoder_budget) = self._try_schedule_encoder_inputs( - request, request.num_computed_tokens, num_new_tokens, - encoder_budget) + ( + encoder_inputs_to_schedule, + num_new_tokens, + new_encoder_budget, + ) = self._try_schedule_encoder_inputs( + request, + request.num_computed_tokens, + num_new_tokens, + encoder_budget, + ) if num_new_tokens == 0: # The request cannot be scheduled because one of the following @@ -230,18 +241,33 @@ def schedule(self) -> SchedulerOutput: num_draft_tokens = max( num_new_tokens + request.num_computed_tokens - - request.num_tokens, 0) + request.num_tokens, + 0, + ) while True: new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens, num_draft_tokens=num_draft_tokens, - num_lookahead_tokens=self.num_lookahead_tokens) + num_lookahead_tokens=self.num_lookahead_tokens, + ) if new_blocks is None: # The request cannot be scheduled. # Preempt the lowest-priority request. - preempted_req = self.running.pop() + if not self.running: + # No request to preempt. + can_schedule = False + break + if self.policy == "priority": + preempted_req = min( + self.running, + key=lambda r: (r.priority, r.arrival_time), + ) + self.running.remove(preempted_req) + else: + preempted_req = self.running.pop() + self.kv_cache_manager.free(preempted_req) preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 @@ -249,7 +275,19 @@ def schedule(self) -> SchedulerOutput: preempted_req.record_event( EngineCoreEventType.PREEMPTED, scheduled_timestamp) - self.waiting.appendleft(preempted_req) + if self.policy == "priority": + heapq.heappush( + cast(list[tuple[int, float, Request]], + self.waiting), + ( + preempted_req.priority, + preempted_req.arrival_time, + preempted_req, + ), + ) + else: + cast(deque[Request], + self.waiting).appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: # No more request to preempt. @@ -307,7 +345,10 @@ def schedule(self) -> SchedulerOutput: # Use a temporary deque to collect requests that need to be skipped # and put back at the head of the waiting queue later - skipped_waiting_requests: deque[Request] = deque() + skipped_waiting_requests: Union[list[tuple[int, float, Request]], + deque[Request]] = ([] if self.policy + == "priority" else + deque()) # Next, schedule the WAITING requests. 
if not preempted_reqs: @@ -315,7 +356,14 @@ def schedule(self) -> SchedulerOutput: if len(self.running) == self.max_num_running_reqs: break - request = self.waiting[0] + if self.policy == "priority": + if (not self.waiting + ): # Should not happen due to outer loop condition + break + priority_val, arrival_time_val, request = heapq.heappop( + cast(list[tuple[int, float, Request]], self.waiting)) + else: + request = cast(deque[Request], self.waiting).popleft() # KVTransfer: skip request if still waiting for remote kvs. if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: @@ -325,9 +373,18 @@ def schedule(self) -> SchedulerOutput: else: logger.debug( "%s is still in WAITING_FOR_REMOTE_KVS state.", - request.request_id) - self.waiting.popleft() - skipped_waiting_requests.appendleft(request) + request.request_id, + ) + if self.policy == "priority": + waiting_queue = cast( + list[tuple[int, float, Request]], + skipped_waiting_requests) + heapq.heappush( + waiting_queue, + (priority_val, arrival_time_val, request)) + else: + cast(deque[Request], + skipped_waiting_requests).appendleft(request) continue # Skip request if the structured output request is still waiting @@ -337,19 +394,33 @@ def schedule(self) -> SchedulerOutput: if structured_output_req and structured_output_req.grammar: request.status = RequestStatus.WAITING else: - self.waiting.popleft() - skipped_waiting_requests.appendleft(request) + if self.policy == "priority": + waiting_queue = cast( + list[tuple[int, float, Request]], + skipped_waiting_requests) + heapq.heappush( + waiting_queue, + (priority_val, arrival_time_val, request)) + else: + cast(deque[Request], + skipped_waiting_requests).appendleft(request) continue # Check that adding the request still respects the max_loras # constraint. - if self.lora_config and request.lora_request and ( - len(scheduled_loras) == self.lora_config.max_loras - and request.lora_request.lora_int_id - not in scheduled_loras): + if (self.lora_config and request.lora_request and + (len(scheduled_loras) == self.lora_config.max_loras and + request.lora_request.lora_int_id not in scheduled_loras)): # Scheduling would exceed max_loras, skip. - self.waiting.popleft() - skipped_waiting_requests.appendleft(request) + if self.policy == "priority": + waiting_queue = cast(list[tuple[int, float, Request]], + skipped_waiting_requests) + heapq.heappush( + waiting_queue, + (priority_val, arrival_time_val, request)) + else: + cast(deque[Request], + skipped_waiting_requests).appendleft(request) continue num_external_computed_tokens = 0 @@ -358,9 +429,8 @@ def schedule(self) -> SchedulerOutput: # Get already-cached tokens. if request.num_computed_tokens == 0: # Get locally-cached tokens. - new_computed_blocks, num_new_local_computed_tokens = \ - self.kv_cache_manager.get_computed_blocks( - request) + new_computed_blocks, num_new_local_computed_tokens = ( + self.kv_cache_manager.get_computed_blocks(request)) # Get externally-cached tokens if using a KVConnector. if self.connector is not None: @@ -400,11 +470,16 @@ def schedule(self) -> SchedulerOutput: # Schedule encoder inputs. 
if request.has_encoder_inputs: - (encoder_inputs_to_schedule, num_new_tokens, - new_encoder_budget - ) = self._try_schedule_encoder_inputs( - request, num_computed_tokens, num_new_tokens, - encoder_budget) + ( + encoder_inputs_to_schedule, + num_new_tokens, + new_encoder_budget, + ) = self._try_schedule_encoder_inputs( + request, + num_computed_tokens, + num_new_tokens, + encoder_budget, + ) if num_new_tokens == 0: # The request cannot be scheduled. break @@ -419,6 +494,15 @@ def schedule(self) -> SchedulerOutput: ) if new_blocks is None: # The request cannot be scheduled. + if self.policy == "priority": + waiting_queue = cast(list[tuple[int, float, Request]], + skipped_waiting_requests) + heapq.heappush( + waiting_queue, + (priority_val, arrival_time_val, request)) + else: + # For FCFS, push back to the front of the deque. + cast(deque[Request], self.waiting).appendleft(request) break # KVConnector: update internal state after allocation. @@ -432,17 +516,27 @@ def schedule(self) -> SchedulerOutput: num_external_computed_tokens, ) - self.waiting.popleft() + # Request was already popped from self.waiting + # (either via heapq.heappop or self.waiting.popleft()) + # unless it was re-added above due to new_blocks being None. if load_kv_async: # If loading async, allocate memory and put request # into the WAITING_FOR_REMOTE_KV state. - skipped_waiting_requests.appendleft(request) + if self.policy == "priority": + waiting_queue = cast(list[tuple[int, float, Request]], + skipped_waiting_requests) + heapq.heappush( + waiting_queue, + (priority_val, arrival_time_val, request)) + else: + cast(deque[Request], + skipped_waiting_requests).appendleft(request) request.status = RequestStatus.WAITING_FOR_REMOTE_KVS continue if request.use_structured_output: - structured_output_request_ids[ - request.request_id] = req_index + structured_output_request_ids[request.request_id] = ( + req_index) req_index += 1 self.running.append(request) if self.log_stats: @@ -478,7 +572,17 @@ def schedule(self) -> SchedulerOutput: # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: - self.waiting.extendleft(skipped_waiting_requests) + if self.policy == "priority": + waiting_queue = cast(list[tuple[int, float, Request]], + self.waiting) + skipped_queue = cast(list[tuple[int, float, Request]], + skipped_waiting_requests) + for item in skipped_queue: + heapq.heappush(waiting_queue, item) + else: # FCFS + waiting_deque = cast(deque[Request], self.waiting) + skipped_deque = cast(deque[Request], skipped_waiting_requests) + waiting_deque.extendleft(skipped_deque) # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) @@ -488,8 +592,8 @@ def schedule(self) -> SchedulerOutput: # Since some requests in the RUNNING queue may not be scheduled in # this step, the total number of scheduled requests can be smaller than # len(self.running). - assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + - len(scheduled_running_reqs) <= len(self.running)) + assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len( + scheduled_running_reqs) <= len(self.running) # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. 
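The heap-based waiting queue introduced above stores `(priority, arrival_time, request)` tuples, so `heapq.heappop` always yields the lowest priority value first and falls back to arrival time on ties. A minimal, self-contained sketch of that ordering (plain request-id strings stand in for `Request` objects; the values are illustrative only):

import heapq

# Each waiting entry is (priority, arrival_time, request): lower priority
# values are popped first, and arrival_time breaks ties, which preserves
# FCFS ordering within a priority level.
waiting: list[tuple[int, float, str]] = []
heapq.heappush(waiting, (2, 1.0, "req-a"))
heapq.heappush(waiting, (0, 3.0, "req-b"))
heapq.heappush(waiting, (1, 2.0, "req-c"))
heapq.heappush(waiting, (1, 1.5, "req-d"))

while waiting:
    priority, arrival_time, req_id = heapq.heappop(waiting)
    print(priority, arrival_time, req_id)
# 0 3.0 req-b
# 1 1.5 req-d
# 1 2.0 req-c
# 2 1.0 req-a

Because tuples compare element-wise, equal-priority entries are ordered purely by arrival time, which is exactly the FCFS tie-breaker behavior the tests above assert.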
@@ -734,7 +838,8 @@ def update_from_output( spec_decoding_stats = self.make_spec_decoding_stats( spec_decoding_stats, num_draft_tokens=len(scheduled_spec_token_ids), - num_accepted_tokens=len(generated_token_ids) - 1) + num_accepted_tokens=len(generated_token_ids) - 1, + ) cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) @@ -862,7 +967,13 @@ def get_request_counts(self) -> tuple[int, int]: return len(self.running), len(self.waiting) def add_request(self, request: Request) -> None: - self.waiting.append(request) + if self.policy == "priority": + heapq.heappush( + cast(list[tuple[int, float, Request]], self.waiting), + (request.priority, request.arrival_time, request), + ) + else: + cast(deque[Request], self.waiting).append(request) self.requests[request.request_id] = request if self.log_stats: request.record_event(EngineCoreEventType.QUEUED) @@ -892,7 +1003,7 @@ def finish_requests( if request.status == RequestStatus.RUNNING: self.running.remove(request) else: - self.waiting.remove(request) + cast(deque[Request], self.waiting).remove(request) request.status = finished_status self._free_request(request) @@ -921,7 +1032,7 @@ def _free_blocks(self, request: Request): del self.requests[request.request_id] def get_num_unfinished_requests(self) -> int: - return len(self.waiting) + len(self.running) + return len(cast(deque[Request], self.waiting)) + len(self.running) def has_finished_requests(self) -> bool: return len(self.finished_req_ids) > 0 @@ -939,7 +1050,7 @@ def make_stats( assert prefix_cache_stats is not None return SchedulerStats( num_running_reqs=len(self.running), - num_waiting_reqs=len(self.waiting), + num_waiting_reqs=len(cast(deque[Request], self.waiting)), gpu_cache_usage=self.kv_cache_manager.usage, prefix_cache_stats=prefix_cache_stats, spec_decoding_stats=spec_decoding_stats, @@ -957,7 +1068,8 @@ def make_spec_decoding_stats( spec_decoding_stats = SpecDecodingStats.new(self.num_spec_tokens) spec_decoding_stats.observe_draft( num_draft_tokens=num_draft_tokens, - num_accepted_tokens=num_accepted_tokens) + num_accepted_tokens=num_accepted_tokens, + ) return spec_decoding_stats def shutdown(self) -> None: @@ -981,8 +1093,8 @@ def _connector_finished( """ if self.connector is None: return False, None - assert len(self.kv_cache_config.kv_cache_groups - ) == 1, "KV connector only supports one KV cache group now" + assert (len(self.kv_cache_config.kv_cache_groups) == 1 + ), "KV connector only supports one KV cache group now" block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] return self.connector.request_finished(request, block_ids) @@ -1000,8 +1112,8 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: """ if request.request_id not in self.finished_recving_kv_req_ids: return False - assert len(self.kv_cache_config.kv_cache_groups - ) == 1, "KV connector only supports one KV cache group now" + assert (len(self.kv_cache_config.kv_cache_groups) == 1 + ), "KV connector only supports one KV cache group now" # Now that the blocks are ready, actually cache them. block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] num_computed_tokens = len(block_ids) * self.block_size @@ -1032,9 +1144,9 @@ def _update_from_kv_xfer_finished(self, scheduler the request during the next step. """ # P/D: update recv and send status from last step. 
- for req_id in (model_runner_output.finished_recving or ()): + for req_id in model_runner_output.finished_recving or (): logger.debug("Finished recving KV transfer for request %s", req_id) self.finished_recving_kv_req_ids.add(req_id) - for req_id in (model_runner_output.finished_sending or ()): + for req_id in model_runner_output.finished_sending or (): logger.debug("Finished sending KV transfer for request %s", req_id) self._free_blocks(self.requests[req_id]) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 0c9f61a76427..46710e98db97 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -63,6 +63,7 @@ class EngineCoreRequest( # belong to, to cover a race condition where the request is sent before # a wave finished notification is received. current_wave: int = 0 + priority: int = 0 class EngineCoreEventType(enum.IntEnum): diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 64a756148780..15d9f5e917dd 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -217,8 +217,6 @@ def process_inputs( # TODO(woosuk): Support encoder-decoder models. self._validate_lora(lora_request) self._validate_params(params, lora_request) - if priority != 0: - raise ValueError("V1 does not support priority yet.") if trace_headers is not None: raise ValueError("V1 does not support tracing yet.") if prompt_adapter_request is not None: @@ -327,6 +325,7 @@ def process_inputs( arrival_time=arrival_time, lora_request=lora_request, cache_salt=decoder_inputs.get("cache_salt"), + priority=priority, ) def _validate_model_inputs(self, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 42c75ef96401..350826386c0a 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import enum +import time from typing import TYPE_CHECKING, Any, Optional, Union from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange @@ -27,17 +28,22 @@ def __init__( sampling_params: SamplingParams, eos_token_id: Optional[int], client_index: int = 0, + arrival_time: Optional[float] = None, lora_request: Optional["LoRARequest"] = None, structured_output_request: Optional["StructuredOutputRequest"] = None, cache_salt: Optional[str] = None, + priority: int = 0, ) -> None: self.request_id = request_id self.client_index = client_index + self.priority = priority self.sampling_params = sampling_params # Because of LoRA, the eos token id can be different for each request. 
self.eos_token_id = eos_token_id self.lora_request = lora_request self.structured_output_request = structured_output_request + self.arrival_time = arrival_time if arrival_time is not None else \ + time.time() self.status = (RequestStatus.WAITING_FOR_FSM if sampling_params.guided_decoding is not None else @@ -91,17 +97,18 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": return cls( request_id=request.request_id, - client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, multi_modal_inputs=request.mm_inputs, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, sampling_params=request.sampling_params, eos_token_id=request.eos_token_id, + arrival_time=request.arrival_time, lora_request=request.lora_request, structured_output_request=StructuredOutputRequest( sampling_params=request.sampling_params), cache_salt=request.cache_salt, + priority=request.priority, ) def append_output_token_ids( From 4b2e513da7e23f1030adec07e78f7a1e599a4f10 Mon Sep 17 00:00:00 2001 From: amit Date: Tue, 3 Jun 2025 18:35:24 +0300 Subject: [PATCH 02/33] style(docs): split long line and wrap paragraph in note admonition Signed-off-by: amit --- docs/usage/v1_guide.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index b5a35c04a287..af0ef74026e2 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -72,7 +72,13 @@ This living user guide outlines a few known **important changes and limitations* way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically allocate a fixed token budget per request, enabling features like chunked prefills, prefix caching, and speculative decoding without a strict separation between prefill -and decode phases. The V1 scheduler supports multiple scheduling policies, including First-Come, First-Served (FCFS) and priority-based scheduling (where requests are processed based on assigned priority, with FCFS as a tie-breaker), configurable via the `--scheduling-policy` argument. +and decode phases. + +!!! note + The V1 scheduler supports multiple scheduling policies, including First-Come, + First-Served (FCFS) and priority-based scheduling (where requests are processed + based on assigned priority, with FCFS as a tie-breaker), configurable via the + `--scheduling-policy` argument. ### Semantic Changes and Deprecated Features From 419de95f761068c34277f6cf54917e151326ee76 Mon Sep 17 00:00:00 2001 From: amit Date: Tue, 3 Jun 2025 18:54:04 +0300 Subject: [PATCH 03/33] style(docs): fix pymarkdown error Signed-off-by: amit --- docs/usage/v1_guide.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index af0ef74026e2..9992884a6ab1 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -68,17 +68,17 @@ This living user guide outlines a few known **important changes and limitations* - **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later. - **🔴 Deprecated**: Not planned for V1 unless there is strong demand. -**Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same -way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically -allocate a fixed token budget per request, enabling features like chunked prefills, -prefix caching, and speculative decoding without a strict separation between prefill -and decode phases. - !!! 
note - The V1 scheduler supports multiple scheduling policies, including First-Come, - First-Served (FCFS) and priority-based scheduling (where requests are processed - based on assigned priority, with FCFS as a tie-breaker), configurable via the - `--scheduling-policy` argument. + vLLM V1’s unified scheduler treats both prompt and output tokens the same + way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically + allocate a fixed token budget per request, enabling features like chunked prefills, + prefix caching, and speculative decoding without a strict separation between prefill + and decode phases. + +The V1 scheduler supports multiple scheduling policies, including First-Come, +First-Served (FCFS) and priority-based scheduling (where requests are processed +based on assigned priority, with FCFS as a tie-breaker), configurable via the +`--scheduling-policy` argument. ### Semantic Changes and Deprecated Features From de76b7e861f1c865e5ab1620c416f66938235675 Mon Sep 17 00:00:00 2001 From: amit Date: Sun, 8 Jun 2025 10:00:10 +0300 Subject: [PATCH 04/33] Merge remote-tracking branch 'upstream/main' into v1-priority-schedular Signed-off-by: amit --- .buildkite/check-wheel-size.py | 1 + .buildkite/generate_index.py | 1 + .buildkite/lm-eval-harness/conftest.py | 1 + .../test_lm_eval_correctness.py | 1 + .../convert-results-json-to-markdown.py | 1 + .../scripts/download-tokenizer.py | 1 + .../scripts/generate-nightly-markdown.py | 1 + .../scripts/get-lmdeploy-modelname.py | 1 + .../scripts/summary-nightly-results.py | 1 + .buildkite/release-pipeline.yaml | 19 +- .buildkite/scripts/annotate-release.sh | 31 + .../hardware_ci/run-cpu-test-ppc64le.sh | 3 +- .../scripts/hardware_ci/run-cpu-test.sh | 13 +- .../scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- .buildkite/scripts/tpu/cleanup_docker.sh | 24 + .buildkite/scripts/tpu/config_v6e_1.env | 14 + .buildkite/scripts/tpu/docker_run_bm.sh | 102 ++++ .buildkite/scripts/tpu/run_bm.sh | 94 +++ .buildkite/test-pipeline.yaml | 6 + .github/CODEOWNERS | 16 +- .github/PULL_REQUEST_TEMPLATE.md | 13 +- .pre-commit-config.yaml | 2 + CMakeLists.txt | 5 +- README.md | 10 +- benchmarks/auto_tune.sh | 164 +++-- benchmarks/backend_request_func.py | 1 + benchmarks/benchmark_dataset.py | 11 +- benchmarks/benchmark_latency.py | 1 + .../benchmark_long_document_qa_throughput.py | 1 + benchmarks/benchmark_prefix_caching.py | 1 + benchmarks/benchmark_prioritization.py | 1 + benchmarks/benchmark_serving.py | 1 + .../benchmark_serving_structured_output.py | 2 +- benchmarks/benchmark_throughput.py | 1 + benchmarks/benchmark_utils.py | 8 +- .../cutlass_benchmarks/sparse_benchmarks.py | 1 + benchmarks/cutlass_benchmarks/utils.py | 1 + .../cutlass_benchmarks/w8a8_benchmarks.py | 1 + .../cutlass_benchmarks/weight_shapes.py | 1 + .../disagg_prefill_proxy_server.py | 1 + .../disagg_benchmarks/round_robin_proxy.py | 1 + .../visualize_benchmark_results.py | 1 + .../fused_kernels/layernorm_rms_benchmarks.py | 1 + benchmarks/kernels/bench_fp8_gemm.py | 3 +- benchmarks/kernels/benchmark_aqlm.py | 1 + benchmarks/kernels/benchmark_bitblas.py | 1 + .../kernels/benchmark_cutlass_fp4_moe.py | 3 +- .../kernels/benchmark_grouped_gemm_cutlass.py | 62 +- benchmarks/kernels/benchmark_layernorm.py | 1 + benchmarks/kernels/benchmark_lora.py | 1 + benchmarks/kernels/benchmark_machete.py | 1 + benchmarks/kernels/benchmark_marlin.py | 1 + benchmarks/kernels/benchmark_moe.py | 1 + .../benchmark_moe_permute_unpermute.py | 1 + .../kernels/benchmark_paged_attention.py | 1 + 
benchmarks/kernels/benchmark_quant.py | 1 + benchmarks/kernels/benchmark_rmsnorm.py | 1 + benchmarks/kernels/benchmark_rope.py | 1 + benchmarks/kernels/benchmark_shapes.py | 1 + .../kernels/benchmark_w8a8_block_fp8.py | 1 + .../benchmark_fp8_block_dense_gemm.py | 1 + benchmarks/kernels/graph_machete_bench.py | 1 + benchmarks/kernels/utils.py | 1 + benchmarks/kernels/weight_shapes.py | 1 + benchmarks/overheads/benchmark_hashing.py | 1 + cmake/hipify.py | 1 + csrc/attention/mla/cutlass_mla_kernels.cu | 2 +- .../vllm_cutlass_library_extension.py | 1 + csrc/moe/marlin_moe_wna16/generate_kernels.py | 1 + csrc/moe/moe_ops.h | 6 +- csrc/moe/moe_permute_unpermute_op.cu | 56 ++ csrc/moe/permute_unpermute_kernels/dispatch.h | 18 +- csrc/moe/topk_softmax_kernels.cu | 16 +- csrc/moe/torch_bindings.cpp | 6 + csrc/ops.h | 19 +- .../c3x/scaled_mm_blockwise_sm100_fp8.cu | 4 - ...scaled_mm_blockwise_sm100_fp8_dispatch.cuh | 206 +++++-- .../c3x/scaled_mm_sm100_fp8_dispatch.cuh | 53 +- .../cutlass_w8a8/moe/grouped_mm_c3x.cu | 29 +- .../cutlass_w8a8/moe/grouped_mm_c3x.cuh | 6 +- .../quantization/cutlass_w8a8/moe/moe_data.cu | 78 ++- .../cutlass_w8a8/scaled_mm_entry.cu | 48 +- csrc/quantization/fp8/common.cu | 35 +- csrc/quantization/fp8/common.cuh | 68 ++- .../fused_kernels/layernorm_utils.cuh | 99 +-- .../gptq_marlin/generate_kernels.py | 1 + csrc/quantization/machete/generate.py | 1 + csrc/quantization/vectorization.cuh | 23 +- csrc/sampler.cu | 86 +++ csrc/torch_bindings.cpp | 28 +- docker/Dockerfile.nightly_torch | 3 + docker/Dockerfile.ppc64le | 96 ++- docker/Dockerfile.rocm | 5 +- docs/cli/README.md | 2 + docs/contributing/ci-failures.md | 120 ++++ docs/deployment/docker.md | 4 +- docs/features/tool_calling.md | 6 +- docs/mkdocs/hooks/generate_examples.py | 1 + docs/mkdocs/hooks/remove_announcement.py | 1 + docs/mkdocs/hooks/url_schemes.py | 1 + docs/models/extensions/tensorizer.md | 2 +- docs/models/supported_models.md | 1 + docs/usage/v1_guide.md | 8 +- examples/offline_inference/audio_language.py | 1 + .../automatic_prefix_caching.py | 1 + examples/offline_inference/basic/basic.py | 1 + examples/offline_inference/basic/chat.py | 1 + examples/offline_inference/basic/classify.py | 1 + examples/offline_inference/basic/embed.py | 1 + examples/offline_inference/basic/generate.py | 1 + examples/offline_inference/basic/score.py | 1 + .../offline_inference/batch_llm_inference.py | 1 + examples/offline_inference/chat_with_tools.py | 1 + .../offline_inference/context_extension.py | 82 ++- examples/offline_inference/data_parallel.py | 1 + .../decode_example.py | 1 + .../prefill_example.py | 1 + .../disaggregated_prefill.py | 1 + examples/offline_inference/eagle.py | 1 + .../embed_jina_embeddings_v3.py | 1 + .../offline_inference/embed_matryoshka_fy.py | 1 + examples/offline_inference/encoder_decoder.py | 1 + .../encoder_decoder_multimodal.py | 1 + .../offline_inference/llm_engine_example.py | 1 + .../offline_inference/load_sharded_state.py | 1 + .../lora_with_quantization_inference.py | 1 + examples/offline_inference/metrics.py | 1 + examples/offline_inference/mistral-small.py | 1 + examples/offline_inference/mlpspeculator.py | 1 + .../offline_inference/multilora_inference.py | 1 + examples/offline_inference/neuron.py | 1 + examples/offline_inference/neuron_eagle.py | 1 + .../neuron_int8_quantization.py | 1 + .../offline_inference/neuron_multimodal.py | 1 + .../offline_inference/neuron_speculation.py | 1 + examples/offline_inference/prefix_caching.py | 1 + .../prithvi_geospatial_mae.py | 1 + 
examples/offline_inference/profiling.py | 1 + .../profiling_tpu/profiling.py | 1 + .../prompt_embed_inference.py | 1 + .../qwen2_5_omni/only_thinker.py | 1 + examples/offline_inference/qwen_1m.py | 1 + examples/offline_inference/reproducibility.py | 1 + examples/offline_inference/rlhf.py | 1 + examples/offline_inference/rlhf_colocate.py | 1 + examples/offline_inference/rlhf_utils.py | 1 + .../offline_inference/save_sharded_state.py | 1 + .../offline_inference/simple_profiling.py | 1 + .../offline_inference/structured_outputs.py | 1 + .../offline_inference/torchrun_example.py | 1 + examples/offline_inference/tpu.py | 1 + examples/offline_inference/vision_language.py | 1 + .../vision_language_embedding.py | 1 + .../vision_language_multi_image.py | 37 +- examples/online_serving/api_client.py | 1 + .../online_serving/cohere_rerank_client.py | 1 + .../disagg_proxy_demo.py | 1 + .../gradio_openai_chatbot_webserver.py | 1 + examples/online_serving/gradio_webserver.py | 1 + .../online_serving/jinaai_rerank_client.py | 1 + .../online_serving/kv_events_subscriber.py | 1 + .../multi_instance_data_parallel.py | 58 ++ .../openai_chat_completion_client.py | 1 + ...i_chat_completion_client_for_multimodal.py | 1 + ...penai_chat_completion_client_with_tools.py | 1 + ...t_completion_client_with_tools_required.py | 1 + ...enai_chat_completion_structured_outputs.py | 2 +- ...etion_structured_outputs_structural_tag.py | 1 + ...etion_structured_outputs_with_reasoning.py | 1 + ...at_completion_tool_calls_with_reasoning.py | 1 + .../openai_chat_completion_with_reasoning.py | 1 + ...hat_completion_with_reasoning_streaming.py | 1 + ...ai_chat_embedding_client_for_multimodal.py | 1 + .../openai_classification_client.py | 1 + .../openai_completion_client.py | 1 + .../openai_cross_encoder_score.py | 1 + .../online_serving/openai_embedding_client.py | 1 + .../openai_embedding_matryoshka_fy.py | 1 + .../online_serving/openai_pooling_client.py | 1 + .../openai_transcription_client.py | 1 + .../opentelemetry/dummy_client.py | 1 + ...ompt_embed_inference_with_openai_client.py | 1 + examples/online_serving/ray_serve_deepseek.py | 1 + ...val_augmented_generation_with_langchain.py | 1 + ...al_augmented_generation_with_llamaindex.py | 1 + .../streamlit_openai_chatbot_webserver.py | 1 + examples/online_serving/utils.py | 1 + .../others/lmcache/cpu_offload_lmcache.py | 1 + .../lmcache/disagg_prefill_lmcache_v0.py | 1 + .../disagg_example_nixl.sh | 6 +- .../disagg_proxy_server.py | 1 + .../lmcache/kv_cache_sharing_lmcache_v1.py | 1 + examples/others/tensorize_vllm_model.py | 1 + examples/tool_chat_template_deepseekr1.jinja | 92 +++ find_cuda_init.py | 1 + requirements/common.txt | 2 +- requirements/cpu.txt | 3 + requirements/nightly_torch_test.txt | 9 +- requirements/rocm.txt | 3 +- requirements/test.in | 2 +- requirements/test.txt | 2 +- requirements/tpu.txt | 10 +- setup.py | 2 + tests/async_engine/api_server_async_engine.py | 1 + tests/async_engine/conftest.py | 1 + tests/async_engine/test_api_server.py | 1 + tests/async_engine/test_async_llm_engine.py | 23 + tests/async_engine/test_request_tracker.py | 1 + .../test_basic_correctness.py | 29 +- .../basic_correctness/test_chunked_prefill.py | 1 + tests/basic_correctness/test_cpu_offload.py | 1 + tests/basic_correctness/test_cumem.py | 1 + tests/basic_correctness/test_preemption.py | 1 + tests/benchmarks/test_latency_cli.py | 1 + tests/benchmarks/test_serve_cli.py | 1 + tests/benchmarks/test_throughput_cli.py | 1 + tests/build_cython.py | 1 + tests/compile/backend.py | 1 + 
tests/compile/conftest.py | 1 + .../compile/piecewise/test_full_cudagraph.py | 8 +- tests/compile/piecewise/test_simple.py | 3 +- tests/compile/piecewise/test_toy_llama.py | 7 +- tests/compile/test_async_tp.py | 1 + tests/compile/test_basic_correctness.py | 1 + tests/compile/test_config.py | 43 ++ tests/compile/test_full_graph.py | 1 + tests/compile/test_functionalization.py | 1 + tests/compile/test_fusion.py | 1 + tests/compile/test_pass_manager.py | 1 + tests/compile/test_sequence_parallelism.py | 1 + tests/compile/test_silu_mul_quant_fusion.py | 1 + tests/compile/test_wrapper.py | 1 + tests/conftest.py | 1 + tests/core/block/conftest.py | 1 + tests/core/block/e2e/conftest.py | 1 + tests/core/block/e2e/test_correctness.py | 1 + .../e2e/test_correctness_sliding_window.py | 1 + tests/core/block/test_block_manager.py | 1 + tests/core/block/test_block_table.py | 1 + tests/core/block/test_common.py | 1 + .../block/test_cpu_gpu_block_allocator.py | 1 + tests/core/block/test_naive_block.py | 1 + tests/core/block/test_prefix_caching_block.py | 1 + tests/core/conftest.py | 1 + tests/core/test_chunked_prefill_scheduler.py | 1 + tests/core/test_num_computed_tokens_update.py | 1 + tests/core/test_scheduler.py | 1 + tests/core/test_scheduler_encoder_decoder.py | 1 + tests/core/test_serialization.py | 1 + tests/core/utils.py | 1 + tests/detokenizer/conftest.py | 1 + .../test_disable_detokenization.py | 1 + tests/detokenizer/test_stop_checker.py | 1 + tests/detokenizer/test_stop_reason.py | 1 + tests/detokenizer/test_stop_strings.py | 1 + tests/distributed/conftest.py | 108 ++-- tests/distributed/test_ca_buffer_sharing.py | 1 + tests/distributed/test_comm_ops.py | 1 + tests/distributed/test_custom_all_reduce.py | 1 + tests/distributed/test_distributed_oot.py | 1 + tests/distributed/test_events.py | 70 ++- tests/distributed/test_expert_parallel.py | 1 + .../distributed/test_multi_node_assignment.py | 1 + tests/distributed/test_pipeline_parallel.py | 1 + tests/distributed/test_pipeline_partition.py | 1 + tests/distributed/test_pp_cudagraph.py | 1 + tests/distributed/test_pynccl.py | 1 + tests/distributed/test_same_node.py | 1 + tests/distributed/test_sequence_parallel.py | 1 + tests/distributed/test_shm_broadcast.py | 1 + tests/distributed/test_torchrun_example.py | 1 + tests/distributed/test_utils.py | 1 + tests/encoder_decoder/test_e2e_correctness.py | 1 + tests/engine/conftest.py | 1 + tests/engine/test_arg_utils.py | 19 +- tests/engine/test_computed_prefix_blocks.py | 1 + tests/engine/test_executor.py | 1 + .../test_multi_step_output_processor.py | 1 + tests/engine/test_multiproc_workers.py | 1 + tests/engine/test_options.py | 1 + tests/engine/test_short_mm_context.py | 1 + tests/entrypoints/conftest.py | 1 + tests/entrypoints/llm/test_accuracy.py | 1 + tests/entrypoints/llm/test_chat.py | 1 + tests/entrypoints/llm/test_collective_rpc.py | 1 + tests/entrypoints/llm/test_encode.py | 1 + tests/entrypoints/llm/test_generate.py | 1 + .../llm/test_generate_multiple_loras.py | 1 + tests/entrypoints/llm/test_gpu_utilization.py | 1 + tests/entrypoints/llm/test_guided_generate.py | 1 + tests/entrypoints/llm/test_lazy_outlines.py | 1 + .../entrypoints/llm/test_prompt_validation.py | 1 + .../offline_mode/test_offline_mode.py | 1 + .../openai/correctness/test_lmeval.py | 1 + .../openai/correctness/test_mteb.py | 1 + .../test_transcription_api_correctness.py | 1 + .../openai/test_async_tokenization.py | 1 + tests/entrypoints/openai/test_audio.py | 1 + tests/entrypoints/openai/test_basic.py | 1 + 
tests/entrypoints/openai/test_chat.py | 1 + tests/entrypoints/openai/test_chat_echo.py | 1 + .../openai/test_chat_logit_bias_validation.py | 1 + .../entrypoints/openai/test_chat_template.py | 1 + .../openai/test_chat_with_tool_reasoning.py | 1 + .../entrypoints/openai/test_chunked_prompt.py | 1 + .../entrypoints/openai/test_classification.py | 1 + tests/entrypoints/openai/test_cli_args.py | 1 + tests/entrypoints/openai/test_completion.py | 1 + .../test_completion_with_function_calling.py | 76 ++- .../test_completion_with_prompt_embeds.py | 1 + tests/entrypoints/openai/test_embedding.py | 1 + .../openai/test_embedding_dimensions.py | 1 + .../openai/test_encoder_decoder.py | 1 + .../entrypoints/openai/test_lora_adapters.py | 1 + .../entrypoints/openai/test_lora_resolvers.py | 1 + tests/entrypoints/openai/test_metrics.py | 1 + tests/entrypoints/openai/test_models.py | 1 + .../openai/test_oot_registration.py | 1 + .../entrypoints/openai/test_openai_schema.py | 1 + tests/entrypoints/openai/test_pooling.py | 1 + .../openai/test_prompt_validation.py | 1 + tests/entrypoints/openai/test_rerank.py | 1 + .../openai/test_return_tokens_as_ids.py | 1 + tests/entrypoints/openai/test_root_path.py | 1 + tests/entrypoints/openai/test_run_batch.py | 1 + tests/entrypoints/openai/test_score.py | 1 + tests/entrypoints/openai/test_serving_chat.py | 1 + .../entrypoints/openai/test_serving_models.py | 1 + tests/entrypoints/openai/test_shutdown.py | 1 + tests/entrypoints/openai/test_sleep.py | 1 + .../openai/test_tensorizer_entrypoint.py | 1 + tests/entrypoints/openai/test_tokenization.py | 1 + .../openai/test_transcription_validation.py | 1 + tests/entrypoints/openai/test_truncation.py | 1 + tests/entrypoints/openai/test_video.py | 1 + tests/entrypoints/openai/test_vision.py | 1 + .../openai/test_vision_embedding.py | 1 + .../test_llama4_pythonic_tool_parser.py | 1 + .../tool_parsers/test_pythonic_tool_parser.py | 1 + .../entrypoints/openai/tool_parsers/utils.py | 1 + .../test_api_server_process_manager.py | 1 + tests/entrypoints/test_chat_utils.py | 1 + tests/entrypoints/test_ssl_cert_refresher.py | 1 + .../test_fastsafetensors_loader.py | 1 + .../test_weight_utils.py | 1 + tests/kernels/allclose_default.py | 1 + tests/kernels/attention/conftest.py | 1 + tests/kernels/attention/test_attention.py | 1 + .../attention/test_attention_selector.py | 6 +- .../attention/test_blocksparse_attention.py | 1 + tests/kernels/attention/test_cache.py | 1 + .../attention/test_cascade_flash_attn.py | 1 + .../attention/test_encoder_decoder_attn.py | 1 + tests/kernels/attention/test_flash_attn.py | 1 + tests/kernels/attention/test_flashinfer.py | 1 + tests/kernels/attention/test_flashmla.py | 1 + .../kernels/attention/test_lightning_attn.py | 1 + .../attention/test_merge_attn_states.py | 1 + tests/kernels/attention/test_mha_attn.py | 1 + .../kernels/attention/test_mla_decode_cpu.py | 1 + .../kernels/attention/test_prefix_prefill.py | 1 + .../attention/test_rocm_attention_selector.py | 1 + .../attention/test_triton_decode_attention.py | 1 + .../test_triton_unified_attention.py | 1 + tests/kernels/core/test_activation.py | 1 + .../core/test_fused_quant_layernorm.py | 1 + tests/kernels/core/test_layernorm.py | 1 + tests/kernels/core/test_opcheck.py | 1 + tests/kernels/core/test_permute_cols.py | 1 + tests/kernels/core/test_pos_encoding.py | 1 + tests/kernels/core/test_rotary_embedding.py | 1 + tests/kernels/core/test_uva.py | 1 + tests/kernels/mamba/test_causal_conv1d.py | 1 + tests/kernels/mamba/test_mamba_mixer2.py | 1 + 
tests/kernels/mamba/test_mamba_ssm.py | 1 + tests/kernels/mamba/test_mamba_ssm_ssd.py | 1 + tests/kernels/moe/__init__.py | 0 tests/kernels/moe/deepep_utils.py | 191 ++++++ tests/kernels/moe/test_batched_moe.py | 1 + tests/kernels/moe/test_cutlass_moe.py | 9 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 513 ++++++++++++++++ tests/kernels/moe/test_deepep_moe.py | 459 ++++++++++++++ tests/kernels/moe/test_moe.py | 1 + .../kernels/moe/test_moe_permute_unpermute.py | 1 + tests/kernels/moe/test_nvfp4_moe.py | 6 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 287 +++++++++ tests/kernels/moe/test_pplx_moe.py | 132 +--- tests/kernels/moe/test_rocm_aiter_topk.py | 1 + tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 1 + tests/kernels/quant_utils.py | 1 + tests/kernels/quantization/nvfp4_utils.py | 1 + .../quantization/test_allspark_gemm.py | 1 + tests/kernels/quantization/test_aqlm.py | 1 + tests/kernels/quantization/test_awq.py | 1 + tests/kernels/quantization/test_awq_triton.py | 1 + tests/kernels/quantization/test_block_fp8.py | 1 + tests/kernels/quantization/test_block_int8.py | 1 + .../quantization/test_cutlass_2of4_sparse.py | 1 + .../quantization/test_cutlass_scaled_mm.py | 4 +- tests/kernels/quantization/test_fp8_quant.py | 1 + tests/kernels/quantization/test_ggml.py | 1 + tests/kernels/quantization/test_gguf.py | 1 + tests/kernels/quantization/test_gptq.py | 1 + .../kernels/quantization/test_int8_kernel.py | 1 + tests/kernels/quantization/test_int8_quant.py | 1 + tests/kernels/quantization/test_machete_mm.py | 1 + .../kernels/quantization/test_marlin_gemm.py | 1 + .../kernels/quantization/test_nvfp4_quant.py | 1 + .../quantization/test_nvfp4_scaled_mm.py | 1 + .../quantization/test_rocm_skinny_gemms.py | 1 + .../quantization/test_triton_scaled_mm.py | 1 + .../test_apply_repetition_penalties.py | 76 +++ tests/kernels/test_cutlass_mla_decode.py | 5 +- tests/kernels/test_flex_attention.py | 93 +++ tests/kernels/test_fused_quant_activation.py | 1 + tests/kernels/test_triton_flash_attention.py | 1 + tests/kernels/utils.py | 1 + tests/kv_transfer/test_disagg.py | 1 + tests/kv_transfer/test_lookup_buffer.py | 1 + tests/kv_transfer/test_module.py | 1 + tests/kv_transfer/test_send_recv.py | 1 + tests/lora/conftest.py | 1 + tests/lora/test_add_lora.py | 1 + tests/lora/test_baichuan.py | 1 + tests/lora/test_chatglm3_tp.py | 1 + tests/lora/test_layers.py | 1 + tests/lora/test_llama_tp.py | 1 + tests/lora/test_lora_allowed_token_ids.py | 1 + tests/lora/test_lora_checkpoints.py | 1 + tests/lora/test_lora_functions.py | 1 + tests/lora/test_lora_huggingface.py | 1 + tests/lora/test_lora_manager.py | 1 + tests/lora/test_minicpmv_tp.py | 1 + tests/lora/test_mixtral.py | 1 + tests/lora/test_peft_helper.py | 1 + tests/lora/test_phi.py | 1 + tests/lora/test_punica_ops.py | 1 + tests/lora/test_quant_model.py | 1 + tests/lora/test_qwen2vl.py | 1 + tests/lora/test_resolver.py | 1 + tests/lora/test_tokenizer_group.py | 1 + tests/lora/test_transfomers_model.py | 1 + tests/lora/test_utils.py | 1 + tests/lora/test_worker.py | 1 + tests/lora/utils.py | 1 + tests/metrics/test_metrics.py | 1 + tests/mistral_tool_use/conftest.py | 1 + .../test_mistral_tool_calls.py | 1 + tests/mistral_tool_use/utils.py | 1 + tests/model_executor/conftest.py | 1 + .../model_executor/test_enabled_custom_ops.py | 1 + .../model_executor/test_guided_processors.py | 1 + tests/model_executor/test_logits_processor.py | 1 + .../test_model_load_with_params.py | 1 + tests/model_executor/test_weight_utils.py | 1 + 
tests/models/language/generation/test_bart.py | 1 + .../models/language/generation/test_common.py | 2 +- .../language/generation/test_granite.py | 1 + .../generation/test_granitemoehybrid.py | 1 + .../models/language/generation/test_hybrid.py | 1 + .../language/generation/test_mistral.py | 1 + .../models/language/generation/test_phimoe.py | 1 + tests/models/language/pooling/embed_utils.py | 7 +- tests/models/language/pooling/mteb_utils.py | 13 +- tests/models/language/pooling/test_baai.py | 1 + .../language/pooling/test_classification.py | 1 + .../models/language/pooling/test_embedding.py | 1 + tests/models/language/pooling/test_gritlm.py | 1 + tests/models/language/pooling/test_gte.py | 8 +- .../models/language/pooling/test_intfloat.py | 46 ++ tests/models/language/pooling/test_jina.py | 4 +- tests/models/language/pooling/test_nomic.py | 4 +- .../pooling/test_nomic_max_model_len.py | 1 + tests/models/language/pooling/test_scoring.py | 1 + .../pooling/test_snowflake_arctic_embed.py | 1 + .../pooling/test_truncation_control.py | 1 + .../multimodal/generation/test_common.py | 10 +- .../multimodal/generation/test_florence2.py | 3 + .../generation/test_granite_speech.py | 3 +- .../multimodal/generation/test_interleaved.py | 1 + .../multimodal/generation/test_mllama.py | 1 + .../multimodal/generation/test_phi4mm.py | 5 + .../multimodal/generation/test_pixtral.py | 1 + .../multimodal/generation/test_qwen2_vl.py | 1 + .../multimodal/generation/test_ultravox.py | 1 + .../multimodal/generation/test_whisper.py | 1 + .../generation/vlm_utils/builders.py | 1 + .../generation/vlm_utils/case_filtering.py | 1 + .../multimodal/generation/vlm_utils/core.py | 1 + .../generation/vlm_utils/custom_inputs.py | 1 + .../generation/vlm_utils/model_utils.py | 19 +- .../generation/vlm_utils/runners.py | 1 + .../multimodal/generation/vlm_utils/types.py | 1 + .../multimodal/pooling/test_dse_qwen2_vl.py | 1 + .../multimodal/pooling/test_intern_vit.py | 1 + .../multimodal/pooling/test_llava_next.py | 1 + tests/models/multimodal/pooling/test_phi3v.py | 1 + .../multimodal/processing/test_common.py | 3 +- .../multimodal/processing/test_h2ovl.py | 1 + .../multimodal/processing/test_idefics3.py | 1 + .../multimodal/processing/test_internvl.py | 1 + .../multimodal/processing/test_llama4.py | 1 + .../multimodal/processing/test_llava_next.py | 1 + .../processing/test_llava_onevision.py | 1 + .../processing/test_minimax_vl_01.py | 1 + .../multimodal/processing/test_mllama.py | 1 + .../multimodal/processing/test_phi3v.py | 1 + .../multimodal/processing/test_phi4mm.py | 1 + .../multimodal/processing/test_qwen2_vl.py | 1 + .../multimodal/processing/test_smolvlm.py | 1 + tests/models/quantization/test_aqlm.py | 1 + tests/models/quantization/test_awq.py | 1 + tests/models/quantization/test_bitblas.py | 1 + tests/models/quantization/test_fp8.py | 1 + tests/models/quantization/test_gguf.py | 3 +- .../models/quantization/test_gptq_bitblas.py | 1 + tests/models/quantization/test_gptq_marlin.py | 1 + .../quantization/test_gptq_marlin_24.py | 1 + tests/models/quantization/test_modelopt.py | 1 + tests/models/quantization/test_mxfp4.py | 1 + tests/models/quantization/test_nvfp4.py | 1 + tests/models/registry.py | 50 +- tests/models/test_initialization.py | 14 + tests/models/test_oot_registration.py | 1 + tests/models/test_registry.py | 1 + tests/models/test_transformers.py | 1 + tests/models/test_utils.py | 1 + tests/models/test_vision.py | 1 + tests/models/utils.py | 1 + tests/mq_llm_engine/conftest.py | 1 + 
tests/mq_llm_engine/test_abort.py | 1 + tests/mq_llm_engine/test_error_handling.py | 1 + tests/mq_llm_engine/test_load.py | 1 + tests/mq_llm_engine/utils.py | 1 + .../multi_step/test_correctness_async_llm.py | 1 + tests/multi_step/test_correctness_llm.py | 1 + tests/multimodal/test_hasher.py | 1 + tests/multimodal/test_image.py | 1 + tests/multimodal/test_inputs.py | 1 + tests/multimodal/test_processing.py | 1 + tests/multimodal/test_utils.py | 112 +++- tests/multimodal/test_video.py | 1 + tests/multimodal/utils.py | 1 + tests/neuron/1_core/test_activation.py | 1 + tests/neuron/1_core/test_block_table.py | 1 + tests/neuron/1_core/test_cache.py | 1 + tests/neuron/1_core/test_layernorm.py | 1 + tests/neuron/1_core/test_logits_processor.py | 1 + .../neuron/1_core/test_neuron_model_runner.py | 1 + tests/neuron/1_core/test_neuron_quant.py | 1 + tests/neuron/1_core/test_prefix_prefill.py | 1 + tests/neuron/1_core/test_rotary_embedding.py | 1 + tests/neuron/2_core/test_comm_ops.py | 1 + tests/neuron/2_core/test_eagle.py | 1 + tests/neuron/2_core/test_mistral.py | 1 + tests/neuron/2_core/test_multi_lora.py | 1 + .../test_filesystem_resolver.py | 1 + tests/plugins/vllm_add_dummy_model/setup.py | 1 + .../vllm_add_dummy_model/__init__.py | 1 + .../my_gemma_embedding.py | 1 + .../vllm_add_dummy_model/my_llava.py | 1 + .../vllm_add_dummy_model/my_opt.py | 1 + .../plugins/vllm_add_dummy_platform/setup.py | 1 + .../vllm_add_dummy_platform/__init__.py | 1 + .../dummy_attention_backend.py | 1 + .../vllm_add_dummy_platform/dummy_platform.py | 1 + tests/plugins_tests/conftest.py | 1 + tests/plugins_tests/test_platform_plugins.py | 1 + tests/plugins_tests/test_scheduler_plugins.py | 1 + tests/pplx_utils.py | 123 ++++ .../test_disable_sliding_window.py | 1 + tests/prefix_caching/test_prefix_caching.py | 1 + tests/prompt_adapter/test_bloom.py | 1 + .../test_multi_adapter_inference.py | 1 + tests/prompt_adapter/test_pa_lora.py | 1 + tests/quantization/test_auto_round.py | 1 + tests/quantization/test_bitsandbytes.py | 1 + tests/quantization/test_compressed_tensors.py | 3 +- tests/quantization/test_configs.py | 1 + tests/quantization/test_cpu_offload.py | 3 +- tests/quantization/test_experts_int8.py | 1 + tests/quantization/test_fp8.py | 1 + tests/quantization/test_gptq_dynamic.py | 1 + tests/quantization/test_ipex_quant.py | 1 + tests/quantization/test_lm_head.py | 1 + tests/quantization/test_ptpc_fp8.py | 1 + tests/quantization/test_quark.py | 1 + .../test_register_quantization_config.py | 1 + tests/quantization/test_torchao.py | 7 +- tests/quantization/utils.py | 1 + .../test_deepseekr1_reasoning_parser.py | 1 + .../test_granite_reasoning_parser.py | 1 + .../reasoning/test_qwen3_reasoning_parser.py | 1 + tests/reasoning/utils.py | 1 + .../test_runai_model_streamer_loader.py | 1 + .../test_weight_utils.py | 1 + tests/samplers/test_beam_search.py | 1 + tests/samplers/test_ignore_eos.py | 1 + tests/samplers/test_logits_processor.py | 1 + tests/samplers/test_logprobs.py | 1 + tests/samplers/test_no_bad_words.py | 1 + tests/samplers/test_ranks.py | 1 + tests/samplers/test_rejection_sampler.py | 1 + tests/samplers/test_sampler.py | 1 + tests/samplers/test_seeded_generate.py | 1 + .../test_typical_acceptance_sampler.py | 1 + tests/spec_decode/conftest.py | 1 + tests/spec_decode/e2e/conftest.py | 1 + tests/spec_decode/e2e/test_compatibility.py | 1 + .../spec_decode/e2e/test_eagle_correctness.py | 1 + tests/spec_decode/e2e/test_integration.py | 1 + .../e2e/test_integration_dist_tp2.py | 1 + 
.../e2e/test_integration_dist_tp4.py | 1 + tests/spec_decode/e2e/test_logprobs.py | 1 + .../e2e/test_medusa_correctness.py | 1 + tests/spec_decode/e2e/test_mlp_correctness.py | 1 + tests/spec_decode/e2e/test_mtp_correctness.py | 1 + .../e2e/test_multistep_correctness.py | 1 + .../spec_decode/e2e/test_ngram_correctness.py | 1 + tests/spec_decode/e2e/test_seed.py | 1 + tests/spec_decode/test_batch_expansion.py | 1 + tests/spec_decode/test_dynamic_spec_decode.py | 1 + tests/spec_decode/test_memory_usage.py | 1 + tests/spec_decode/test_metrics.py | 1 + tests/spec_decode/test_multi_step_worker.py | 1 + tests/spec_decode/test_ngram_worker.py | 1 + tests/spec_decode/test_scorer.py | 1 + tests/spec_decode/test_spec_decode_worker.py | 1 + tests/spec_decode/test_utils.py | 1 + tests/spec_decode/utils.py | 1 + tests/standalone_tests/lazy_imports.py | 1 + tests/tensorizer_loader/conftest.py | 1 + tests/tensorizer_loader/test_tensorizer.py | 1 + tests/test_cache_block_hashing.py | 1 + tests/test_config.py | 14 + tests/test_embedded_commit.py | 1 + tests/test_inputs.py | 1 + tests/test_logger.py | 1 + tests/test_outputs.py | 1 + tests/test_regression.py | 1 + tests/test_sampling_params.py | 1 + tests/test_scalartype.py | 1 + tests/test_seed_behavior.py | 3 +- tests/test_sequence.py | 1 + tests/test_sharded_state_loader.py | 1 + tests/test_triton_utils.py | 1 + tests/test_utils.py | 1 + tests/test_version.py | 1 + tests/test_vllm_port.py | 1 + tests/tokenization/test_cached_tokenizer.py | 1 + tests/tokenization/test_detokenize.py | 4 +- tests/tokenization/test_get_eos.py | 1 + tests/tokenization/test_mistral_tokenizer.py | 1 + tests/tokenization/test_tokenizer.py | 1 + tests/tokenization/test_tokenizer_group.py | 1 + tests/tokenization/test_tokenizer_registry.py | 1 + tests/tool_use/conftest.py | 1 + ...est_chat_completion_request_validations.py | 1 + tests/tool_use/test_chat_completions.py | 1 + tests/tool_use/test_jamba_tool_parser.py | 1 + tests/tool_use/test_parallel_tool_calls.py | 1 + tests/tool_use/test_tool_calls.py | 1 + tests/tool_use/test_tool_choice_required.py | 1 + tests/tool_use/utils.py | 1 + tests/tpu/lora/test_lora.py | 1 + tests/tpu/test_compilation.py | 6 +- tests/tpu/test_custom_dispatcher.py | 1 + tests/tpu/test_moe_pallas.py | 3 +- tests/tpu/test_quantization_accuracy.py | 1 + tests/tracing/test_tracing.py | 1 + tests/utils.py | 1 + tests/v1/core/test_kv_cache_utils.py | 326 ++++++++-- tests/v1/core/test_prefix_caching.py | 451 +++++++++++--- tests/v1/core/test_scheduler.py | 17 +- tests/v1/core/test_scheduler_e2e.py | 1 + tests/v1/core/test_specialized_manager.py | 51 +- tests/v1/e2e/test_cascade_attention.py | 1 + .../v1/e2e/test_correctness_sliding_window.py | 5 +- tests/v1/e2e/test_spec_decode.py | 1 + tests/v1/engine/conftest.py | 1 + tests/v1/engine/test_async_llm.py | 199 ++++-- tests/v1/engine/test_engine_args.py | 1 + tests/v1/engine/test_engine_core.py | 2 + tests/v1/engine/test_engine_core_client.py | 191 +++++- tests/v1/engine/test_llm_engine.py | 1 + tests/v1/engine/test_output_processor.py | 6 + tests/v1/engine/utils.py | 1 + tests/v1/entrypoints/conftest.py | 1 + .../llm/test_struct_output_generate.py | 1 + .../openai/test_chat_completion.py | 1 + .../v1/entrypoints/openai/test_completion.py | 1 + .../openai/test_multi_api_servers.py | 1 + .../nixl_integration/run_accuracy_test.sh | 11 +- .../nixl_integration/test_accuracy.py | 2 + .../nixl_integration/test_edge_cases.py | 1 + .../nixl_integration/toy_proxy_server.py | 1 + 
.../kv_connector/unit/test_multi_connector.py | 78 ++- .../kv_connector/unit/test_nixl_connector.py | 5 +- .../unit/test_remote_decode_lifecycle.py | 5 +- .../unit/test_remote_prefill_lifecycle.py | 25 +- tests/v1/kv_connector/unit/utils.py | 11 +- tests/v1/metrics/test_ray_metrics.py | 6 +- tests/v1/sample/test_logprobs.py | 14 +- tests/v1/sample/test_logprobs_e2e.py | 1 + tests/v1/sample/test_rejection_sampler.py | 1 + tests/v1/sample/test_sampler.py | 1 + tests/v1/sample/test_sampling_params_e2e.py | 1 + tests/v1/sample/test_topk_topp_sampler.py | 1 + tests/v1/sample/utils.py | 1 + tests/v1/shutdown/test_delete.py | 1 + tests/v1/shutdown/test_forward_error.py | 1 + tests/v1/shutdown/test_processor_error.py | 1 + tests/v1/shutdown/test_startup_error.py | 1 + tests/v1/shutdown/utils.py | 1 + tests/v1/spec_decode/test_eagle.py | 65 +- tests/v1/spec_decode/test_max_len.py | 1 + tests/v1/spec_decode/test_ngram.py | 1 + tests/v1/structured_output/test_utils.py | 1 + tests/v1/test_async_llm_dp.py | 26 +- tests/v1/test_metrics_reader.py | 1 + tests/v1/test_oracle.py | 1 + tests/v1/test_serial_utils.py | 1 + tests/v1/test_utils.py | 1 + tests/v1/tpu/test_basic.py | 1 + tests/v1/tpu/test_mha_attn.py | 1 + tests/v1/tpu/test_multimodal.py | 1 + tests/v1/tpu/test_pallas.py | 1 + tests/v1/tpu/test_perf.py | 1 + tests/v1/tpu/test_sampler.py | 1 + .../v1/tpu/test_spmd_model_weight_loading.py | 13 +- tests/v1/tpu/test_topk_topp_sampler.py | 1 + tests/v1/tpu/worker/test_tpu_model_runner.py | 235 ++++++- tests/v1/worker/test_gpu_input_batch.py | 30 +- tests/v1/worker/test_gpu_model_runner.py | 258 +++++++- tests/vllm_test_utils/setup.py | 1 + .../vllm_test_utils/__init__.py | 1 + .../vllm_test_utils/vllm_test_utils/blame.py | 1 + .../vllm_test_utils/monitor.py | 1 + tests/weight_loading/test_weight_loading.py | 1 + tests/worker/conftest.py | 1 + .../test_encoder_decoder_model_runner.py | 1 + tests/worker/test_model_input.py | 1 + tests/worker/test_model_runner.py | 1 + tests/worker/test_profile.py | 1 + tests/worker/test_swap.py | 1 + tools/check_spdx_header.py | 5 +- tools/check_triton_import.py | 1 + tools/enforce_regex_import.py | 1 + tools/profiler/print_layerwise_table.py | 1 + tools/profiler/visualize_layerwise_profile.py | 1 + tools/report_build_time_ninja.py | 1 + use_existing_torch.py | 1 + vllm/__init__.py | 1 + vllm/_custom_ops.py | 114 +++- vllm/_ipex_ops.py | 1 + vllm/adapter_commons/layers.py | 1 + vllm/adapter_commons/models.py | 1 + vllm/adapter_commons/request.py | 1 + vllm/adapter_commons/utils.py | 1 + vllm/adapter_commons/worker_manager.py | 1 + vllm/assets/audio.py | 1 + vllm/assets/base.py | 1 + vllm/assets/image.py | 1 + vllm/assets/video.py | 1 + vllm/attention/__init__.py | 1 + vllm/attention/backends/abstract.py | 2 + vllm/attention/backends/blocksparse_attn.py | 4 + vllm/attention/backends/cpu_mla.py | 10 +- .../backends/dual_chunk_flash_attn.py | 4 + vllm/attention/backends/flash_attn.py | 4 + vllm/attention/backends/flashinfer.py | 4 + vllm/attention/backends/flashmla.py | 4 +- vllm/attention/backends/hpu_attn.py | 4 + vllm/attention/backends/ipex_attn.py | 4 + vllm/attention/backends/mla/common.py | 4 + vllm/attention/backends/pallas.py | 4 + vllm/attention/backends/placeholder_attn.py | 1 + vllm/attention/backends/rocm_aiter_mla.py | 4 +- vllm/attention/backends/rocm_flash_attn.py | 4 + vllm/attention/backends/torch_sdpa.py | 20 +- vllm/attention/backends/triton_mla.py | 4 +- vllm/attention/backends/utils.py | 1 + vllm/attention/backends/xformers.py | 4 + 
vllm/attention/layer.py | 18 +- .../blocksparse_attention_kernel.py | 1 + .../ops/blocksparse_attention/interface.py | 1 + .../ops/blocksparse_attention/utils.py | 1 + .../ops/chunked_prefill_paged_decode.py | 1 + vllm/attention/ops/flashmla.py | 1 + vllm/attention/ops/hpu_paged_attn.py | 1 + vllm/attention/ops/ipex_attn.py | 1 + vllm/attention/ops/merge_attn_states.py | 1 + vllm/attention/ops/nki_flash_attn.py | 1 + vllm/attention/ops/paged_attn.py | 1 + vllm/attention/ops/prefix_prefill.py | 1 + vllm/attention/ops/rocm_aiter_mla.py | 1 + vllm/attention/ops/rocm_aiter_paged_attn.py | 1 + vllm/attention/ops/triton_decode_attention.py | 1 + vllm/attention/ops/triton_flash_attention.py | 1 + .../attention/ops/triton_merge_attn_states.py | 1 + .../attention/ops/triton_unified_attention.py | 1 + vllm/attention/selector.py | 1 + vllm/attention/utils/fa_utils.py | 1 + vllm/beam_search.py | 1 + vllm/benchmarks/datasets.py | 54 +- vllm/benchmarks/endpoint_request_func.py | 1 + vllm/benchmarks/latency.py | 1 + vllm/benchmarks/serve.py | 1 + vllm/benchmarks/throughput.py | 1 + vllm/benchmarks/utils.py | 1 + vllm/collect_env.py | 6 +- vllm/compilation/activation_quant_fusion.py | 1 + vllm/compilation/backends.py | 1 + vllm/compilation/base_piecewise_backend.py | 1 + vllm/compilation/collective_fusion.py | 1 + vllm/compilation/compiler_interface.py | 1 + vllm/compilation/counter.py | 3 +- vllm/compilation/cuda_piecewise_backend.py | 3 +- vllm/compilation/decorators.py | 1 + vllm/compilation/fix_functionalization.py | 1 + vllm/compilation/fusion.py | 1 + vllm/compilation/fx_utils.py | 1 + vllm/compilation/inductor_pass.py | 1 + vllm/compilation/monitor.py | 1 + vllm/compilation/multi_output_match.py | 1 + vllm/compilation/noop_elimination.py | 1 + vllm/compilation/pass_manager.py | 1 + vllm/compilation/sequence_parallelism.py | 1 + vllm/compilation/torch25_custom_graph_pass.py | 1 + vllm/compilation/vllm_inductor_pass.py | 1 + vllm/compilation/wrapper.py | 8 +- vllm/config.py | 66 +- vllm/connections.py | 1 + vllm/core/block/block_table.py | 1 + vllm/core/block/common.py | 1 + vllm/core/block/cpu_gpu_block_allocator.py | 1 + vllm/core/block/interfaces.py | 1 + vllm/core/block/naive_block.py | 1 + vllm/core/block/prefix_caching_block.py | 1 + vllm/core/block/utils.py | 1 + vllm/core/block_manager.py | 1 + vllm/core/evictor.py | 1 + vllm/core/interfaces.py | 1 + vllm/core/placeholder_block_space_manager.py | 1 + vllm/core/scheduler.py | 1 + vllm/device_allocator/cumem.py | 1 + vllm/distributed/__init__.py | 1 + vllm/distributed/communication_op.py | 1 + .../device_communicators/all2all.py | 149 ++++- .../base_device_communicator.py | 4 +- .../device_communicators/cpu_communicator.py | 1 + .../device_communicators/cuda_communicator.py | 9 + .../device_communicators/cuda_wrapper.py | 1 + .../device_communicators/custom_all_reduce.py | 1 + .../custom_all_reduce_utils.py | 1 + .../device_communicators/hpu_communicator.py | 1 + .../neuron_communicator.py | 1 + .../device_communicators/pynccl.py | 1 + .../device_communicators/pynccl_wrapper.py | 1 + .../device_communicators/shm_broadcast.py | 48 +- .../device_communicators/tpu_communicator.py | 1 + .../device_communicators/xpu_communicator.py | 1 + vllm/distributed/kv_events.py | 78 ++- vllm/distributed/kv_transfer/__init__.py | 1 + .../kv_transfer/kv_connector/base.py | 1 + .../kv_transfer/kv_connector/factory.py | 1 + .../kv_connector/lmcache_connector.py | 1 + .../kv_connector/mooncake_store_connector.py | 1 + .../kv_connector/simple_connector.py | 1 + 
.../kv_transfer/kv_connector/utils.py | 19 +- .../kv_transfer/kv_connector/v1/__init__.py | 1 + .../kv_transfer/kv_connector/v1/base.py | 30 +- .../kv_connector/v1/lmcache_connector.py | 1 + .../kv_connector/v1/multi_connector.py | 42 +- .../kv_connector/v1/nixl_connector.py | 404 ++++++++---- .../v1/shared_storage_connector.py | 1 + .../kv_transfer/kv_connector_agent.py | 1 + .../kv_transfer/kv_lookup_buffer/base.py | 1 + .../kv_lookup_buffer/mooncake_store.py | 1 + .../kv_lookup_buffer/simple_buffer.py | 1 + vllm/distributed/kv_transfer/kv_pipe/base.py | 1 + .../kv_transfer/kv_pipe/mooncake_pipe.py | 1 + .../kv_transfer/kv_pipe/pynccl_pipe.py | 1 + .../kv_transfer/kv_transfer_state.py | 1 + vllm/distributed/parallel_state.py | 1 + vllm/distributed/utils.py | 1 + vllm/engine/arg_utils.py | 44 +- vllm/engine/async_llm_engine.py | 17 +- vllm/engine/async_timeout.py | 1 + vllm/engine/llm_engine.py | 1 + vllm/engine/metrics.py | 1 + vllm/engine/metrics_types.py | 1 + vllm/engine/multiprocessing/__init__.py | 1 + vllm/engine/multiprocessing/client.py | 1 + vllm/engine/multiprocessing/engine.py | 1 + vllm/engine/output_processor/interfaces.py | 1 + vllm/engine/output_processor/multi_step.py | 1 + vllm/engine/output_processor/single_step.py | 1 + vllm/engine/output_processor/stop_checker.py | 1 + vllm/engine/output_processor/util.py | 1 + vllm/engine/protocol.py | 1 + vllm/entrypoints/api_server.py | 1 + vllm/entrypoints/chat_utils.py | 1 + vllm/entrypoints/cli/benchmark/base.py | 1 + vllm/entrypoints/cli/benchmark/latency.py | 1 + vllm/entrypoints/cli/benchmark/main.py | 1 + vllm/entrypoints/cli/benchmark/serve.py | 1 + vllm/entrypoints/cli/benchmark/throughput.py | 1 + vllm/entrypoints/cli/collect_env.py | 1 + vllm/entrypoints/cli/main.py | 5 +- vllm/entrypoints/cli/openai.py | 1 + vllm/entrypoints/cli/run_batch.py | 9 +- vllm/entrypoints/cli/serve.py | 7 +- vllm/entrypoints/cli/types.py | 1 + vllm/entrypoints/launcher.py | 1 + vllm/entrypoints/llm.py | 1 + vllm/entrypoints/logger.py | 1 + vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/cli_args.py | 1 + vllm/entrypoints/openai/logits_processors.py | 1 + vllm/entrypoints/openai/protocol.py | 3 + vllm/entrypoints/openai/run_batch.py | 1 + vllm/entrypoints/openai/serving_chat.py | 35 +- .../openai/serving_classification.py | 1 + vllm/entrypoints/openai/serving_completion.py | 1 + vllm/entrypoints/openai/serving_embedding.py | 1 + vllm/entrypoints/openai/serving_engine.py | 1 + vllm/entrypoints/openai/serving_models.py | 1 + vllm/entrypoints/openai/serving_pooling.py | 1 + vllm/entrypoints/openai/serving_score.py | 1 + .../openai/serving_tokenization.py | 1 + .../openai/serving_transcription.py | 1 + .../openai/tool_parsers/__init__.py | 1 + .../tool_parsers/abstract_tool_parser.py | 1 + .../tool_parsers/deepseekv3_tool_parser.py | 1 + .../granite_20b_fc_tool_parser.py | 1 + .../tool_parsers/granite_tool_parser.py | 1 + .../openai/tool_parsers/hermes_tool_parser.py | 1 + .../tool_parsers/internlm2_tool_parser.py | 1 + .../openai/tool_parsers/jamba_tool_parser.py | 1 + .../llama4_pythonic_tool_parser.py | 1 + .../openai/tool_parsers/llama_tool_parser.py | 1 + .../tool_parsers/mistral_tool_parser.py | 35 +- .../tool_parsers/phi4mini_tool_parser.py | 1 + .../tool_parsers/pythonic_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/utils.py | 1 + vllm/entrypoints/score_utils.py | 1 + vllm/entrypoints/ssl.py | 1 + vllm/entrypoints/utils.py | 15 +- vllm/env_override.py | 19 +- vllm/envs.py | 9 + vllm/executor/executor_base.py 
| 1 + vllm/executor/mp_distributed_executor.py | 1 + vllm/executor/msgspec_utils.py | 1 + vllm/executor/multiproc_worker_utils.py | 1 + vllm/executor/ray_distributed_executor.py | 1 + vllm/executor/ray_utils.py | 1 + vllm/executor/uniproc_executor.py | 1 + vllm/forward_context.py | 1 + vllm/inputs/__init__.py | 1 + vllm/inputs/data.py | 1 + vllm/inputs/parse.py | 1 + vllm/inputs/preprocess.py | 1 + vllm/inputs/registry.py | 27 +- vllm/jsontree.py | 1 + vllm/logger.py | 1 + vllm/logging_utils/__init__.py | 1 + vllm/logging_utils/dump_input.py | 1 + vllm/logging_utils/formatter.py | 1 + vllm/logits_process.py | 1 + vllm/lora/fully_sharded_layers.py | 1 + vllm/lora/layers.py | 1 + vllm/lora/lora.py | 1 + vllm/lora/models.py | 1 + vllm/lora/ops/torch_ops/__init__.py | 1 + vllm/lora/ops/torch_ops/lora_ops.py | 1 + vllm/lora/ops/triton_ops/__init__.py | 1 + vllm/lora/ops/triton_ops/kernel_utils.py | 1 + vllm/lora/ops/triton_ops/lora_expand_op.py | 1 + .../ops/triton_ops/lora_kernel_metadata.py | 1 + vllm/lora/ops/triton_ops/lora_shrink_op.py | 1 + vllm/lora/ops/triton_ops/utils.py | 1 + vllm/lora/ops/xla_ops/__init__.py | 1 + vllm/lora/ops/xla_ops/lora_ops.py | 1 + vllm/lora/peft_helper.py | 1 + vllm/lora/punica_wrapper/__init__.py | 1 + vllm/lora/punica_wrapper/punica_base.py | 1 + vllm/lora/punica_wrapper/punica_cpu.py | 1 + vllm/lora/punica_wrapper/punica_gpu.py | 1 + vllm/lora/punica_wrapper/punica_hpu.py | 1 + vllm/lora/punica_wrapper/punica_selector.py | 1 + vllm/lora/punica_wrapper/punica_tpu.py | 1 + vllm/lora/punica_wrapper/utils.py | 1 + vllm/lora/request.py | 1 + vllm/lora/resolver.py | 1 + vllm/lora/utils.py | 1 + vllm/lora/worker_manager.py | 1 + vllm/model_executor/__init__.py | 1 + vllm/model_executor/custom_op.py | 1 + .../guided_decoding/__init__.py | 1 + .../guided_decoding/guidance_decoding.py | 1 + .../guidance_logits_processors.py | 1 + .../guided_decoding/guided_fields.py | 1 + .../lm_format_enforcer_decoding.py | 1 + .../guided_decoding/outlines_decoding.py | 1 + .../outlines_logits_processors.py | 1 + vllm/model_executor/guided_decoding/utils.py | 1 + .../guided_decoding/xgrammar_decoding.py | 1 + vllm/model_executor/layers/activation.py | 1 + .../layers/fused_moe/__init__.py | 1 + .../layers/fused_moe/batched_deep_gemm_moe.py | 125 ++++ .../batched_triton_or_deep_gemm_moe.py | 117 ++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 +++++ .../layers/fused_moe/cutlass_moe.py | 379 +++++++----- .../layers/fused_moe/deep_gemm_moe.py | 34 +- .../fused_moe/deepep_ht_prepare_finalize.py | 236 ++++++++ .../fused_moe/deepep_ll_prepare_finalize.py | 186 ++++++ .../layers/fused_moe/fused_batched_moe.py | 60 +- .../layers/fused_moe/fused_marlin_moe.py | 1 + .../layers/fused_moe/fused_moe.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 203 +++++-- .../layers/fused_moe/modular_kernel.py | 164 +++-- .../layers/fused_moe/moe_align_block_size.py | 1 + .../layers/fused_moe/moe_pallas.py | 1 + .../layers/fused_moe/moe_permute_unpermute.py | 6 +- .../layers/fused_moe/moe_torch_iterative.py | 1 + .../layers/fused_moe/pplx_prepare_finalize.py | 33 +- .../layers/fused_moe/prepare_finalize.py | 13 +- .../layers/fused_moe/rocm_aiter_fused_moe.py | 1 + .../layers/fused_moe/triton_deep_gemm_moe.py | 13 +- vllm/model_executor/layers/fused_moe/utils.py | 5 +- vllm/model_executor/layers/layernorm.py | 1 + vllm/model_executor/layers/lightning_attn.py | 1 + vllm/model_executor/layers/linear.py | 1 + .../model_executor/layers/logits_processor.py | 1 + .../layers/mamba/mamba2_metadata.py | 
1 + .../layers/mamba/mamba_mixer.py | 1 + .../layers/mamba/mamba_mixer2.py | 1 + .../layers/mamba/ops/causal_conv1d.py | 1 + .../layers/mamba/ops/mamba_ssm.py | 1 + .../layers/mamba/ops/ssd_bmm.py | 1 + .../layers/mamba/ops/ssd_chunk_scan.py | 1 + .../layers/mamba/ops/ssd_chunk_state.py | 1 + .../layers/mamba/ops/ssd_combined.py | 1 + .../layers/mamba/ops/ssd_state_passing.py | 1 + vllm/model_executor/layers/pooler.py | 1 + .../layers/quantization/__init__.py | 1 + .../layers/quantization/aqlm.py | 1 + .../layers/quantization/auto_round.py | 1 + .../model_executor/layers/quantization/awq.py | 1 + .../layers/quantization/awq_marlin.py | 1 + .../layers/quantization/awq_triton.py | 1 + .../layers/quantization/base_config.py | 1 + .../layers/quantization/bitblas.py | 1 + .../layers/quantization/bitsandbytes.py | 1 + .../compressed_tensors/compressed_tensors.py | 7 +- .../compressed_tensors_moe.py | 88 +-- .../compressed_tensors/schemes/__init__.py | 1 + .../schemes/compressed_tensors_24.py | 1 + .../schemes/compressed_tensors_scheme.py | 1 + .../schemes/compressed_tensors_w4a16_24.py | 1 + .../schemes/compressed_tensors_w4a16_nvfp4.py | 1 + .../schemes/compressed_tensors_w8a16_fp8.py | 1 + .../schemes/compressed_tensors_w8a8_fp8.py | 1 + .../schemes/compressed_tensors_w8a8_int8.py | 1 + .../schemes/compressed_tensors_wNa16.py | 1 + .../compressed_tensors/triton_scaled_mm.py | 1 + .../quantization/compressed_tensors/utils.py | 1 + .../layers/quantization/deepspeedfp.py | 1 + .../layers/quantization/experts_int8.py | 1 + .../layers/quantization/fbgemm_fp8.py | 1 + .../model_executor/layers/quantization/fp8.py | 46 +- .../layers/quantization/gguf.py | 1 + .../layers/quantization/gptq.py | 1 + .../layers/quantization/gptq_bitblas.py | 1 + .../layers/quantization/gptq_marlin.py | 1 + .../layers/quantization/gptq_marlin_24.py | 1 + .../layers/quantization/hqq_marlin.py | 1 + .../layers/quantization/ipex_quant.py | 1 + .../kernels/mixed_precision/MPLinearKernel.py | 1 + .../kernels/mixed_precision/__init__.py | 1 + .../kernels/mixed_precision/allspark.py | 1 + .../kernels/mixed_precision/bitblas.py | 1 + .../kernels/mixed_precision/exllama.py | 1 + .../kernels/mixed_precision/machete.py | 1 + .../kernels/mixed_precision/marlin.py | 1 + .../kernels/scaled_mm/ScaledMMLinearKernel.py | 1 + .../kernels/scaled_mm/__init__.py | 1 + .../quantization/kernels/scaled_mm/aiter.py | 1 + .../quantization/kernels/scaled_mm/cutlass.py | 1 + .../quantization/kernels/scaled_mm/triton.py | 1 + .../quantization/kernels/scaled_mm/xla.py | 1 + .../layers/quantization/kv_cache.py | 1 + .../layers/quantization/marlin.py | 1 + .../layers/quantization/modelopt.py | 1 + .../layers/quantization/moe_wna16.py | 1 + .../layers/quantization/neuron_quant.py | 1 + .../layers/quantization/ptpc_fp8.py | 1 + .../model_executor/layers/quantization/qqq.py | 1 + .../layers/quantization/quark/quark.py | 1 + .../layers/quantization/quark/quark_moe.py | 1 + .../quantization/quark/schemes/__init__.py | 1 + .../quark/schemes/quark_scheme.py | 1 + .../quark/schemes/quark_w4a4_mxfp4.py | 1 + .../quark/schemes/quark_w8a8_fp8.py | 1 + .../quark/schemes/quark_w8a8_int8.py | 1 + .../layers/quantization/quark/utils.py | 1 + .../layers/quantization/schema.py | 1 + .../layers/quantization/torchao.py | 22 +- .../layers/quantization/tpu_int8.py | 1 + .../layers/quantization/utils/__init__.py | 1 + .../quantization/utils/allspark_utils.py | 1 + .../quantization/utils/bitblas_utils.py | 1 + .../layers/quantization/utils/fp8_utils.py | 15 +- 
.../layers/quantization/utils/gptq_utils.py | 1 + .../layers/quantization/utils/int8_utils.py | 1 + .../layers/quantization/utils/layer_utils.py | 1 + .../quantization/utils/machete_utils.py | 1 + .../layers/quantization/utils/marlin_utils.py | 1 + .../quantization/utils/marlin_utils_fp4.py | 1 + .../quantization/utils/marlin_utils_fp8.py | 1 + .../quantization/utils/marlin_utils_test.py | 1 + .../utils/marlin_utils_test_24.py | 1 + .../utils/marlin_utils_test_qqq.py | 1 + .../layers/quantization/utils/mxfp4_utils.py | 1 + .../utils/nvfp4_emulation_utils.py | 1 + .../layers/quantization/utils/quant_utils.py | 1 + .../layers/quantization/utils/w8a8_utils.py | 1 + .../layers/rejection_sampler.py | 1 + vllm/model_executor/layers/resampler.py | 1 + .../model_executor/layers/rotary_embedding.py | 1 + vllm/model_executor/layers/sampler.py | 1 + .../layers/spec_decode_base_sampler.py | 1 + .../layers/typical_acceptance_sampler.py | 1 + vllm/model_executor/layers/utils.py | 14 +- .../layers/vocab_parallel_embedding.py | 1 + vllm/model_executor/model_loader/__init__.py | 1 + .../model_loader/base_loader.py | 1 + .../model_loader/bitsandbytes_loader.py | 1 + .../model_loader/default_loader.py | 1 + .../model_loader/dummy_loader.py | 1 + .../model_loader/gguf_loader.py | 1 + vllm/model_executor/model_loader/neuron.py | 1 + .../model_loader/neuronx_distributed.py | 1 + .../model_loader/runai_streamer_loader.py | 1 + .../model_loader/sharded_state_loader.py | 1 + .../model_executor/model_loader/tensorizer.py | 1 + .../model_loader/tensorizer_loader.py | 1 + vllm/model_executor/model_loader/utils.py | 1 + .../model_loader/weight_utils.py | 1 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/adapters.py | 1 + vllm/model_executor/models/aimv2.py | 1 + vllm/model_executor/models/arctic.py | 1 + vllm/model_executor/models/aria.py | 1 + vllm/model_executor/models/aya_vision.py | 15 +- vllm/model_executor/models/baichuan.py | 1 + vllm/model_executor/models/bamba.py | 1 + vllm/model_executor/models/bart.py | 1 + vllm/model_executor/models/bert.py | 14 +- vllm/model_executor/models/bert_with_rope.py | 8 +- vllm/model_executor/models/blip.py | 1 + vllm/model_executor/models/blip2.py | 1 + vllm/model_executor/models/bloom.py | 1 + vllm/model_executor/models/chameleon.py | 1 + vllm/model_executor/models/chatglm.py | 1 + vllm/model_executor/models/clip.py | 1 + vllm/model_executor/models/commandr.py | 1 + .../models/constant_size_cache.py | 1 + vllm/model_executor/models/dbrx.py | 1 + vllm/model_executor/models/deepseek.py | 1 + vllm/model_executor/models/deepseek_mtp.py | 1 + vllm/model_executor/models/deepseek_v2.py | 1 + vllm/model_executor/models/deepseek_vl2.py | 1 + vllm/model_executor/models/eagle.py | 1 + vllm/model_executor/models/exaone.py | 1 + vllm/model_executor/models/fairseq2_llama.py | 1 + vllm/model_executor/models/falcon.py | 1 + vllm/model_executor/models/falcon_h1.py | 1 + vllm/model_executor/models/florence2.py | 1 + vllm/model_executor/models/fuyu.py | 1 + vllm/model_executor/models/gemma.py | 1 + vllm/model_executor/models/gemma2.py | 1 + vllm/model_executor/models/gemma3.py | 1 + vllm/model_executor/models/gemma3_mm.py | 1 + vllm/model_executor/models/glm.py | 1 + vllm/model_executor/models/glm4.py | 1 + vllm/model_executor/models/glm4v.py | 1 + vllm/model_executor/models/gpt2.py | 1 + vllm/model_executor/models/gpt_bigcode.py | 1 + vllm/model_executor/models/gpt_j.py | 1 + vllm/model_executor/models/gpt_neox.py | 1 + vllm/model_executor/models/granite.py | 1 + 
vllm/model_executor/models/granite_speech.py | 1 + vllm/model_executor/models/granitemoe.py | 1 + .../model_executor/models/granitemoehybrid.py | 1 + .../model_executor/models/granitemoeshared.py | 1 + vllm/model_executor/models/gritlm.py | 1 + vllm/model_executor/models/grok1.py | 1 + vllm/model_executor/models/h2ovl.py | 1 + .../models/idefics2_vision_model.py | 1 + vllm/model_executor/models/idefics3.py | 17 +- vllm/model_executor/models/interfaces.py | 1 + vllm/model_executor/models/interfaces_base.py | 1 + vllm/model_executor/models/intern_vit.py | 1 + vllm/model_executor/models/internlm2.py | 1 + vllm/model_executor/models/internlm2_ve.py | 1 + vllm/model_executor/models/internvl.py | 1 + vllm/model_executor/models/jais.py | 1 + vllm/model_executor/models/jamba.py | 1 + vllm/model_executor/models/kimi_vl.py | 1 + vllm/model_executor/models/llama.py | 1 + vllm/model_executor/models/llama4.py | 1 + vllm/model_executor/models/llama_eagle.py | 15 +- vllm/model_executor/models/llama_eagle3.py | 25 +- vllm/model_executor/models/llava.py | 1 + vllm/model_executor/models/llava_next.py | 1 + .../model_executor/models/llava_next_video.py | 1 + vllm/model_executor/models/llava_onevision.py | 1 + vllm/model_executor/models/mamba.py | 1 + vllm/model_executor/models/mamba2.py | 1 + vllm/model_executor/models/mamba_cache.py | 1 + vllm/model_executor/models/medusa.py | 1 + vllm/model_executor/models/mimo.py | 1 + vllm/model_executor/models/mimo_mtp.py | 1 + vllm/model_executor/models/minicpm.py | 1 + vllm/model_executor/models/minicpm3.py | 1 + vllm/model_executor/models/minicpm_eagle.py | 1 + vllm/model_executor/models/minicpmo.py | 1 + vllm/model_executor/models/minicpmv.py | 1 + vllm/model_executor/models/minimax_cache.py | 1 + vllm/model_executor/models/minimax_text_01.py | 1 + vllm/model_executor/models/minimax_vl_01.py | 1 + vllm/model_executor/models/mistral3.py | 1 + vllm/model_executor/models/mixtral.py | 1 + vllm/model_executor/models/mixtral_quant.py | 1 + vllm/model_executor/models/mllama.py | 1 + vllm/model_executor/models/mllama4.py | 1 + vllm/model_executor/models/mlp_speculator.py | 1 + vllm/model_executor/models/modernbert.py | 1 + vllm/model_executor/models/module_mapping.py | 1 + vllm/model_executor/models/molmo.py | 1 + vllm/model_executor/models/moonvit.py | 1 + vllm/model_executor/models/mpt.py | 1 + vllm/model_executor/models/nemotron.py | 1 + vllm/model_executor/models/nemotron_h.py | 573 ++++++++++++++++++ vllm/model_executor/models/nemotron_nas.py | 1 + vllm/model_executor/models/nvlm_d.py | 1 + vllm/model_executor/models/olmo.py | 1 + vllm/model_executor/models/olmo2.py | 1 + vllm/model_executor/models/olmoe.py | 1 + vllm/model_executor/models/opt.py | 1 + vllm/model_executor/models/orion.py | 1 + vllm/model_executor/models/ovis.py | 1 + vllm/model_executor/models/paligemma.py | 1 + vllm/model_executor/models/persimmon.py | 1 + vllm/model_executor/models/phi.py | 1 + vllm/model_executor/models/phi3.py | 1 + vllm/model_executor/models/phi3_small.py | 1 + vllm/model_executor/models/phi3v.py | 1 + vllm/model_executor/models/phi4mm.py | 1 + vllm/model_executor/models/phi4mm_audio.py | 1 + vllm/model_executor/models/phi4mm_utils.py | 1 + vllm/model_executor/models/phimoe.py | 1 + vllm/model_executor/models/pixtral.py | 1 + vllm/model_executor/models/plamo2.py | 1 + .../models/prithvi_geospatial_mae.py | 1 + vllm/model_executor/models/qwen.py | 1 + vllm/model_executor/models/qwen2.py | 1 + .../models/qwen2_5_omni_thinker.py | 1 + vllm/model_executor/models/qwen2_5_vl.py | 1 + 
vllm/model_executor/models/qwen2_audio.py | 1 + vllm/model_executor/models/qwen2_moe.py | 1 + vllm/model_executor/models/qwen2_rm.py | 1 + vllm/model_executor/models/qwen2_vl.py | 1 + vllm/model_executor/models/qwen3.py | 1 + vllm/model_executor/models/qwen3_moe.py | 1 + vllm/model_executor/models/qwen_vl.py | 1 + vllm/model_executor/models/registry.py | 2 + vllm/model_executor/models/roberta.py | 1 + vllm/model_executor/models/siglip.py | 1 + vllm/model_executor/models/skyworkr1v.py | 1 + vllm/model_executor/models/smolvlm.py | 1 + vllm/model_executor/models/solar.py | 1 + vllm/model_executor/models/stablelm.py | 1 + vllm/model_executor/models/starcoder2.py | 1 + vllm/model_executor/models/telechat2.py | 1 + vllm/model_executor/models/teleflm.py | 1 + vllm/model_executor/models/transformers.py | 1 + vllm/model_executor/models/ultravox.py | 1 + vllm/model_executor/models/utils.py | 1 + vllm/model_executor/models/vision.py | 1 + vllm/model_executor/models/whisper.py | 1 + vllm/model_executor/models/zamba2.py | 1 + vllm/model_executor/parameter.py | 1 + vllm/model_executor/pooling_metadata.py | 1 + vllm/model_executor/sampling_metadata.py | 1 + vllm/model_executor/utils.py | 1 + vllm/multimodal/__init__.py | 1 + vllm/multimodal/audio.py | 1 + vllm/multimodal/base.py | 1 + vllm/multimodal/hasher.py | 1 + vllm/multimodal/image.py | 1 + vllm/multimodal/inputs.py | 9 +- vllm/multimodal/parse.py | 1 + vllm/multimodal/processing.py | 1 + vllm/multimodal/profiling.py | 1 + vllm/multimodal/registry.py | 1 + vllm/multimodal/utils.py | 31 +- vllm/multimodal/video.py | 1 + vllm/outputs.py | 1 + vllm/platforms/__init__.py | 1 + vllm/platforms/cpu.py | 78 ++- vllm/platforms/cuda.py | 43 ++ vllm/platforms/hpu.py | 1 + vllm/platforms/interface.py | 30 + vllm/platforms/neuron.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/tpu.py | 1 + vllm/platforms/xpu.py | 1 + vllm/plugins/__init__.py | 1 + .../lora_resolvers/filesystem_resolver.py | 1 + vllm/pooling_params.py | 1 + vllm/profiler/layerwise_profile.py | 1 + vllm/profiler/utils.py | 1 + vllm/prompt_adapter/layers.py | 1 + vllm/prompt_adapter/models.py | 1 + vllm/prompt_adapter/request.py | 1 + vllm/prompt_adapter/utils.py | 1 + vllm/prompt_adapter/worker_manager.py | 1 + vllm/reasoning/__init__.py | 1 + vllm/reasoning/abs_reasoning_parsers.py | 1 + .../reasoning/deepseek_r1_reasoning_parser.py | 1 + vllm/reasoning/granite_reasoning_parser.py | 1 + vllm/reasoning/qwen3_reasoning_parser.py | 1 + vllm/sampling_params.py | 12 + vllm/scalar_type.py | 1 + vllm/scripts.py | 1 + vllm/sequence.py | 1 + vllm/spec_decode/batch_expansion.py | 1 + vllm/spec_decode/draft_model_runner.py | 2 +- vllm/spec_decode/interfaces.py | 1 + vllm/spec_decode/medusa_worker.py | 1 + vllm/spec_decode/metrics.py | 1 + vllm/spec_decode/mlp_speculator_worker.py | 1 + vllm/spec_decode/mqa_scorer.py | 1 + vllm/spec_decode/multi_step_worker.py | 1 + vllm/spec_decode/ngram_worker.py | 1 + vllm/spec_decode/proposer_worker_base.py | 1 + .../spec_decode/smaller_tp_proposer_worker.py | 1 + vllm/spec_decode/spec_decode_worker.py | 1 + vllm/spec_decode/target_model_runner.py | 1 + vllm/spec_decode/top1_proposer.py | 1 + vllm/spec_decode/util.py | 1 + vllm/test_utils.py | 1 + vllm/third_party/pynvml.py | 1 + vllm/tracing.py | 1 + vllm/transformers_utils/__init__.py | 1 + .../chat_templates/__init__.py | 1 + .../chat_templates/registry.py | 1 + vllm/transformers_utils/config.py | 1 + vllm/transformers_utils/configs/__init__.py | 3 + vllm/transformers_utils/configs/arctic.py | 1 + 
vllm/transformers_utils/configs/chatglm.py | 1 + vllm/transformers_utils/configs/cohere2.py | 1 + vllm/transformers_utils/configs/dbrx.py | 1 + .../configs/deepseek_vl2.py | 1 + vllm/transformers_utils/configs/eagle.py | 1 + vllm/transformers_utils/configs/exaone.py | 1 + vllm/transformers_utils/configs/falcon.py | 1 + vllm/transformers_utils/configs/h2ovl.py | 1 + vllm/transformers_utils/configs/internvl.py | 1 + vllm/transformers_utils/configs/jais.py | 1 + vllm/transformers_utils/configs/kimi_vl.py | 1 + vllm/transformers_utils/configs/medusa.py | 1 + .../configs/minimax_text_01.py | 1 + .../configs/minimax_vl_01.py | 1 + vllm/transformers_utils/configs/mllama.py | 1 + .../configs/mlp_speculator.py | 1 + vllm/transformers_utils/configs/moonvit.py | 1 + vllm/transformers_utils/configs/mpt.py | 1 + vllm/transformers_utils/configs/nemotron.py | 1 + vllm/transformers_utils/configs/nemotron_h.py | 258 ++++++++ vllm/transformers_utils/configs/nvlm_d.py | 1 + vllm/transformers_utils/configs/ovis.py | 1 + vllm/transformers_utils/configs/skyworkr1v.py | 1 + vllm/transformers_utils/configs/solar.py | 1 + vllm/transformers_utils/configs/telechat2.py | 1 + vllm/transformers_utils/configs/ultravox.py | 1 + vllm/transformers_utils/detokenizer.py | 1 + vllm/transformers_utils/detokenizer_utils.py | 1 + vllm/transformers_utils/processor.py | 1 + .../transformers_utils/processors/__init__.py | 1 + .../processors/deepseek_vl2.py | 1 + vllm/transformers_utils/processors/ovis.py | 1 + vllm/transformers_utils/s3_utils.py | 1 + vllm/transformers_utils/tokenizer.py | 1 + vllm/transformers_utils/tokenizer_base.py | 1 + vllm/transformers_utils/tokenizer_group.py | 1 + .../transformers_utils/tokenizers/__init__.py | 1 + vllm/transformers_utils/tokenizers/mistral.py | 3 + vllm/transformers_utils/utils.py | 1 + vllm/triton_utils/__init__.py | 1 + vllm/triton_utils/importing.py | 1 + vllm/usage/usage_lib.py | 1 + vllm/utils.py | 4 +- vllm/v1/attention/backends/cpu_attn.py | 163 +++++ vllm/v1/attention/backends/flash_attn.py | 82 ++- vllm/v1/attention/backends/flashinfer.py | 37 +- vllm/v1/attention/backends/flex_attention.py | 477 +++++++++++++++ vllm/v1/attention/backends/mla/common.py | 7 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 96 +++ vllm/v1/attention/backends/mla/flashmla.py | 4 +- .../attention/backends/mla/rocm_aiter_mla.py | 4 +- vllm/v1/attention/backends/mla/triton_mla.py | 4 +- vllm/v1/attention/backends/pallas.py | 7 +- vllm/v1/attention/backends/triton_attn.py | 57 +- vllm/v1/attention/backends/utils.py | 34 ++ vllm/v1/core/block_pool.py | 73 ++- vllm/v1/core/encoder_cache_manager.py | 1 + vllm/v1/core/kv_cache_coordinator.py | 358 +++++++++++ vllm/v1/core/kv_cache_manager.py | 140 +++-- vllm/v1/core/kv_cache_utils.py | 379 ++++++++++-- vllm/v1/core/sched/interface.py | 1 + vllm/v1/core/sched/output.py | 1 + vllm/v1/core/sched/scheduler.py | 49 +- vllm/v1/core/sched/utils.py | 1 + vllm/v1/core/single_type_kv_cache_manager.py | 157 +++-- vllm/v1/engine/__init__.py | 2 + vllm/v1/engine/async_llm.py | 25 +- vllm/v1/engine/coordinator.py | 1 + vllm/v1/engine/core.py | 1 + vllm/v1/engine/core_client.py | 37 +- vllm/v1/engine/detokenizer.py | 1 + vllm/v1/engine/exceptions.py | 1 + vllm/v1/engine/llm_engine.py | 1 + vllm/v1/engine/logprobs.py | 1 + vllm/v1/engine/mm_input_cache.py | 1 + vllm/v1/engine/output_processor.py | 1 + vllm/v1/engine/parallel_sampling.py | 1 + vllm/v1/engine/processor.py | 9 + vllm/v1/executor/abstract.py | 1 + vllm/v1/executor/multiproc_executor.py | 1 + 
vllm/v1/executor/ray_distributed_executor.py | 1 + vllm/v1/kv_cache_interface.py | 34 +- vllm/v1/metrics/loggers.py | 1 + vllm/v1/metrics/prometheus.py | 1 + vllm/v1/metrics/ray_wrappers.py | 11 + vllm/v1/metrics/reader.py | 1 + vllm/v1/metrics/stats.py | 1 + vllm/v1/outputs.py | 1 + vllm/v1/request.py | 1 + vllm/v1/sample/metadata.py | 1 + vllm/v1/sample/ops/bad_words.py | 1 + vllm/v1/sample/ops/penalties.py | 1 + vllm/v1/sample/ops/topk_topp_sampler.py | 1 + vllm/v1/sample/rejection_sampler.py | 1 + vllm/v1/sample/sampler.py | 18 +- vllm/v1/sample/tpu/metadata.py | 1 + vllm/v1/sample/tpu/sampler.py | 1 + vllm/v1/serial_utils.py | 1 + vllm/v1/spec_decode/eagle.py | 12 +- vllm/v1/spec_decode/medusa.py | 1 + vllm/v1/spec_decode/metadata.py | 1 + vllm/v1/spec_decode/metrics.py | 1 + vllm/v1/spec_decode/ngram_proposer.py | 1 + vllm/v1/spec_decode/utils.py | 1 + vllm/v1/structured_output/__init__.py | 1 + vllm/v1/structured_output/backend_guidance.py | 1 + vllm/v1/structured_output/backend_types.py | 1 + vllm/v1/structured_output/backend_xgrammar.py | 1 + vllm/v1/structured_output/request.py | 1 + vllm/v1/structured_output/utils.py | 1 + vllm/v1/utils.py | 3 +- vllm/v1/worker/block_table.py | 4 +- vllm/v1/worker/cpu_model_runner.py | 86 +++ vllm/v1/worker/cpu_worker.py | 101 +++ vllm/v1/worker/gpu_input_batch.py | 19 +- vllm/v1/worker/gpu_model_runner.py | 242 ++++++-- vllm/v1/worker/gpu_worker.py | 58 +- vllm/v1/worker/lora_model_runner_mixin.py | 1 + vllm/v1/worker/tpu_model_runner.py | 72 ++- vllm/v1/worker/tpu_worker.py | 4 +- vllm/v1/worker/utils.py | 37 ++ vllm/v1/worker/worker_base.py | 1 + vllm/version.py | 1 + vllm/worker/cache_engine.py | 1 + vllm/worker/cpu_enc_dec_model_runner.py | 2 +- vllm/worker/cpu_model_runner.py | 2 +- vllm/worker/cpu_pooling_model_runner.py | 2 +- vllm/worker/cpu_worker.py | 1 + vllm/worker/enc_dec_model_runner.py | 2 +- vllm/worker/hpu_model_runner.py | 1 + vllm/worker/hpu_worker.py | 1 + vllm/worker/model_runner.py | 2 +- vllm/worker/model_runner_base.py | 1 + vllm/worker/multi_step_hpu_worker.py | 1 + vllm/worker/multi_step_model_runner.py | 1 + vllm/worker/multi_step_neuron_model_runner.py | 2 +- ...i_step_neuronx_distributed_model_runner.py | 2 +- vllm/worker/multi_step_tpu_worker.py | 1 + vllm/worker/multi_step_worker.py | 1 + vllm/worker/neuron_model_runner.py | 3 +- vllm/worker/neuron_worker.py | 1 + .../neuronx_distributed_model_runner.py | 1 + vllm/worker/pooling_model_runner.py | 2 +- vllm/worker/tpu_model_runner.py | 1 + vllm/worker/tpu_worker.py | 1 + vllm/worker/utils.py | 1 + vllm/worker/worker.py | 1 + vllm/worker/worker_base.py | 4 +- vllm/worker/xpu_model_runner.py | 2 +- vllm/worker/xpu_worker.py | 1 + 1513 files changed, 13047 insertions(+), 2268 deletions(-) create mode 100755 .buildkite/scripts/annotate-release.sh create mode 100755 .buildkite/scripts/tpu/cleanup_docker.sh create mode 100644 .buildkite/scripts/tpu/config_v6e_1.env create mode 100755 .buildkite/scripts/tpu/docker_run_bm.sh create mode 100755 .buildkite/scripts/tpu/run_bm.sh create mode 100644 csrc/sampler.cu create mode 100644 docs/contributing/ci-failures.md create mode 100644 examples/online_serving/multi_instance_data_parallel.py create mode 100644 examples/tool_chat_template_deepseekr1.jinja create mode 100644 tests/compile/test_config.py create mode 100644 tests/kernels/moe/__init__.py create mode 100644 tests/kernels/moe/deepep_utils.py create mode 100644 tests/kernels/moe/test_deepep_deepgemm_moe.py create mode 100644 tests/kernels/moe/test_deepep_moe.py create 
mode 100644 tests/kernels/moe/test_pplx_cutlass_moe.py
 create mode 100644 tests/kernels/test_apply_repetition_penalties.py
 create mode 100644 tests/kernels/test_flex_attention.py
 create mode 100644 tests/models/language/pooling/test_intfloat.py
 create mode 100644 tests/pplx_utils.py
 create mode 100644 vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
 create mode 100644 vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
 create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
 create mode 100644 vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
 create mode 100644 vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
 create mode 100644 vllm/model_executor/models/nemotron_h.py
 create mode 100644 vllm/transformers_utils/configs/nemotron_h.py
 create mode 100644 vllm/v1/attention/backends/cpu_attn.py
 create mode 100644 vllm/v1/attention/backends/flex_attention.py
 create mode 100644 vllm/v1/attention/backends/mla/cutlass_mla.py
 create mode 100644 vllm/v1/core/kv_cache_coordinator.py
 create mode 100644 vllm/v1/worker/cpu_model_runner.py
 create mode 100644 vllm/v1/worker/cpu_worker.py

diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index e29881fcbac0..68aff793ae6a 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import sys
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
index 270663c415c7..7045d8810493 100644
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import os
diff --git a/.buildkite/lm-eval-harness/conftest.py b/.buildkite/lm-eval-harness/conftest.py
index 769d2efda4ad..c0d60dd5328f 100644
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path

 import pytest
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 409a6ca82008..930adfaf3e19 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 7f2a2d8dc296..a4f1638c1adb 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json
 import os
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
index 778a3a8d87f6..8532ff7ef798 100644
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse

diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
index 10a7a2f5a467..053fd52c35ae 100644
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import json
diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
index e5f179a0f5b6..ddea1d2b1b1e 100644
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from lmdeploy.serve.openai.api_client import APIClient

diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 2a7b37991f31..fb3b9d5e34e0 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import datetime
 import json
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index b3c27e2c99c2..16b5ad0297fe 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,5 +1,6 @@
 steps:
 - label: "Build wheel - CUDA 12.8"
+  id: build-wheel-cuda-12-8
   agents:
     queue: cpu_queue_postmerge
   commands:
@@ -11,6 +12,7 @@ steps:
     DOCKER_BUILDKIT: "1"

 - label: "Build wheel - CUDA 12.6"
+  id: build-wheel-cuda-12-6
   agents:
     queue: cpu_queue_postmerge
   commands:
@@ -28,6 +30,7 @@

 - label: "Build wheel - CUDA 11.8"
   # depends_on: block-build-cu118-wheel
+  id: build-wheel-cuda-11-8
   agents:
     queue: cpu_queue_postmerge
   commands:
@@ -44,6 +47,7 @@

 - label: "Build release image"
   depends_on: block-release-image-build
+  id: build-release-image
   agents:
     queue: cpu_queue_postmerge
   commands:
@@ -51,6 +55,18 @@ steps:
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

+- label: "Annotate release workflow"
+  depends_on:
+    - build-release-image
+    - build-wheel-cuda-12-8
+    - build-wheel-cuda-12-6
+    - build-wheel-cuda-11-8
+  id: annotate-release-workflow
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+    - "bash .buildkite/scripts/annotate-release.sh"
+
 - label: "Build and publish TPU release image"
   depends_on: ~
   if: build.env("NIGHTLY") == "1"
@@ -70,9 +86,10 @@ steps:
     DOCKER_BUILDKIT: "1"

 - input: "Provide Release version here"
+  id: input-release-version
   fields:
     - text: "What is the release version?"
-      key: "release-version"
+      key: release-version

 - block: "Build CPU release image"
   key: block-cpu-release-image-build
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
new file mode 100755
index 000000000000..94e0ac2398f3
--- /dev/null
+++ b/.buildkite/scripts/annotate-release.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -ex
+
+# Get release version and strip leading 'v' if present
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
+
+if [ -z "$RELEASE_VERSION" ]; then
+  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
+  exit 1
+fi
+
+buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
+To download the wheel:
+\`\`\`
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`
+
+To download and upload the image:
+
+\`\`\`
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
+docker tag vllm/vllm-openai vllm/vllm-openai:latest
+docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
+docker push vllm/vllm-openai:latest
+docker push vllm/vllm-openai:v${RELEASE_VERSION}
+\`\`\`
+EOF
\ No newline at end of file
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 077bd9914907..36bcb015d308 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -7,6 +7,7 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
   if [[ -n "$container_id" ]]; then
+    podman stop --all -t0
     podman rm -f "$container_id" || true
   fi
   podman system prune -f
@@ -37,7 +38,7 @@ function cpu_tests() {
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
+    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }

 # All of CPU tests are expected to be finished less than 40 mins.
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 0a11935607e2..61aa7df13b4d 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -6,6 +6,7 @@ set -ex

 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

 export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

 function cpu_tests() {
   set -e
@@ -56,7 +55,7 @@ function cpu_tests() {
   # Run AWQ test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    VLLM_USE_V1=0 pytest -s -v \
     tests/quantization/test_ipex_quant.py"

   # Run chunked-prefill and prefix-cache test
@@ -68,8 +67,6 @@ function cpu_tests() {
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    export VLLM_CPU_KVCACHE_SPACE=10
-    export VLLM_CPU_OMP_THREADS_BIND=$1
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
     python3 benchmarks/benchmark_serving.py \
@@ -89,4 +86,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 3212b660ec35..a2a5c2a02cbb 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 12 "test_moe_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh
new file mode 100755
index 000000000000..209d9c4341cd
--- /dev/null
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -euo pipefail
+
+docker_root=$(docker info -f '{{.DockerRootDir}}')
+if [ -z "$docker_root" ]; then
+  echo "Failed to determine Docker root directory."
+  exit 1
+fi
+echo "Docker root directory: $docker_root"
+# Check disk usage of the filesystem where Docker's root directory is located
+disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+# Define the threshold
+threshold=70
+if [ "$disk_usage" -gt "$threshold" ]; then
+  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+  # Remove dangling images (those that are not tagged and not used by any container)
+  docker image prune -f
+  # Remove unused volumes / force the system prune for old images as well.
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
+  echo "Docker images and volumes cleanup completed."
+else
+  echo "Disk usage is below $threshold%. No cleanup needed."
+fi
diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env
new file mode 100644
index 000000000000..441758647347
--- /dev/null
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
@@ -0,0 +1,14 @@
+# Environment config
+TEST_NAME=llama8b
+CONTAINER_NAME=vllm-tpu
+
+# vllm config
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+MAX_NUM_SEQS=512
+MAX_NUM_BATCHED_TOKENS=512
+TENSOR_PARALLEL_SIZE=1
+MAX_MODEL_LEN=2048
+DOWNLOAD_DIR=/mnt/disks/persist
+EXPECTED_THROUGHPUT=8.0
+INPUT_LEN=1800
+OUTPUT_LEN=128
diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh
new file mode 100755
index 000000000000..6705da03e3d7
--- /dev/null
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+if [ ! -f "$1" ]; then
+  echo "Error: The env file '$1' does not exist."
+  exit 1  # Exit the script with a non-zero status to indicate an error
+fi
+
+ENV_FILE=$1
+
+# For testing on local vm, use `set -a` to export all variables
+source /etc/environment
+source $ENV_FILE
+
+remove_docker_container() {
+  docker rm -f tpu-test || true;
+  docker rm -f vllm-tpu || true;
+  docker rm -f $CONTAINER_NAME || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container + +# Build docker image. +# TODO: build the image outside the script and share the image with other +# tpu test if building time is too long. +DOCKER_BUILDKIT=1 docker build \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=0 \ + --tag vllm/vllm-tpu-bm \ + --progress plain -f docker/Dockerfile.tpu . + +LOG_ROOT=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $LOG_ROOT" + +if [ -z "$HF_TOKEN" ]; then + echo "Error: HF_TOKEN is not set or is empty." + exit 1 +fi + +# Make sure mounted disk or dir exists +if [ ! -d "$DOWNLOAD_DIR" ]; then + echo "Error: Folder $DOWNLOAD_DIR does not exist. This is usually a mounted drive. If no mounted drive, just create a folder." + exit 1 +fi + +echo "Run model $MODEL" +echo + +echo "starting docker...$CONTAINER_NAME" +echo +docker run \ + -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ + --env-file $ENV_FILE \ + -e HF_TOKEN="$HF_TOKEN" \ + -e TARGET_COMMIT=$BUILDKITE_COMMIT \ + -e MODEL=$MODEL \ + -e WORKSPACE=/workspace \ + --name $CONTAINER_NAME \ + -d \ + --privileged \ + --network host \ + -v /dev/shm:/dev/shm \ + vllm/vllm-tpu-bm tail -f /dev/null + +echo "run script..." +echo +docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh" + +echo "copy result back..." +VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt +BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt +docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" +docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG" + +throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') +echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput" + +if [ "$BUILDKITE" = "true" ]; then + echo "Running inside Buildkite" + buildkite-agent artifact upload "$VLLM_LOG" + buildkite-agent artifact upload "$BM_LOG" +else + echo "Not running inside Buildkite" +fi + +# +# compare the throughput with EXPECTED_THROUGHPUT +# and assert meeting the expectation +# +if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + echo "Failed to get the throughput" + exit 1 +fi + +if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then + echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)" + exit 1 +fi diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh new file mode 100755 index 000000000000..877669cd956a --- /dev/null +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -euo pipefail + +VLLM_LOG="$WORKSPACE/vllm_log.txt" +BM_LOG="$WORKSPACE/bm_log.txt" + +if [ -n "$TARGET_COMMIT" ]; then + head_hash=$(git rev-parse HEAD) + if [ "$TARGET_COMMIT" != "$head_hash" ]; then + echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash" + exit 1 + fi +fi + +echo "model: $MODEL" +echo + +# +# create a log folder +# +mkdir "$WORKSPACE/log" + +# TODO: Move to image building. +pip install pandas +pip install datasets + +# +# create sonnet_4x +# +echo "Create sonnet_4x.txt" +echo "" > benchmarks/sonnet_4x.txt +for _ in {1..4} + do + cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt +done + +# +# start vllm service in backend +# +echo "launching vllm..."
+echo "logging to $VLLM_LOG" +echo + +VLLM_USE_V1=1 vllm serve $MODEL \ + --seed 42 \ + --disable-log-requests \ + --max-num-seqs $MAX_NUM_SEQS \ + --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ + --no-enable-prefix-caching \ + --download_dir $DOWNLOAD_DIR \ + --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & + + +echo "wait for 20 minutes.." +echo +# sleep 1200 +# wait for up to 20 minutes... +for i in {1..120}; do + # TODO: detect other types of errors. + if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then + echo "Detected RuntimeError, exiting." + exit 1 + elif grep -Fq "Application startup complete" "$VLLM_LOG"; then + echo "Application started" + break + else + echo "wait for 10 seconds..." + sleep 10 + fi +done + +# +# run test +# +echo "run benchmark test..." +echo "logging to $BM_LOG" +echo +python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name sonnet \ + --dataset-path benchmarks/sonnet_4x.txt \ + --sonnet-input-len $INPUT_LEN \ + --sonnet-output-len $OUTPUT_LEN \ + --ignore-eos > "$BM_LOG" + +echo "completed..." +echo + +throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') +echo "throughput: $throughput" +echo diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5fb8ceaace05..b739851cb905 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -145,6 +145,7 @@ steps: - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py - tests/v1/test_async_llm_dp.py + - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -154,6 +155,7 @@ steps: # test with internal dp - python3 ../examples/offline_inference/data_parallel.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py @@ -318,6 +320,7 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py + - pytest -v -s compile/piecewise/test_full_cudagraph.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental, amdproduction] @@ -421,6 +424,9 @@ steps: - vllm/model_executor/layers/quantization - tests/quantization commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release + - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 4452ce22d504..e98ccd035ee9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,15 +10,17 @@ /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin @russellb +/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee +/vllm/reasoning @aarnphm
+/vllm/entrypoints @aarnphm CMakeLists.txt @tlrmchlsmth # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat -/vllm/v1/structured_output @mgoin @russellb +/vllm/v1/structured_output @mgoin @russellb @aarnphm # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo @@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo -/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm +/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon /tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/models @DarkLight1337 @ywang96 @@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth /tests/quantization @mgoin @robertgshaw2-redhat /tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb -/tests/v1/structured_output @mgoin @russellb +/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm +/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/weight_loading @mgoin @youkaichao /tests/lora @jeejeelee # Docs /docs @hmellor -mkdocs.yaml @hmellor \ No newline at end of file +mkdocs.yaml @hmellor diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 65be771b94fb..c1d1e07bf628 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,15 @@ -FILL IN THE PR DESCRIPTION HERE +## Essential Elements of an Effective PR Description Checklist +- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". +- [ ] The test plan, such as providing test command. +- [ ] The test results, such as pasting the results comparison before and after, or e2e results -FIX #xxxx (*link existing issues this PR will resolve*) +PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. + +## Purpose + +## Test Plan + +## Test Result **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 628782228e97..a105b0e14c4a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,8 @@ repos: hooks: - id: yapf args: [--in-place, --verbose] + # Keep the same list from yapfignore here to avoid yapf failing without any inputs + exclude: '(.buildkite|benchmarks|build|examples)/.*' - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.11.7 hooks: diff --git a/CMakeLists.txt b/CMakeLists.txt index 87aa23c080f5..afaed7cd1821 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -242,6 +242,7 @@ set(VLLM_EXT_SRC "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" "csrc/layernorm_quant_kernels.cu" + "csrc/sampler.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" @@ -542,8 +543,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # CUTLASS MoE kernels # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works - # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible - # to compile MoE kernels that use its output. 
+ # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled + # if it's possible to compile MoE kernels that use its output. cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" diff --git a/README.md b/README.md index 67f6b957ec55..ec16d758327d 100644 --- a/README.md +++ b/README.md @@ -58,8 +58,8 @@ vLLM is fast with: - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph -- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8. -- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer - Speculative decoding - Chunked prefill @@ -72,14 +72,14 @@ vLLM is flexible and easy to use with: - Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron. +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron - Prefix caching support - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) -- Embedding Models (e.g. E5-Mistral) +- Embedding Models (e.g., E5-Mistral) - Multi-modal LLMs (e.g., LLaVA) Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). @@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Media Kit -- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune.sh index ea63c6f71a6c..1b01bbd61b62 100644 --- a/benchmarks/auto_tune.sh +++ b/benchmarks/auto_tune.sh @@ -10,11 +10,15 @@ # 3. Set variables (ALL REQUIRED) # BASE: your directory for vllm repo # MODEL: the model served by vllm +# TP: ways of tensor parallelism # DOWNLOAD_DIR: directory to download and load model weights. # INPUT_LEN: request input len # OUTPUT_LEN: request output len # MIN_CACHE_HIT_PCT: prefix cache rate # MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 +# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with. +# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with. +# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST. # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. # 5. 
The final result will be saved in RESULT file. @@ -30,31 +34,27 @@ TAG=$(date +"%Y_%m_%d_%H_%M") BASE="" MODEL="meta-llama/Llama-3.1-8B-Instruct" +TP=1 DOWNLOAD_DIR="" INPUT_LEN=4000 OUTPUT_LEN=16 -MIN_CACHE_HIT_PCT_PCT=0 +MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 +NUM_SEQS_LIST="128 256" +NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" LOG_FOLDER="$BASE/auto-benchmark/$TAG" RESULT="$LOG_FOLDER/result.txt" -echo "result file$ $RESULT" +echo "result file: $RESULT" echo "model: $MODEL" -echo rm -rf $LOG_FOLDER mkdir -p $LOG_FOLDER cd "$BASE/vllm" -# create sonnet-4x.txt so that we can sample 2048 tokens for input -echo "" > benchmarks/sonnet_4x.txt -for _ in {1..4} -do -cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt -done -pip install datasets +pip install -q datasets current_hash=$(git rev-parse HEAD) echo "hash:$current_hash" >> "$RESULT" @@ -64,53 +64,69 @@ best_throughput=0 best_max_num_seqs=0 best_num_batched_tokens=0 best_goodput=0 -run_benchmark() { - local max_num_seqs=$1 - local max_num_batched_tokens=$2 - echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" - local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" - echo "vllm_log: $vllm_log" - echo - rm -f $vllm_log - # start the server +start_server() { + local gpu_memory_utilization=$1 + local max_num_seqs=$2 + local max_num_batched_tokens=$3 + local vllm_log=$4 + + pkill -f vllm + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \ --disable-log-requests \ --port 8004 \ - --gpu-memory-utilization 0.98 \ + --gpu-memory-utilization $gpu_memory_utilization \ --max-num-seqs $max_num_seqs \ --max-num-batched-tokens $max_num_batched_tokens \ - --tensor-parallel-size 1 \ + --tensor-parallel-size $TP \ --enable-prefix-caching \ --load-format dummy \ - --download-dir $DOWNLOAD_DIR \ + --download-dir "$DOWNLOAD_DIR" \ --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & - echo "wait for 10 minutes.." - echo + # wait for 10 minutes... server_started=0 - for i in {1..60}; do - if grep -Fq "Application startup complete" "$vllm_log"; then - echo "Application started" + for i in {1..60}; do + RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break else - # echo "wait for 10 seconds..." sleep 10 fi done - if (( ! server_started )); then - echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log" - echo "pkill -f vllm" - echo - pkill vllm - sleep 10 + echo "server did not start within 10 minutes. Please check server log at $vllm_log". return 1 + else + return 0 fi +} + +run_benchmark() { + local max_num_seqs=$1 + local max_num_batched_tokens=$2 + local gpu_memory_utilization=$3 + echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" + echo "vllm_log: $vllm_log" + echo + rm -f $vllm_log + pkill -f vllm + + echo "starting server..." + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log + result=$? + if [[ "$result" -eq 1 ]]; then + echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + else + echo "server started." + fi + echo echo "run benchmark test..." 
- echo meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" @@ -118,29 +134,29 @@ run_benchmark() { python benchmarks/benchmark_serving.py \ --backend vllm \ --model $MODEL \ - --dataset-name sonnet \ - --dataset-path benchmarks/sonnet_4x.txt \ - --sonnet-input-len $INPUT_LEN \ - --sonnet-output-len $OUTPUT_LEN \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ --request-rate inf \ --percentile-metrics ttft,tpot,itl,e2el \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ - --num-prompts 100 \ - --sonnet-prefix-len $prefix_len \ - --port 8004 > "$bm_log" - through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + --num-prompts 1000 \ + --random-prefix-len $prefix_len \ + --port 8004 &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then meet_latency_requirement=1 + request_rate=inf fi if (( ! meet_latency_requirement )); then - # start from request-rate as int(through_put) + 1 - request_rate=$((${through_put%.*} + 1)) + # start from request-rate as int(throughput) + 1 + request_rate=$((${throughput%.*} + 1)) while ((request_rate > 0)); do # clear prefix cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache @@ -149,19 +165,18 @@ run_benchmark() { python benchmarks/benchmark_serving.py \ --backend vllm \ --model $MODEL \ - --dataset-name sonnet \ - --dataset-path benchmarks/sonnet_4x.txt \ - --sonnet-input-len $INPUT_LEN \ - --sonnet-output-len $OUTPUT_LEN \ - --ignore_eos \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ --disable-tqdm \ --request-rate $request_rate \ --percentile-metrics ttft,tpot,itl,e2el \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 100 \ - --sonnet-prefix-len $prefix_len \ - --port 8004 > "$bm_log" - through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + --random-prefix-len $prefix_len \ + --port 8004 &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then @@ -173,10 +188,10 @@ run_benchmark() { fi # write the results and update the best result. 
if ((meet_latency_requirement)); then - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT" - if (( $(echo "$through_put > $best_throughput" | bc -l) )); then - best_throughput=$through_put + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" + if (( $(echo "$throughput > $best_throughput" | bc -l) )); then + best_throughput=$throughput best_max_num_seqs=$max_num_seqs best_num_batched_tokens=$max_num_batched_tokens best_goodput=$goodput @@ -188,22 +203,39 @@ run_benchmark() { echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - echo "pkill -f vllm" - echo pkill vllm sleep 10 - rm -f $vllm_log printf '=%.0s' $(seq 1 20) return 0 } +read -r -a num_seqs_list <<< "$NUM_SEQS_LIST" +read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" + +# first find out the max gpu-memory-utilization without HBM OOM. +gpu_memory_utilization=0.98 +find_gpu_memory_utilization=0 +while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + result=$? + if [[ "$result" -eq 0 ]]; then + find_gpu_memory_utilization=1 + break + else + gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc) + fi +done + +if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then + echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model." +else + echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER." + exit 1 +fi -num_seqs_list="128 256" -num_batched_tokens_list="512 1024 2048 4096" -for num_seqs in $num_seqs_list; do - for num_batched_tokens in $num_batched_tokens_list; do - run_benchmark $num_seqs $num_batched_tokens - exit 0 +for num_seqs in "${num_seqs_list[@]}"; do + for num_batched_tokens in "${num_batched_tokens_list[@]}"; do + run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization done done echo "finish permutations" diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 85e6eda7f36f..ddb38e304cd6 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io import json diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index d86bf045ea47..5d2a26cd443c 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This module defines a framework for sampling benchmark requests from various datasets. 
Each dataset subclass of BenchmarkDataset must implement sample @@ -864,7 +865,15 @@ def sample( for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = f"{item['instruction']}:\n{item['input']}" + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." + + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( SampleRequest( diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index de62bf5c63c7..c06857247eee 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark the latency of processing a single batch of requests.""" import argparse diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 109624c87789..00869fa94e71 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Offline benchmark to test the long document QA throughput. diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index ffaa8035797c..3e4704f0b820 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the efficiency of prefix caching. diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index a05dd24dece8..5496703f23cc 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline prioritization.""" import argparse diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 6bd9f1b49c2e..81428fb7dae1 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput. On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 6a50f47d3951..c1501ad52c25 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput with structured outputs. 
On the server side, run one of the following commands: @@ -11,7 +12,6 @@ --model \ --dataset json \ --structured-output-ratio 1.0 \ - --structured-output-backend auto \ --request-rate 10 \ --num-prompts 1000 diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 7a13babda9d1..d19753d40e49 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline inference throughput.""" import argparse diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index b0c4fca92c3d..283f938df50a 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json @@ -65,4 +66,9 @@ def iterencode(self, o: Any, *args, **kwargs) -> Any: def write_to_json(filename: str, records: list) -> None: with open(filename, "w") as f: - json.dump(records, f, cls=InfEncoder) + json.dump( + records, + f, + cls=InfEncoder, + default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", + ) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index da258f98e085..9ec270bbd2e9 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index 7e9f5a7fc0f4..b4f3c6bf94ed 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Cutlass bench utils from collections.abc import Iterable diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 08e93837f7dd..cec422e8d597 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index d31b623a1ee6..25b96ef56620 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index fce156e1c96c..f62d8102e2d9 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index fd19b40bf252..b1df2f255822 100644 --- 
a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import itertools diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 484d0cb3cba7..74fa56d076cf 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 37a9173a1a93..901524214469 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle as pkl import time diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py index 36d03e40ef9a..b964ed242edf 100644 --- a/benchmarks/kernels/bench_fp8_gemm.py +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -1,14 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy import itertools import torch -import triton from weight_shapes import WEIGHT_SHAPES from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant +from vllm.triton_utils import triton @triton.testing.perf_report( diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index e9934aa479dd..42de062b08e4 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index d40ab70ec539..97ee06034137 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index d39d8a6e3aba..35c20ee41b9a 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe kernel. 
The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit @@ -90,7 +91,7 @@ def bench_run( score = torch.randn((m, num_experts), device=device, dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) quant_blocksize = 16 w1_blockscale = torch.empty( diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 2197bceabe6c..acabe6c1ddb0 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.utils.benchmark as benchmark @@ -6,8 +7,8 @@ from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import ( - cutlass_moe_fp8, fused_experts, fused_topk, ) @@ -69,18 +70,9 @@ def bench_run( w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) - ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - for expert in range(num_experts): w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) - w1_q_notransp = w1_q.clone() - w2_q_notransp = w2_q.clone() - w1_q = w1_q.transpose(1, 2) - w2_q = w2_q.transpose(1, 2) score = torch.randn((m, num_experts), device="cuda", dtype=dtype) @@ -121,10 +113,6 @@ def run_cutlass_moe( w2_scale: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, - c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides2: torch.Tensor, num_repeats: int, ): for _ in range(num_repeats): @@ -132,14 +120,10 @@ def run_cutlass_moe( a, w1, w2, - w1_scale, - w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, + w1_scale, + w2_scale, a1_scale=a_scale, ) @@ -152,10 +136,6 @@ def run_cutlass_from_graph( w2_scale: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, - c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides2: torch.Tensor, ): with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) @@ -164,14 +144,10 @@ def run_cutlass_from_graph( a, w1_q, w2_q, - w1_scale, - w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, + w1_scale, + w2_scale, a1_scale=a_scale, ) @@ -217,10 +193,6 @@ def replay_graph(graph, num_repeats): w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, ) torch.cuda.synchronize() @@ -229,8 +201,8 @@ def replay_graph(graph, num_repeats): with torch.cuda.graph(triton_graph, stream=triton_stream): run_triton_from_graph( a, - w1_q_notransp, - w2_q_notransp, + w1_q, + w2_q, topk_weights, topk_ids, w1_scale, @@ -249,18 +221,12 @@ def replay_graph(graph, num_repeats): "w2": w2, "score": score, "topk": 
topk, - "w1_q_notransp": w1_q_notransp, - "w2_q_notransp": w2_q_notransp, # Cutlass params "a_scale": a_scale, "w1_q": w1_q, "w2_q": w2_q, "w1_scale": w1_scale, "w2_scale": w2_scale, - "ab_strides1": ab_strides1, - "c_strides1": c_strides1, - "ab_strides2": ab_strides2, - "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -278,8 +244,8 @@ def replay_graph(graph, num_repeats): # Warmup run_triton_moe( a, - w1_q_notransp, - w2_q_notransp, + w1_q, + w2_q, topk_weights, topk_ids, w1_scale, @@ -290,7 +256,7 @@ def replay_graph(graph, num_repeats): results.append( benchmark.Timer( - stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 + stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, @@ -321,16 +287,12 @@ def replay_graph(graph, num_repeats): w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, num_warmup, ) results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index f21ca97eeb8a..69978ec6b23e 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 6c1284930c1e..3d38d4b3534e 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index f8f1db04790b..0f896f187ecb 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b17baff2e5f5..9ea1fddae2a3 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.utils.benchmark as benchmark diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2f7660858f5..6cb55b35993e 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 333986fdf5ef..dba1f3943b96 100644 --- 
a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from typing import Any, TypedDict diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 54f05e723226..7e0376c18ecc 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import time diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 2463dfebe83c..6ab26f5f1adf 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index d720083b6150..4cf633a81358 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from typing import Optional, Union diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 944024ca3572..b81baf17a8c6 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate from typing import Optional diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 70190ba24d9d..18c459c31d3f 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project WEIGHT_SHAPES = { "ideal": [[4 * 256 * 32, 256 * 32]], diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 6315c1ee6cdd..4fcdbadd65ec 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from sglang quantization/tuning_block_wise_kernel.py import argparse diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index e37764825451..e67ce0545318 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # fmt: off # ruff: noqa: E501 import time diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 0c86e4072957..9a4da0ef5a85 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project import math import pickle diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 877a29feed9d..4bbb36bb4359 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from collections.abc import Iterable diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index afe159ddda6e..a27f02394afb 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index d5701a8fbd6d..0957a9c65f06 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import cProfile import pstats diff --git a/cmake/hipify.py b/cmake/hipify.py index a15577125eb1..55d378f5b111 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # A command line tool for running pytorch's hipify preprocessor on CUDA diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu index 6743af0cf2db..f4b6b19f4b23 100644 --- a/csrc/attention/mla/cutlass_mla_kernels.cu +++ b/csrc/attention/mla/cutlass_mla_kernels.cu @@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options( {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, hw_info, - -1, // split_kv + 1, // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index d64f0d0a5c2a..1dd7101acc27 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from typing import Union diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py index 15f008d4f61e..49f33718a21e 100644 --- a/csrc/moe/marlin_moe_wna16/generate_kernels.py +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import itertools import os diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 8fda434d452f..c4faef731060 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -30,4 +30,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, int64_t BLOCK_SIZE_K, int64_t bit); #endif -bool moe_permute_unpermute_supported(); \ No newline at end of file +bool moe_permute_unpermute_supported(); + +void shuffle_rows(const torch::Tensor& input_tensor, + const torch::Tensor& dst2src_map, + torch::Tensor& output_tensor); \ No newline at end of file diff --git a/csrc/moe/moe_permute_unpermute_op.cu 
index 9a7465261abf..68f429fac18a 100644
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -130,6 +130,62 @@ void moe_unpermute(
       });
 }
+template
+__global__ void shuffleInputRowsKernel(const T* input,
+                                       const int32_t* dst2src_map, T* output,
+                                       int64_t num_src_rows,
+                                       int64_t num_dst_rows, int64_t num_cols) {
+  int64_t dest_row_idx = blockIdx.x;
+  int64_t const source_row_idx = dst2src_map[dest_row_idx];
+
+  if (blockIdx.x < num_dst_rows) {
+    // Load 128-bits per thread
+    constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
+    using DataElem = cutlass::Array;
+
+    // Duplicate and permute rows
+    auto const* source_row_ptr =
+        reinterpret_cast(input + source_row_idx * num_cols);
+    auto* dest_row_ptr =
+        reinterpret_cast(output + dest_row_idx * num_cols);
+
+    int64_t const start_offset = threadIdx.x;
+    int64_t const stride = blockDim.x;
+    int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
+
+    for (int elem_index = start_offset; elem_index < num_elems_in_col;
+         elem_index += stride) {
+      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
+    }
+  }
+}
+
+void shuffle_rows(const torch::Tensor& input_tensor,
+                  const torch::Tensor& dst2src_map,
+                  torch::Tensor& output_tensor) {
+  TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
+              "Input and output tensors must have the same data type");
+
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
+  int64_t const blocks = output_tensor.size(0);
+  int64_t const threads = 256;
+  int64_t const num_dest_rows = output_tensor.size(0);
+  int64_t const num_src_rows = input_tensor.size(0);
+  int64_t const num_cols = input_tensor.size(1);
+
+  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
+              "num_cols must be divisible by 128 / "
+              "sizeof(input_tensor.scalar_type()) / 8");
+
+  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+    shuffleInputRowsKernel<<>>(
+        reinterpret_cast(input_tensor.data_ptr()),
+        dst2src_map.data_ptr(),
+        reinterpret_cast(output_tensor.data_ptr()), num_src_rows,
+        num_dest_rows, num_cols);
+  });
+}
+
 #else
 
 void moe_permute(const torch::Tensor& input,
                  const torch::Tensor& topk_weights,
diff --git a/csrc/moe/permute_unpermute_kernels/dispatch.h b/csrc/moe/permute_unpermute_kernels/dispatch.h
index 41932cdd85bc..d0f1ea4aded3 100644
--- a/csrc/moe/permute_unpermute_kernels/dispatch.h
+++ b/csrc/moe/permute_unpermute_kernels/dispatch.h
@@ -14,12 +14,13 @@
     __VA_ARGS__();                                            \
     break;                                                    \
   }
-#define MOE_DISPATCH_FLOAT_CASE(...)                          \
-  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)       \
-  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)        \
-  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)    \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
+#define MOE_DISPATCH_FLOAT_CASE(...)                          \
+  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)       \
+  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)        \
+  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)    \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
+  MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
 
 #define MOE_DISPATCH(TYPE, ...)                               \
   MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@@ -39,6 +40,11 @@ template <>
 struct ScalarType2CudaType {
   using type = __nv_bfloat16;
 };
+// uint8 for packed fp4
+template
+struct ScalarType2CudaType {
+  using type = uint8_t;
+};
 
 // #if __CUDA_ARCH__ >= 890
 // fp8
diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index a9379032245d..10be47966f61 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -516,9 +516,8 @@ void topk_softmax(
             topk,
             stream);
     }
-    else
+    else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
     {
-        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
         vllm::moe::topkGatingSoftmaxKernelLauncher(
             gating_output.data_ptr(),
             topk_weights.data_ptr(),
@@ -530,4 +529,17 @@ void topk_softmax(
             topk,
             stream);
     }
+    else {
+        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr(),
+            topk_weights.data_ptr(),
+            topk_indices.data_ptr(),
+            token_expert_indices.data_ptr(),
+            softmax_workspace.data_ptr(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
 }
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 7d35ec79ead4..a74eb3720cf1 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -81,6 +81,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   m.def("moe_permute_unpermute_supported() -> bool");
   m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
 
+  // Row shuffle for MoE
+  m.def(
+      "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
+      "output_tensor) -> ()");
+  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
+
 #endif
 }
 
diff --git a/csrc/ops.h b/csrc/ops.h
index 7044b4588b81..f02f5083ac19 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
+void apply_repetition_penalties_(torch::Tensor& logits,
+                                 const torch::Tensor& prompt_mask,
+                                 const torch::Tensor& output_mask,
+                                 const torch::Tensor& repetition_penalties);
+
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                                torch::Tensor& weight, torch::Tensor& scale,
                                double epsilon);
@@ -231,7 +236,8 @@ void cutlass_moe_mm(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
+    bool per_act_token, bool per_out_ch);
 
 void cutlass_fp4_group_mm(
     torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@@ -243,7 +249,16 @@ void get_cutlass_moe_mm_data(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k,
+    const std::optional& blockscale_offsets);
+
+void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
+                                  torch::Tensor& problem_sizes1,
+                                  torch::Tensor& problem_sizes2,
+                                  const torch::Tensor& expert_num_tokens,
+                                  const int64_t num_local_experts,
+                                  const int64_t padded_m, const int64_t n,
+                                  const int64_t k);
 
 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                           torch::Tensor const& b,
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
index 84492553c02f..4a8a5ed02d6c 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu
@@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
                                            torch::Tensor const& b,
                                            torch::Tensor const& a_scales,
                                            torch::Tensor const& b_scales) {
-  TORCH_CHECK(
-      a.size(0) % 4 == 0,
-      "Input tensor must have a number of rows that is a multiple of 4. ",
-      "but got: ", a.size(0), " rows.");
   if (out.dtype() == torch::kBFloat16) {
     cutlass_gemm_blockwise_sm100_fp8_dispatch(
         out, a, b, a_scales, b_scales);
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
index ef324364c6d5..c841125dbb73 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "cuda_utils.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 
@@ -22,49 +23,49 @@ namespace vllm {
 
 using namespace cute;
 
-template
+// clang-format off
+template
 struct cutlass_3x_gemm_fp8_blockwise {
+  static constexpr bool swap_ab = swap_ab_;
   using ElementAB = cutlass::float_e4m3_t;
 
   using ElementA = ElementAB;
   using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type;
   static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value;
 
   using ElementB = ElementAB;
   using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type;
   static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value;
 
-  using ElementC = void;
   using ElementD = OutType;
   using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose::type;
   static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value;
 
+  using ElementC = void; // TODO: support bias
   using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
   static constexpr int AlignmentC = AlignmentD;
 
   using ElementAccumulator = float;
   using ElementCompute = float;
   using ElementBlockScale = float;
 
-  // MMA and Cluster Tile Shapes
-  // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
-  // Shape %2 == 0
   using MmaTileShape_MNK = Shape<_128,_128,_128>;
-  static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
-  static constexpr int ScaleGranularityM =
-      size<0>(MmaTileShape{}) / ScaleMsPerTile;
-  static constexpr int ScaleGranularityN =
-      size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
-  static constexpr int ScaleGranularityK =
-      size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
-
-  // Shape of the threadblocks in a cluster
-  using ClusterShape_MNK = ClusterShape;
-
-  using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
-      ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
-      cute::UMMA::Major::MN, cute::UMMA::Major::K>;
+  using ScaleConfig = conditional_t,
+      cutlass::detail::Sm100BlockwiseScaleConfig<
+          ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+          cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
+
+  // layout_SFA and layout_SFB cannot be swapped since they are deduced.
   using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
   using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
 
@@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise {
   static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
   using ElementScalar = float;
 
-  // clang-format off
   using DefaultOperation = cutlass::epilogue::fusion::LinearCombination;
   using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
       ArchTag,
@@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise {
       ElementAccumulator,
       ElementCompute,
       ElementC,
-      LayoutC,
+      conditional_t,
      AlignmentC,
      ElementD,
-      LayoutD,
+      conditional_t,
      AlignmentD,
      EpilogueScheduler,
      DefaultOperation
  >::CollectiveOp;
 
  using StageCountType = cutlass::gemm::collective::StageCountAuto;
-  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
-      ArchTag,
-      OperatorClass,
-      ElementA,
-      cute::tuple,
-      AlignmentA,
-      ElementB,
-      cute::tuple,
-      AlignmentB,
-      ElementAccumulator,
-      MmaTileShape,
-      ClusterShape,
-
+  using CollectiveMainloop = conditional_t,
+          AlignmentB,
+          ElementA,
+          cute::tuple,
+          AlignmentA,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
           cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-      MainloopScheduler
-  >::CollectiveOp;
-  // clang-format on
+          MainloopScheduler
+      >::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag,
+          OperatorClass,
+          ElementA,
+          cute::tuple,
+          AlignmentA,
+          ElementB,
+          cute::tuple,
+          AlignmentB,
+          ElementAccumulator,
+          MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          MainloopScheduler
+      >::CollectiveOp>;
 
   using KernelType = enable_sm100_only,
       CollectiveMainloop, CollectiveEpilogue>>;
@@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                    torch::Tensor const& b,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
   using GemmKernel = typename Gemm::GemmKernel;
   using StrideA = typename Gemm::GemmKernel::StrideA;
   using StrideB = typename Gemm::GemmKernel::StrideB;
@@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   using ElementD = typename Gemm::ElementD;
 
   int32_t m = a.size(0), n = b.size(1), k = a.size(1);
-  auto prob_shape = cute::make_shape(m, n, k, 1);
 
   StrideA a_stride;
   StrideB b_stride;
@@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   b_stride =
       cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
   c_stride =
-      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+      cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
 
-  LayoutSFA layout_SFA =
+  LayoutSFA layout_SFA = swap_ab ?
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
       ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
-  LayoutSFB layout_SFB =
+  LayoutSFB layout_SFB = swap_ab ?
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
       ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
 
   auto a_ptr = static_cast(a.data_ptr());
@@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   auto a_scales_ptr = static_cast(a_scales.data_ptr());
   auto b_scales_ptr = static_cast(b_scales.data_ptr());
 
-  typename GemmKernel::MainloopArguments mainloop_args{
-      a_ptr, a_stride, b_ptr, b_stride,
-      a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};
+  auto mainloop_args = [&](){
+    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+    if (swap_ab) {
+      return typename GemmKernel::MainloopArguments{
+        b_ptr, b_stride, a_ptr, a_stride,
+        b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
+      };
+    }
+    else {
+      return typename GemmKernel::MainloopArguments{
+        a_ptr, a_stride, b_ptr, b_stride,
+        a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+      };
+    }
+  }();
+  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
 
   auto c_ptr = static_cast(out.data_ptr());
   typename GemmKernel::EpilogueArguments epilogue_args{
@@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
                                                torch::Tensor const& b,
                                                torch::Tensor const& a_scales,
                                                torch::Tensor const& b_scales) {
-  auto m = a.size(0);
-  auto k = a.size(1);
-  auto n = b.size(1);
-  int sms;
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
   cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
 
-  auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
-    return std::ceil(static_cast(m) / tile1SM) *
-               std::ceil(static_cast(n) / tile1SM) >=
-           sms;
-  };
-  bool use_2sm = should_use_2sm(m, n);
-  if (use_2sm) {
-    cutlass_gemm_caller_blockwise, Shape<_256, _1, _1>,
-        Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-        out, a, b, a_scales, b_scales);
+  constexpr int TILE_K = 128;
+  // TODO: better heuristics
+  bool swap_ab = (m < 16) || (m % 4 != 0);
+  bool use_tma_epilogue = (m * n) % 4 == 0;
+  if (!swap_ab) {
+    constexpr int TILE_N = 128;
+    int tile_m = 256;
+    if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
+      tile_m = 64;
+    }
+    else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
+      tile_m = 128;
+    }
+    if (tile_m == 64) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise, Int>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise, Int>,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else if (tile_m == 128) {
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise, Int>,
+            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise, Int>,
+            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    } else { // tile_m == 256
+      if (use_tma_epilogue) {
+        cutlass_gemm_caller_blockwise, Int>,
+            Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      } else {
+        cutlass_gemm_caller_blockwise, Int>,
+            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
+            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+            out, a, b, a_scales, b_scales);
+      }
+    }
   } else {
+    // TODO: Test more tile N configs
+    constexpr int TILE_M = 128;
+    constexpr int TILE_N = 16;
+    // TMA epilogue isn't compatible with Swap A/B
     cutlass_gemm_caller_blockwise, Shape<_128, _1, _1>,
-        Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
+        OutType, TILE_M, 1, TILE_K, Shape, Int, Int>,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
        out, a, b, a_scales, b_scales);
  }
 }
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
index 468b77d9593b..6da2da634075 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller;
 template
           typename Epilogue>
 struct sm100_fp8_config_default {
+  // M in (128, inf)
   static_assert(std::is_same());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
@@ -25,6 +26,34 @@ struct sm100_fp8_config_default {
                                KernelSchedule, EpilogueSchedule>;
 };
 
+template
+          typename Epilogue>
+struct sm100_fp8_config_M128 {
+  // M in (64, 128]
+  static_assert(std::is_same());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_128, _128, _64>;
+  using ClusterShape = Shape<_2, _2, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100;
+};
+
+template
+          typename Epilogue>
+struct sm100_fp8_config_M64 {
+  // M in [1, 64]
+  static_assert(std::is_same());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _8, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100;
+};
+
 template
           typename Epilogue,
           typename... EpilogueArgs>
@@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
   using Cutlass3xGemmDefault =
       typename sm100_fp8_config_default::Cutlass3xGemm;
 
-  return cutlass_gemm_caller(
-      out, a, b, std::forward(args)...);
+  using Cutlass3xGemmM64 =
+      typename sm100_fp8_config_M64::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm100_fp8_config_M128::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast(64), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 64) {
+    // m in [1, 64]
+    return cutlass_gemm_caller(
+        out, a, b, std::forward(args)...);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_gemm_caller(
+        out, a, b, std::forward(args)...);
+  } else {
+    // m in (128, inf)
+    return cutlass_gemm_caller(
+        out, a, b, std::forward(args)...);
+  }
 }
 
 template