From caabd388bc93837ac1a28439bef8324e3bb0986b Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 10 Feb 2025 15:48:35 -0800
Subject: [PATCH 1/3] [V1][Minor] Move scheduler output to a separate file

Signed-off-by: Woosuk Kwon
---
 vllm/v1/core/scheduler.py          | 89 +-----------------------------
 vllm/v1/core/scheduler_output.py   | 87 +++++++++++++++++++++++++++++
 vllm/v1/worker/gpu_model_runner.py |  2 +-
 3 files changed, 91 insertions(+), 87 deletions(-)
 create mode 100644 vllm/v1/core/scheduler_output.py

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 1aa34ee38602..1c54914d182b 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -1,26 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from collections import deque
-from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
-                    Tuple, Union)
+from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig
 from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.sampling_params import SamplingParams
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                 compute_encoder_budget)
 from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData,
+                                           SchedulerOutput)
 from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 
-if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalKwargs
-    from vllm.multimodal.base import PlaceholderRange
-
 logger = init_logger(__name__)
 
 
@@ -600,80 +594,3 @@ def make_stats(self) -> SchedulerStats:
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
         )
-
-
-@dataclass
-class NewRequestData:
-
-    req_id: str
-    prompt_token_ids: List[int]
-    prompt: Optional[str]
-    mm_inputs: List["MultiModalKwargs"]
-    mm_hashes: List[str]
-    mm_positions: List["PlaceholderRange"]
-    sampling_params: SamplingParams
-    block_ids: List[int]
-    num_computed_tokens: int
-    lora_request: Optional[LoRARequest]
-
-    @classmethod
-    def from_request(
-        cls,
-        request: Request,
-        block_ids: List[int],
-        num_computed_tokens: int,
-    ) -> "NewRequestData":
-        return cls(
-            req_id=request.request_id,
-            prompt_token_ids=request.prompt_token_ids,
-            prompt=request.prompt,
-            mm_inputs=request.mm_inputs,
-            mm_hashes=request.mm_hashes,
-            mm_positions=request.mm_positions,
-            sampling_params=request.sampling_params,
-            block_ids=block_ids,
-            num_computed_tokens=num_computed_tokens,
-            lora_request=request.lora_request,
-        )
-
-
-@dataclass
-class CachedRequestData:
-
-    req_id: str
-    # If resumed_from_preemption is False, new_block_ids will be appended to
-    # the request's block IDs. If True, new_block_ids will be used as the
-    # request's block IDs instead of appending to the existing block IDs.
-    resumed_from_preemption: bool
-    new_block_ids: List[int]
-    num_computed_tokens: int
-
-    @classmethod
-    def from_request(
-        cls,
-        request: Request,
-        resumed_from_preemption: bool,
-        new_block_ids: List[int],
-        num_computed_tokens: int,
-    ) -> "CachedRequestData":
-        return cls(
-            req_id=request.request_id,
-            resumed_from_preemption=resumed_from_preemption,
-            new_block_ids=new_block_ids,
-            num_computed_tokens=num_computed_tokens,
-        )
-
-
-@dataclass
-class SchedulerOutput:
-
-    scheduled_new_reqs: List[NewRequestData]
-    scheduled_cached_reqs: List[CachedRequestData]
-
-    num_scheduled_tokens: Dict[str, int]
-    total_num_scheduled_tokens: int
-    scheduled_encoder_inputs: Dict[str, List[int]]
-    num_common_prefix_blocks: int
-
-    finished_req_ids: Set[str]
-    free_encoder_input_ids: List[Tuple[str, int]]
diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/scheduler_output.py
new file mode 100644
index 000000000000..6bc3c036a5f3
--- /dev/null
+++ b/vllm/v1/core/scheduler_output.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
+
+if TYPE_CHECKING:
+    from vllm.lora.request import LoRARequest
+    from vllm.multimodal import MultiModalKwargs
+    from vllm.multimodal.base import PlaceholderRange
+    from vllm.sampling_params import SamplingParams
+    from vllm.v1.request import Request
+
+
+@dataclass
+class NewRequestData:
+
+    req_id: str
+    prompt_token_ids: List[int]
+    prompt: Optional[str]
+    mm_inputs: List["MultiModalKwargs"]
+    mm_hashes: List[str]
+    mm_positions: List["PlaceholderRange"]
+    sampling_params: "SamplingParams"
+    block_ids: List[int]
+    num_computed_tokens: int
+    lora_request: Optional["LoRARequest"]
+
+    @classmethod
+    def from_request(
+        cls,
+        request: "Request",
+        block_ids: List[int],
+        num_computed_tokens: int,
+    ) -> "NewRequestData":
+        return cls(
+            req_id=request.request_id,
+            prompt_token_ids=request.prompt_token_ids,
+            prompt=request.prompt,
+            mm_inputs=request.mm_inputs,
+            mm_hashes=request.mm_hashes,
+            mm_positions=request.mm_positions,
+            sampling_params=request.sampling_params,
+            block_ids=block_ids,
+            num_computed_tokens=num_computed_tokens,
+            lora_request=request.lora_request,
+        )
+
+
+@dataclass
+class CachedRequestData:
+
+    req_id: str
+    # If resumed_from_preemption is False, new_block_ids will be appended to
+    # the request's block IDs. If True, new_block_ids will be used as the
+    # request's block IDs instead of appending to the existing block IDs.
+    resumed_from_preemption: bool
+    new_block_ids: List[int]
+    num_computed_tokens: int
+
+    @classmethod
+    def from_request(
+        cls,
+        request: "Request",
+        resumed_from_preemption: bool,
+        new_block_ids: List[int],
+        num_computed_tokens: int,
+    ) -> "CachedRequestData":
+        return cls(
+            req_id=request.request_id,
+            resumed_from_preemption=resumed_from_preemption,
+            new_block_ids=new_block_ids,
+            num_computed_tokens=num_computed_tokens,
+        )
+
+
+@dataclass
+class SchedulerOutput:
+
+    scheduled_new_reqs: List[NewRequestData]
+    scheduled_cached_reqs: List[CachedRequestData]
+
+    num_scheduled_tokens: Dict[str, int]
+    total_num_scheduled_tokens: int
+    scheduled_encoder_inputs: Dict[str, List[int]]
+    num_common_prefix_blocks: int
+
+    finished_req_ids: Set[str]
+    free_encoder_input_ids: List[Tuple[str, int]]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index fdbca70bda71..9b1eab613bf7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -36,7 +36,7 @@
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
 if TYPE_CHECKING:
-    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.core.scheduler_output import SchedulerOutput
 
 logger = init_logger(__name__)
 

From b0c3b0c9b28fa6f641de7ac95514b83b4041717b Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 10 Feb 2025 15:57:57 -0800
Subject: [PATCH 2/3] comments

Signed-off-by: Woosuk Kwon
---
 vllm/v1/core/scheduler_output.py | 21 +++++++++++++++++++++
 vllm/v1/worker/gpu_worker.py     |  3 +--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/scheduler_output.py
index 6bc3c036a5f3..cdcdc03733cd 100644
--- a/vllm/v1/core/scheduler_output.py
+++ b/vllm/v1/core/scheduler_output.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
 
@@ -75,13 +76,33 @@ def from_request(
 @dataclass
 class SchedulerOutput:
 
+    # List of the requests that are scheduled for the first time.
+    # We cache the request's data in each worker process, so that we don't
+    # need to re-send it every scheduling step.
     scheduled_new_reqs: List[NewRequestData]
+    # List of the requests that have been scheduled before.
+    # Since the request's data is already cached in the worker processes,
+    # we only send the diff to minimize the communication cost.
     scheduled_cached_reqs: List[CachedRequestData]
 
+    # req_id -> num_scheduled_tokens
+    # Number of tokens scheduled for each request.
     num_scheduled_tokens: Dict[str, int]
+    # Total number of tokens scheduled for all requests.
+    # Equal to sum(num_scheduled_tokens.values())
     total_num_scheduled_tokens: int
+    # req_id -> indices of encoder inputs scheduled for each request
+    # E.g., [0, 1] means that 0-th and 1-th images are scheduled to run on
+    # the vision encoder at the current step.
     scheduled_encoder_inputs: Dict[str, List[int]]
+    # Number of common prefix blocks for all requests.
+    # This can be used for cascade attention.
     num_common_prefix_blocks: int
 
+    # Request IDs that are finished in between the previous and the current
+    # steps. This is used to notify the workers about the finished requests
+    # so that they can free the cached states for those requests.
     finished_req_ids: Set[str]
+    # List of (req_id, encoder_input_index) tuples.
+    # Used to free the encoder cache.
     free_encoder_input_ids: List[Tuple[str, int]]
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0adb69073397..ad53f90b8665 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -18,7 +18,6 @@
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.utils import GiB_bytes
-from vllm.v1.core.scheduler import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
@@ -26,7 +25,7 @@
 logger = init_logger(__name__)
 
 if TYPE_CHECKING:
-    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.core.scheduler_output import SchedulerOutput
 
 
 class Worker:

From 666f120b12220b5425eff6a85abd0c58cea4ecac Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 10 Feb 2025 16:12:43 -0800
Subject: [PATCH 3/3] minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/core/scheduler_output.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/scheduler_output.py
index cdcdc03733cd..990b3dd0ed78 100644
--- a/vllm/v1/core/scheduler_output.py
+++ b/vllm/v1/core/scheduler_output.py
@@ -91,9 +91,9 @@ class SchedulerOutput:
     # Total number of tokens scheduled for all requests.
     # Equal to sum(num_scheduled_tokens.values())
     total_num_scheduled_tokens: int
-    # req_id -> indices of encoder inputs scheduled for each request
-    # E.g., [0, 1] means that 0-th and 1-th images are scheduled to run on
-    # the vision encoder at the current step.
+    # req_id -> encoder input indices that need processing.
+    # E.g., if a request has [0, 1], it could mean the vision encoder needs
+    # to process that request's 0-th and 1-th images in the current step.
     scheduled_encoder_inputs: Dict[str, List[int]]
     # Number of common prefix blocks for all requests.
    # This can be used for cascade attention.
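
Note on the CachedRequestData semantics documented in PATCH 2: the scheduler only sends a diff for requests the workers have already seen, and resumed_from_preemption decides whether new_block_ids replaces or extends the block IDs cached on the worker side. The sketch below is a minimal, standalone illustration of that append-vs-replace rule, assuming a simple dict-based cache; WorkerSideRequestState and apply_cached_request_data are hypothetical names for illustration only, not vLLM's actual worker code.

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class WorkerSideRequestState:
    """Hypothetical per-request state cached in a worker process."""
    block_ids: List[int] = field(default_factory=list)
    num_computed_tokens: int = 0


def apply_cached_request_data(
    states: Dict[str, WorkerSideRequestState],
    req_id: str,
    resumed_from_preemption: bool,
    new_block_ids: List[int],
    num_computed_tokens: int,
) -> None:
    """Apply the per-step diff carried by a CachedRequestData entry.

    Per the comment in scheduler_output.py: if the request was resumed from
    preemption, new_block_ids replaces the cached block IDs; otherwise it is
    appended to the existing block IDs.
    """
    state = states.setdefault(req_id, WorkerSideRequestState())
    if resumed_from_preemption:
        state.block_ids = list(new_block_ids)
    else:
        state.block_ids.extend(new_block_ids)
    state.num_computed_tokens = num_computed_tokens


if __name__ == "__main__":
    states: Dict[str, WorkerSideRequestState] = {}
    states["req-0"] = WorkerSideRequestState(block_ids=[3, 4],
                                             num_computed_tokens=32)

    # Ordinary step: the scheduler allocated one more block for this request.
    apply_cached_request_data(states, "req-0", False, [7], 48)
    assert states["req-0"].block_ids == [3, 4, 7]

    # Resumed after preemption: the full block list is re-sent and replaces
    # whatever the worker had cached.
    apply_cached_request_data(states, "req-0", True, [10, 11, 12], 16)
    assert states["req-0"].block_ids == [10, 11, 12]

In this sketch only block IDs and the computed-token count are cached; per the NewRequestData fields above, the per-worker cache that SchedulerOutput's comments refer to would presumably also hold the prompt tokens, sampling params, LoRA request, and multimodal inputs sent when a request is scheduled for the first time.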