2222)
2323from vllm .sampling_params import SamplingParams , StructuredOutputsParams
2424from vllm .utils .hashing import sha256
25+ from vllm .v1 .core .encoder_cache_manager import EncoderCacheManager
2526from vllm .v1 .core .kv_cache_utils import get_request_block_hasher , init_none_hash
2627from vllm .v1 .core .sched .output import CachedRequestData , SchedulerOutput
2728from vllm .v1 .core .sched .scheduler import Scheduler
@@ -1518,6 +1519,7 @@ def create_requests_with_priority(
15181519 starting_idx : int = 0 ,
15191520 same_prompt : bool = False ,
15201521 block_size : int = 16 ,
1522+ req_ids : list [int ] | None = None ,
15211523):
15221524 """Create requests with specified priorities and arrival times."""
15231525 assert len (priorities ) == num_requests
@@ -1553,6 +1555,11 @@ def create_requests_with_priority(
15531555 # Verify mm items with identical identifier are having mm_position.length
15541556 seen_hashes : dict [str , int ] = {}
15551557
1558+ if req_ids :
1559+ assert len (req_ids ) == num_requests
1560+ else :
1561+ req_ids = [f"{ i + starting_idx } " for i in range (num_requests )]
1562+
15561563 for i in range (num_requests ):
15571564 mm_features = []
15581565
@@ -1589,7 +1596,7 @@ def create_requests_with_priority(
15891596 else [i + starting_idx ] * num_tokens
15901597 )
15911598 request = Request (
1592- request_id = f" { i + starting_idx } " ,
1599+ request_id = req_ids [ i ] ,
15931600 prompt_token_ids = prompt_token_ids ,
15941601 sampling_params = sampling_params ,
15951602 pooling_params = None ,
@@ -2273,6 +2280,7 @@ def _validate_chunked_prefill_settings_for_encoder_decoder(
22732280
22742281def _assert_right_encoder_cache_allocated (
22752282 scheduler : Scheduler ,
2283+ hashes_to_check : list [str ] | None = None ,
22762284 requests : list [Request ] | None = None ,
22772285 expected_total_allocated : int | None = None ,
22782286):
@@ -2291,6 +2299,13 @@ def _assert_right_encoder_cache_allocated(
22912299 # Verify each request with MM data is in cache
22922300 cached_hashes = set (encoder_cache_manager .cached .keys ())
22932301
2302+ if hashes_to_check :
2303+ missed_hashes = set (hashes_to_check ) - cached_hashes
2304+ assert not missed_hashes , (
2305+ f"Miss hashes: { missed_hashes } "
2306+ f"Existing encoder cache: { encoder_cache_manager .cached } "
2307+ )
2308+
22942309 for req in requests if requests is not None else []:
22952310 if req .mm_features :
22962311 mm_hashes = [f .identifier for f in req .mm_features ]
@@ -2572,7 +2587,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
25722587 # Encoder cache should contain all mm items from request
25732588 _assert_right_encoder_cache_allocated (scheduler , requests = [request1 ])
25742589
2575- # Should call update_state_after_alloc for external load
2590+ # Should have called update_state_after_alloc for external load
25762591 scheduler .ec_connector .update_state_after_alloc .assert_called ()
25772592 scheduler .ec_connector .update_state_after_alloc .reset_mock ()
25782593
@@ -2716,7 +2731,7 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector):
27162731
27172732 ## Encoder-cache-specific checks:
27182733 # mm_hashes of requests exist in cache after scheduling for all scenario
2719- _assert_right_encoder_cache_allocated (scheduler , requests )
2734+ _assert_right_encoder_cache_allocated (scheduler , requests = requests )
27202735
27212736 # Should only call update_state_after_alloc when loaded externally
27222737 if cache_exist == "connector_only" :
@@ -2814,7 +2829,7 @@ def test_ec_connector_unable_to_allocate(use_kv_connector):
28142829 assert len (scheduler .running ) == 1
28152830 assert len (scheduler .waiting ) == 1
28162831
2817- # Should call update_state_after_alloc for external load
2832+ # Should have called update_state_after_alloc for external load
28182833 scheduler .ec_connector .update_state_after_alloc .assert_called_with (
28192834 scheduler .running [0 ], 0
28202835 )
@@ -3051,7 +3066,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
30513066
30523067 ## Encoder-cache-specific checks:
30533068 # mm_hash of request_low exists in cache after scheduling for all scenario
3054- _assert_right_encoder_cache_allocated (scheduler , [request_low ])
3069+ _assert_right_encoder_cache_allocated (scheduler , requests = [request_low ])
30553070
30563071 # Should only call update_state_after_alloc when loaded externally
30573072 if cache_exist == "connector_only" :
@@ -3080,6 +3095,158 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
30803095 _assert_right_encoder_inputs (output , expected_total_reqs = 0 )
30813096
30823097
@pytest.mark.parametrize("use_kv_connector", [False, True])
def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connector):
    """
    Scenario:
    - Encoder cache size: 32 tokens.
    - Request A: 1 feature (12 tokens) → NOT cached remotely.
    - Request B: 3 features (3 x 10 tokens) → ALL cached remotely.

    Steps:
    1. Schedule Request A (locally uses 12 encoder cache tokens).
    2. Schedule Request B (remote cache — loaded externally). Only the 1st and
       2nd features fit in the encoder cache, so B is chunked before the 3rd.
    3. Finish A (freeing its cache entry), then schedule B again
       (continuation): the 3rd feature can now be allocated and loaded.
    """
    scheduler = create_scheduler(
        model="llava-hf/llava-1.5-7b-hf",
        max_num_batched_tokens=1024,
        enable_prefix_caching=True,
        use_kv_connector=use_kv_connector,
        block_size=16,
        num_blocks=11,  # Can hold 160 tokens (first block is null)
        use_ec_connector=True,
        ec_role="ec_consumer",
        disable_hybrid_kv_cache_manager=use_kv_connector,
    )

    # Limit the encoder cache so that at most 32 encoder tokens can be
    # allocated at any point in time.
    scheduler.encoder_cache_manager = EncoderCacheManager(cache_size=32)

    # Create MM request1 with a single mm item.
    NUM_TOKENS_1 = 50  # NOTE: includes mm tokens
    NUM_ENCODER_TOKENS_1 = 12
    mm_hashes_list_1 = [["hash1_1"]]
    mm_positions_1 = [[PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS_1)]]

    request1 = create_requests(
        num_requests=1,
        num_tokens=NUM_TOKENS_1,
        mm_hashes_list=mm_hashes_list_1,
        mm_positions=mm_positions_1,
        max_tokens=1,  # For simplicity
        req_ids=["req1"],
    )[0]

    # Create MM request2 with 3 mm items.
    NUM_TOKENS_2 = 40
    NUM_ENCODER_TOKENS_2 = 10
    mm_hashes_list_2 = [["hash2_1", "hash2_2", "hash2_3"]]
    mm_positions_2 = [
        [
            PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS_2),
            PlaceholderRange(offset=12, length=NUM_ENCODER_TOKENS_2),
            PlaceholderRange(offset=24, length=NUM_ENCODER_TOKENS_2),
        ]
    ]

    request2 = create_requests(
        num_requests=1,
        num_tokens=NUM_TOKENS_2,
        mm_hashes_list=mm_hashes_list_2,
        mm_positions=mm_positions_2,
        max_tokens=10,
        req_ids=["req2"],
    )[0]

    # Mock cache hit: MM of request1 NOT cached remotely, request2 cached remotely
    scheduler.ec_connector.has_caches = Mock(
        side_effect=lambda req: [True, True, True] if req == request2 else [False]
    )
    scheduler.ec_connector.update_state_after_alloc = Mock(
        wraps=scheduler.ec_connector.update_state_after_alloc
    )

    scheduler.add_request(request1)
    scheduler.add_request(request2)
    output = scheduler.schedule()

    # Since the encoder cache manager can only hold 32 tokens, it should
    # allocate mm items hash1_1 (12), hash2_1 (10) and hash2_2 (10) only.
    scheduled_tokens = output.num_scheduled_tokens[request1.request_id]
    assert scheduled_tokens == NUM_TOKENS_1
    assert scheduler.get_num_unfinished_requests() == 2

    # Encoder cache should contain the mm item of request1 and the first two
    # mm items of request2.
    _assert_right_encoder_cache_allocated(
        scheduler, hashes_to_check=["hash1_1", "hash2_1", "hash2_2"]
    )

    # request2's 2nd mm item is the last call of update_state_after_alloc
    scheduler.ec_connector.update_state_after_alloc.assert_called_with(request2, 1)
    scheduler.ec_connector.update_state_after_alloc.reset_mock()

    # ECConnector should carry metadata of hash2_1 and hash2_2 ONLY
    _assert_right_ec_connector_metadata(
        output, mm_features_list=[request2.mm_features[0], request2.mm_features[1]]
    )

    # Should schedule ONLY 1 encoder input: request1's own mm item. request2's
    # items are loaded externally, so they never run through the encoder here.
    _assert_right_encoder_inputs(
        output,
        requests=[request1],
        expected_encoder_inputs=[[0]],  # index 0 of the mm item of request1
        expected_total_reqs=1,
    )

    # Simulate model execution 1 step
    model_output = ModelRunnerOutput(
        req_ids=[request1.request_id, request2.request_id],
        req_id_to_index={request1.request_id: 0, request2.request_id: 1},
        sampled_token_ids=[[100], [121]],
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[],
    )
    scheduler.update_from_output(output, model_output)

    # request1 is finished after outputting 1 token (max_tokens=1).
    scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED)
    assert scheduler.get_num_unfinished_requests() == 1

    # Schedule again.
    # request1's encoder cache entry should now be freed -> hash2_3 can be
    # scheduled and allocated.
    output = scheduler.schedule()

    # All remaining prompt tokens of request2 should be scheduled now.
    # NOTE(review): assumes round 1 was chunked exactly at the 3rd
    # placeholder's offset (24), since hash2_3 could not be allocated then.
    scheduled_tokens = output.num_scheduled_tokens[request2.request_id]
    assert scheduled_tokens == NUM_TOKENS_2 - mm_positions_2[0][2].offset

    # Encoder cache should contain all mm items from request2
    _assert_right_encoder_cache_allocated(scheduler, requests=[request2])

    # request2's 3rd mm item is the ONLY call of update_state_after_alloc
    scheduler.ec_connector.update_state_after_alloc.assert_called_once()
    scheduler.ec_connector.update_state_after_alloc.assert_called_with(request2, 2)
    scheduler.ec_connector.update_state_after_alloc.reset_mock()

    # ECConnector should carry metadata for hash2_3 ONLY
    _assert_right_ec_connector_metadata(
        output, mm_features_list=[request2.mm_features[2]]
    )

    # Should schedule no encoder input
    _assert_right_encoder_inputs(
        output,
        expected_total_reqs=0,
    )
3248+
3249+
30833250# ==============================================================================
30843251# EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests end
30853252# ==============================================================================
0 commit comments