33 | 33 | from vllm.sampling_params import SamplingParams |
34 | 34 | from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend |
35 | 35 | from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput |
| 36 | +from vllm.v1.request import RequestStatus |
36 | 37 |
37 | 38 | from .utils import create_request, create_scheduler, create_vllm_config |
38 | 39 |
@@ -1023,3 +1024,68 @@ def test_shutdown_cleans_up_resources(dist_init): |
1023 | 1024 | assert mock_dereg.call_count == 2 |
1024 | 1025 | mock_dereg.assert_any_call("desc1") |
1025 | 1026 | mock_dereg.assert_any_call("desc2") |
| 1027 | + |
| 1028 | + |
| 1029 | +@patch( |
| 1030 | +    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", |
| 1031 | +    FakeNixlWrapper) |
| 1032 | +def test_aborted_request_removed_from_worker_in_batch(dist_init): |
| 1033 | +    """ |
| 1034 | +    Create and schedule a request so that P adds it to in-batch tracking via |
| 1035 | +    the real scheduler, then simulate an abort (the request is absent from |
| 1036 | +    the next scheduler iteration) and verify the worker no longer tracks it. |
| 1037 | +    """ |
| 1038 | +    vllm_config = create_vllm_config() |
| 1039 | + |
| 1040 | +    scheduler = create_scheduler(vllm_config) |
| 1041 | +    # KVConnector worker on the P (prefill) side |
| 1042 | +    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) |
| 1043 | +    connector.connector_worker = FakeNixlConnectorWorker(vllm_config, |
| 1044 | +                                                          connector.engine_id, |
| 1045 | +                                                          hand_shake_latency=0) |
| 1046 | + |
| 1047 | +    # Create a request with do_remote_decode=True so that the scheduler |
| 1048 | +    # adds it to reqs_in_batch |
| 1049 | +    req = create_request(request_id=1, do_remote_decode=True, max_tokens=1) |
| 1050 | +    scheduler.add_request(req) |
| 1051 | + |
| 1052 | +    # First scheduling pass - examine the build_connector_meta output |
| 1053 | +    sched_out = scheduler.schedule() |
| 1054 | +    kv_meta = sched_out.kv_connector_metadata |
| 1055 | +    assert kv_meta is not None |
| 1056 | +    assert isinstance(kv_meta, NixlConnectorMetadata) |
| 1057 | +    assert req.request_id in kv_meta.reqs_in_batch |
| 1058 | + |
| 1059 | +    #### Model Runner start #### |
| 1060 | +    # Bind scheduler-produced metadata and start worker processing. |
| 1061 | +    connector.bind_connector_metadata(kv_meta) |
| 1062 | + |
| 1063 | +    dummy_ctx = ForwardContext( |
| 1064 | +        no_compile_layers={}, |
| 1065 | +        attn_metadata={}, |
| 1066 | +        virtual_engine=0, |
| 1067 | +    ) |
| 1068 | +    connector.start_load_kv(dummy_ctx) |
| 1069 | + |
| 1070 | +    # Ensure it was tracked by the worker |
| 1071 | +    assert req.request_id in connector.connector_worker._reqs_to_process |
| 1072 | + |
| 1073 | +    #### Model Runner end #### |
| 1074 | + |
| 1075 | +    # Abort the request - triggers request_finished in the connector scheduler |
| 1076 | +    scheduler.finish_requests(req.request_id, RequestStatus.FINISHED_ABORTED) |
| 1077 | +    # Second scheduling pass - build metadata after the request was aborted |
| 1078 | +    sched_out2 = scheduler.schedule() |
| 1079 | +    kv_meta2 = sched_out2.kv_connector_metadata |
| 1080 | +    assert kv_meta2 is not None |
| 1081 | +    assert isinstance(kv_meta2, NixlConnectorMetadata) |
| 1082 | +    assert req.request_id not in kv_meta2.reqs_in_batch |
| 1083 | + |
| 1084 | +    #### Model Runner start #### |
| 1085 | +    # Bind the post-abort metadata and run another worker step |
| 1086 | +    connector.bind_connector_metadata(kv_meta2) |
| 1087 | +    connector.start_load_kv(dummy_ctx) |
| 1088 | + |
| 1089 | +    # After the abort, the worker should no longer track the request in-batch |
| 1090 | +    assert req.request_id not in connector.connector_worker._reqs_to_process |
| 1091 | +    #### Model Runner end #### |