@@ -76,6 +76,9 @@ def __init__(
7676 # KV Connector pushes/pulls remote KVs for P/D and offloading.
7777 self .connector = None
7878 if self .vllm_config .kv_transfer_config is not None :
79+ assert len (self .kv_cache_config .kv_cache_groups ) == 1 , (
80+ "Multiple KV cache groups are not currently supported "
81+ "with KV connectors" )
7982 self .connector = KVConnectorFactory .create_connector_v1 (
8083 config = self .vllm_config , role = KVConnectorRole .SCHEDULER )
8184
@@ -985,16 +988,8 @@ def _connector_finished(
985988 """
986989 if self .connector is None :
987990 return False , None
988- assert len (self .kv_cache_config .kv_cache_groups
989- ) == 1 , "KV connector only supports one KV cache group now"
990- if (request .status == RequestStatus .FINISHED_ABORTED and \
991- request .request_id not in
992- self .kv_cache_manager .single_type_manager .req_to_blocks ):
993- # Ensure empty blocks ids are passed to respect connector interface
994- block_ids = KVCacheBlocks .create_empty ().get_block_ids ()[0 ]
995- else :
996- block_ids = self .kv_cache_manager .get_block_ids (
997- request .request_id )[0 ]
991+
992+ (block_ids , ) = self .kv_cache_manager .get_block_ids (request .request_id )
998993 return self .connector .request_finished (request , block_ids )
999994
1000995 def _update_waiting_for_remote_kv (self , request : Request ) -> bool :
@@ -1009,12 +1004,12 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool:
10091004 and the request state will be moved back to WAITING from
10101005 WAITING_FOR_REMOTE_KV.
10111006 """
1007+ assert self .connector is not None
10121008 if request .request_id not in self .finished_recving_kv_req_ids :
10131009 return False
1014- assert len (self .kv_cache_config .kv_cache_groups
1015- ) == 1 , "KV connector only supports one KV cache group now"
1010+
10161011 # Now that the blocks are ready, actually cache them.
1017- block_ids = self .kv_cache_manager .get_block_ids (request .request_id )[ 0 ]
1012+ ( block_ids , ) = self .kv_cache_manager .get_block_ids (request .request_id )
10181013 num_computed_tokens = len (block_ids ) * self .block_size
10191014 # Handle the case where num request tokens is less than one block.
10201015 num_computed_tokens = min (num_computed_tokens , request .num_tokens )
0 commit comments