 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

-if not vllm_version_is("0.10.1.1"):
+if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
     from vllm.v1.outputs import DraftTokenIds
 else:
     DraftTokenIds = None
@@ -384,7 +384,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove finished requests from the cached states.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 self.encoder_cache.pop(req_id, None)
         # Remove the finished requests from the persistent batch.
         # NOTE(woosuk): There could be an edge case where finished_req_ids and
@@ -394,7 +394,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # and handling the second as a new request.
         for req_id in scheduler_output.finished_req_ids:
             self.input_batch.remove_request(req_id)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             # Free the cached encoder outputs.
             for req_id, input_id in scheduler_output.free_encoder_input_ids:
                 encoder_outputs = self.encoder_cache.get(req_id)
@@ -455,9 +455,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 lora_request=new_req_data.lora_request,
                 **({
                     "mm_hashes": new_req_data.mm_hashes
-                } if not vllm_version_is("0.10.1.1") else {
-                    "mm_hashes": None
-                }),
+                } if not (vllm_version_is("0.10.1.1")
+                          or vllm_version_is("0.10.1")) else {
+                    "mm_hashes": None
+                }),
             )

             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
@@ -893,13 +894,13 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):

         # Batch the multi-modal inputs.
         mm_kwargs = list[MultiModalKwargsItem]()
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
         else:
             mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
         for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
             req_state = self.requests[req_id]
-            if vllm_version_is("0.10.1.1"):
+            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                 for mm_input_id in encoder_input_ids:
                     mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
                     req_ids_pos.append((req_id, mm_input_id,
@@ -942,7 +943,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):

             for output in curr_group_outputs:
                 encoder_outputs.append(output)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             # Cache the encoder outputs.
             for (req_id, input_id, pos_info), output in zip(
                     req_ids_pos,
@@ -974,7 +975,7 @@ def _gather_mm_embeddings(
             req_state = self.requests[req_id]
             num_computed_tokens = req_state.num_computed_tokens
             mm_positions = req_state.mm_positions
-            if not vllm_version_is("0.10.1.1"):
+            if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
                 mm_hashes = req_state.mm_hashes
             for i, pos_info in enumerate(mm_positions):
                 start_pos = pos_info.offset
@@ -993,7 +994,7 @@ def _gather_mm_embeddings(
                     continue

                 start_idx = max(num_computed_tokens - start_pos, 0)
-                if vllm_version_is("0.10.1.1"):
+                if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                     end_idx = min(
                         num_computed_tokens - start_pos + num_scheduled_tokens,
                         num_encoder_tokens)
@@ -1719,7 +1720,8 @@ def execute_model(
                 logits = None
             else:
                 if self.input_batch.pooling_params:
-                    if vllm_version_is("0.10.1.1"):
+                    if vllm_version_is("0.10.1.1") or vllm_version_is(
+                            "0.10.1"):
                         return self._pool_v010(
                             hidden_states,
                             scheduler_output.total_num_scheduled_tokens,
@@ -1867,7 +1869,7 @@ def execute_model(

         extra_args = ({"kv_connector_output": kv_connector_output})

-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             model_runner_output = ModelRunnerOutput(
                 req_ids=self.input_batch.req_ids,
                 req_id_to_index=self.input_batch.req_id_to_index,
@@ -2191,7 +2193,7 @@ def _dummy_pooler_run_task(
         dummy_pooling_params = PoolingParams(task=task)
         to_update = model.pooler.get_pooling_updates(task)
         to_update.apply(dummy_pooling_params)
-        if vllm_version_is("0.10.1.1"):
+        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
             dummy_prompt_lens = torch.tensor(
                 [h.shape[0] for h in hidden_states_list],
                 device=self.device,
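Side note on the pattern above: every touched call site now repeats the same two-version gate, `vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")`. Below is a minimal sketch of how that could be folded into one helper; `vllm_version_is` is the predicate already used in this file (its import path here is an assumption), and `vllm_version_is_any` is a hypothetical name introduced only for illustration, not part of this change.

```python
# Sketch only, not part of this PR: collapse the repeated
#   vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")
# gates into a single membership check.
from typing import Iterable

from vllm_ascend.utils import vllm_version_is  # assumed import path


def vllm_version_is_any(versions: Iterable[str]) -> bool:
    """Return True if the installed vLLM matches any of `versions`."""
    return any(vllm_version_is(v) for v in versions)


# The gates above would then read, for example:
#     if vllm_version_is_any(("0.10.1.1", "0.10.1")):
#         self.encoder_cache.pop(req_id, None)
```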