3 files changed, +22 −3 lines changed

@@ -34,7 +34,7 @@ def __post_init__(self) -> None:
             raise NotImplementedError(f"currently AscendScheduler only supports fcfs policy, got {self.policy}")
         if self.is_multimodal_model:
             raise NotImplementedError(f"currently AscendScheduler only supports LLM models.")
-        if self.num_scheduler_steps > t1:
+        if self.num_scheduler_steps > 1:
             raise NotImplementedError(f"currently AscendScheduler doesn't support multi-step.")
         if self.send_delta_data:
             raise NotImplementedError(f"currently AscendScheduler doesn't support send_delta_data.")
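The `-`/`+` pair above fixes a typo: the guard compared against an undefined name `t1` instead of the literal `1`, so constructing the config with multi-step enabled would have raised a `NameError` rather than the intended `NotImplementedError`. A minimal sketch of the same fail-fast `__post_init__` pattern (the field set and class name here are assumptions for illustration, not the full class from the PR):

    from dataclasses import dataclass

    @dataclass
    class AscendSchedulerConfig:
        policy: str = "fcfs"
        is_multimodal_model: bool = False
        num_scheduler_steps: int = 1
        send_delta_data: bool = False

        def __post_init__(self) -> None:
            # Reject configurations the scheduler cannot serve yet.
            if self.num_scheduler_steps > 1:
                raise NotImplementedError(
                    "currently AscendScheduler doesn't support multi-step.")

    # AscendSchedulerConfig(num_scheduler_steps=2) -> NotImplementedError
    # With the old `> t1` comparison it raised NameError instead.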
@@ -1,8 +1,7 @@
 from collections import deque
-from dataclasses import dataclass
-from typing import List
 
 from vllm.logger import init_logger
+from vllm.utils import cdiv
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.core.scheduler_output import (NewRequestData,
                                            SchedulerOutput)
@@ -88,6 +87,11 @@ def skip_cur_request():
                 skip_cur_request()
                 continue
 
+            if not self._check_watermark_for_prefill(num_new_tokens):
+                # Scheduling would exceed watermark, skip.
+                skip_cur_request()
+                continue
+
             assert num_new_tokens > 0
             new_blocks = self.kv_cache_manager.allocate_slots(
                 request, num_new_tokens, computed_blocks)
@@ -261,6 +265,14 @@ def skip_cur_request():
 
         self.finished_req_ids = set()
         return scheduler_output
+
+
+    def _check_watermark_for_prefill(self, num_new_tokens, watermark=0.01):
+        watermark_blocks = self.cache_config.num_gpu_blocks * watermark
+        num_required_blocks = cdiv(num_new_tokens, self.block_size)
+        if (self.kv_cache_manager.free_block_queue.num_free_blocks - num_required_blocks) < watermark_blocks:
+            return False
+        return True
 
 
     def _get_prompt_limit(self, request: Request) -> int:
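The watermark check above reserves a small fraction (1% by default) of all GPU KV-cache blocks as headroom, so a single prefill cannot drain the block pool completely. A standalone sketch of the same arithmetic, using `math.ceil` in place of vLLM's `cdiv` (the function and parameter names here are illustrative, not from the PR):

    from math import ceil

    def check_watermark_for_prefill(num_new_tokens: int,
                                    block_size: int,
                                    num_gpu_blocks: int,
                                    num_free_blocks: int,
                                    watermark: float = 0.01) -> bool:
        # Reserve `watermark` of all GPU blocks as headroom.
        watermark_blocks = num_gpu_blocks * watermark
        # Blocks needed for the new tokens, rounded up (cdiv in vLLM).
        num_required_blocks = ceil(num_new_tokens / block_size)
        # Schedule only if the allocation leaves the headroom intact.
        return (num_free_blocks - num_required_blocks) >= watermark_blocks

    # Example: 1000 GPU blocks total, block_size 16, 50 blocks free.
    # A 512-token prefill needs 32 blocks, leaving 18 >= 10 -> schedulable.
    assert check_watermark_for_prefill(512, 16, 1000, 50)
    # A 768-token prefill needs 48 blocks, leaving 2 < 10 -> skipped.
    assert not check_watermark_for_prefill(768, 16, 1000, 50)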
@@ -41,6 +41,7 @@
 
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
+from vllm_ascend.utils import try_register_lib
 
 logger = init_logger(__name__)
 
@@ -74,6 +75,12 @@ def __init__(self,
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
 
+        # Try to import mindie_turbo to accelerate vLLM inference.
+        try_register_lib(
+            "mindie_turbo",
+            "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
+        )
+
         if self.cache_config.cache_dtype == "auto":
             self.cache_dtype = self.model_config.dtype
         else:
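The PR does not show the body of `vllm_ascend.utils.try_register_lib`, only its call site. A plausible sketch of such an optional-import helper, assuming it simply imports the library if present and logs the message, is below; the real implementation may differ:

    import importlib
    import logging

    logger = logging.getLogger(__name__)

    def try_register_lib(lib_name: str, lib_info: str = "") -> None:
        """Import an optional acceleration library if it is installed.

        Assumed implementation for illustration; plugin-style libraries
        typically register their hooks as a side effect of import.
        """
        try:
            importlib.import_module(lib_name)
            if lib_info:
                logger.info(lib_info)
        except ImportError:
            logger.debug("%s is not installed; continuing without it.",
                         lib_name)

Wrapping the import this way keeps `mindie_turbo` strictly optional: workers start normally when it is absent, and acceleration is picked up automatically when it is installed.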