
Commit 8baf750

Author: hw_whx (committed)
feat: add watermark schema
Signed-off-by: hw_whx <wanghexiang7@huawei.com>
1 parent 72e71f6 commit 8baf750

File tree: 3 files changed (+22, -3 lines)


vllm_ascend/core/schedule_config.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ def __post_init__(self) -> None:
             raise NotImplementedError(f"currently AscendScheduler only supports fcfs policy, got {self.policy}")
         if self.is_multimodal_model:
             raise NotImplementedError(f"currently AscendScheduler only supports LLM modles.")
-        if self.num_scheduler_steps >t1:
+        if self.num_scheduler_steps > 1:
             raise NotImplementedError(f"currently AscendScheduler doesn't support multi-step.")
         if self.send_delta_data:
             raise NotImplementedError(f"currently AscendScheduler doesn't support send_delta_data.")
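
For context, this hunk sits in the validation block of the scheduler config's __post_init__. The one-character fix matters: "num_scheduler_steps >t1" parses as a comparison against an undefined name t1 and would raise NameError the first time the guard ran, whereas "> 1" restores the intended single-step check. A minimal, hypothetical sketch of the surrounding validation pattern (the class name, field defaults, and the fcfs condition outside the hunk are assumptions, not the upstream code):

from dataclasses import dataclass


@dataclass
class AscendSchedulerConfig:
    # Assumed fields; only the checks below appear in the diff.
    policy: str = "fcfs"
    is_multimodal_model: bool = False
    num_scheduler_steps: int = 1
    send_delta_data: bool = False

    def __post_init__(self) -> None:
        if self.policy != "fcfs":
            raise NotImplementedError(
                f"currently AscendScheduler only supports fcfs policy, got {self.policy}")
        if self.is_multimodal_model:
            raise NotImplementedError("currently AscendScheduler only supports LLM models.")
        if self.num_scheduler_steps > 1:
            raise NotImplementedError("currently AscendScheduler doesn't support multi-step.")
        if self.send_delta_data:
            raise NotImplementedError("currently AscendScheduler doesn't support send_delta_data.")

Constructing AscendSchedulerConfig(num_scheduler_steps=2) under this sketch raises immediately, which is the behaviour the corrected comparison is meant to enforce.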

vllm_ascend/core/scheduler.py

Lines changed: 14 additions & 2 deletions
@@ -1,8 +1,7 @@
 from collections import deque
-from dataclasses import dataclass
-from typing import List
 
 from vllm.logger import init_logger
+from vllm.utils import cdiv
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.core.scheduler_output import (NewRequestData,
                                            SchedulerOutput)
@@ -88,6 +87,11 @@ def skip_cur_request():
                 skip_cur_request()
                 continue
 
+            if not self._check_watermark_for_prefill(num_new_tokens):
+                # Scheduling would exceed watermark, skip.
+                skip_cur_request()
+                continue
+
             assert num_new_tokens > 0
             new_blocks = self.kv_cache_manager.allocate_slots(
                 request, num_new_tokens, computed_blocks)
@@ -261,6 +265,14 @@ def skip_cur_request():
 
         self.finished_req_ids = set()
         return scheduler_output
+
+
+    def _check_watermark_for_prefill(self, num_new_tokens, watermark = 0.01):
+        watermark_blocks = self.cache_config.num_gpu_blocks * watermark
+        num_required_blocks = cdiv(num_new_tokens, self.block_size)
+        if (self.kv_cache_manager.free_block_queue.num_free_blocks - num_required_blocks) < watermark_blocks:
+            return False
+        return True
 
 
     def _get_prompt_limit(self, request: Request) -> int:
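
The new _check_watermark_for_prefill gate keeps a small reserve of KV-cache blocks: with the default watermark of 0.01, a waiting request is admitted to prefill only if, after allocating cdiv(num_new_tokens, block_size) blocks for it, at least 1% of all GPU blocks would still be free; otherwise the schedule loop above calls skip_cur_request() and moves on. A standalone sketch of the same arithmetic, with illustrative numbers (the block counts below are assumptions for the example, not values from the commit):

def cdiv(a: int, b: int) -> int:
    # Ceiling division, mirroring vllm.utils.cdiv.
    return -(a // -b)


def check_watermark_for_prefill(num_new_tokens: int,
                                block_size: int,
                                num_gpu_blocks: int,
                                num_free_blocks: int,
                                watermark: float = 0.01) -> bool:
    # Reserve a fraction of the total KV-cache blocks as headroom.
    watermark_blocks = num_gpu_blocks * watermark
    # Blocks this prefill would need for its new tokens.
    num_required_blocks = cdiv(num_new_tokens, block_size)
    # Admit the request only if the reserve stays intact after allocation.
    return (num_free_blocks - num_required_blocks) >= watermark_blocks


# With 10_000 total blocks the reserve is 100 blocks; a 4096-token prompt at
# block_size=128 needs 32 blocks, so it is admitted while at least 132 blocks
# are free and skipped below that.
print(check_watermark_for_prefill(4096, 128, 10_000, 150))  # True
print(check_watermark_for_prefill(4096, 128, 10_000, 120))  # False

Skipping rather than raising leaves the request in the waiting queue, so it is retried on a later scheduling step once enough blocks have been freed.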

vllm_ascend/worker/worker_v1.py

Lines changed: 7 additions & 0 deletions
@@ -41,6 +41,7 @@
 
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
+from vllm_ascend.utils import try_register_lib
 
 logger = init_logger(__name__)
 
@@ -74,6 +75,12 @@ def __init__(self,
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
 
+        # Try to import mindie_turbo to accelerate vLLM inference.
+        try_register_lib(
+            "mindie_turbo",
+            "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
+        )
+
         if self.cache_config.cache_dtype == "auto":
             self.cache_dtype = self.model_config.dtype
         else:
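
try_register_lib is imported from vllm_ascend.utils and its body is not part of this diff; the call here acts as an optional-dependency guard, so worker start-up is unaffected when mindie_turbo is not installed. A plausible, hypothetical reconstruction of the helper under that assumption (the real signature and behaviour may differ):

import importlib
import logging

logger = logging.getLogger(__name__)


def try_register_lib(lib_name: str, lib_info: str = "") -> None:
    # Hypothetical sketch: importing the module is enough to trigger its
    # registration side effects; a missing package is tolerated silently.
    try:
        importlib.import_module(lib_name)
        if lib_info:
            logger.info(lib_info)
    except ImportError:
        pass


# Mirrors the call added in the __init__ hunk above.
try_register_lib(
    "mindie_turbo",
    "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
)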
