
Commit 86987c6

whx-sjtu authored and hw_whx committed
fix ci problems
Signed-off-by: hw_whx <2952154980@qq.com>
1 parent 8baf750 commit 86987c6

9 files changed: +184 -119 lines changed


tests/test_scheduler.py

Lines changed: 31 additions & 20 deletions
@@ -1,14 +1,32 @@
-# SPDX-License-Identifier: Apache-2.0
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/blob/main/tests/models/utils.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 from typing import List, Optional

 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
-from vllm_ascend.core.scheduler import AscendScheduler
 from vllm.v1.core.scheduler import SchedulerOutput
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus

+from vllm_ascend.core.scheduler import AscendScheduler
+
 EOS_TOKEN_ID = 50256

@@ -39,11 +57,11 @@ def create_scheduler(
     )
     cache_config.num_gpu_blocks = 10000
     return AscendScheduler(scheduler_config,
-                     model_config,
-                     cache_config,
-                     speculative_config=None,
-                     lora_config=None,
-                     log_stats=True)
+                           model_config,
+                           cache_config,
+                           speculative_config=None,
+                           lora_config=None,
+                           log_stats=True)


 def create_requests(
@@ -136,7 +154,6 @@ def test_schedule():
         assert scheduler.running[i] == request


-
 def test_stop_via_update_from_output():
     """Test stopping behavior through update_from_output"""
     scheduler = create_scheduler()
@@ -167,10 +184,8 @@ def test_stop_via_update_from_output():

     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(requests)
-        },
+        req_id_to_index={req.request_id: i
+                         for i, req in enumerate(requests)},
         sampled_token_ids=[[EOS_TOKEN_ID],
                            [10,
                             11]],  # First request hits EOS, second continues
@@ -217,10 +232,8 @@ def test_stop_via_update_from_output():

     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(requests)
-        },
+        req_id_to_index={req.request_id: i
+                         for i, req in enumerate(requests)},
         sampled_token_ids=[[10, 42, 12],
                            [13, 14]],  # First request hits stop token
         spec_token_ids=None,
@@ -265,10 +278,8 @@ def test_stop_via_update_from_output():

     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={
-            req.request_id: i
-            for i, req in enumerate(requests)
-        },
+        req_id_to_index={req.request_id: i
+                         for i, req in enumerate(requests)},
         sampled_token_ids=[[10, 11, 12],
                            [13]],  # First request exceeds max_tokens
         spec_token_ids=None,

vllm_ascend/attention/attention.py

Lines changed: 6 additions & 7 deletions
@@ -99,7 +99,7 @@ def get_decode_attn_mask(
             self.update_attn_cache(max_s, dtype, device)
         return (self.attn_mask_cache.index_select(
             0, input_lengths)[:, :max_s].view(-1, 1, max_s).contiguous())
-
+
     def get_splitfuse_attn_mask(
         self,
         seq_lens,
@@ -115,16 +115,15 @@ def get_splitfuse_attn_mask(
             # is not the same. Fix this in the future when kernel is ready.
             if self.attn_mask_cache[0][1] > 0:
                 attn_mask = self.get_attn_mask(  # type: ignore
-                        max_seq_len, dtype, device)
+                    max_seq_len, dtype, device)
                 attn_mask *= -10000
             else:
                 attn_mask = self.attn_mask_cache
-            return torch.index_select(attn_mask,
-                                      dim=0,
-                                      index=position)[:, :max_seq_len]
+            return torch.index_select(attn_mask, dim=0,
+                                      index=position)[:, :max_seq_len]
         total_q_len = sum(query_lens)
         attn_mask = torch.zeros((total_q_len, max_seq_len),
-                                dtype=self.vllm_config.model_config.dtype,
+                                dtype=dtype,
                                 device="cpu")

         current_row = 0
@@ -142,7 +141,7 @@ def get_splitfuse_attn_mask(
                 right_tensor.tril() == self.splitfuse_mask_value, 0)
             current_row += q_len

-        return attn_mask.to(self.device, non_blocking=True)
+        return attn_mask.to(device, non_blocking=True)


 class AscendAttentionBackend(AttentionBackend):

vllm_ascend/attention/attention_v1.py

Lines changed: 9 additions & 12 deletions
@@ -19,8 +19,6 @@
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple, Type

-import numpy as np
-
 import torch
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
@@ -220,16 +218,15 @@ def forward(
             assert attn_metadata is not None
             assert attn_metadata.attn_mask is not None
             mask = attn_metadata.attn_mask
-            torch_npu._npu_flash_attention(
-                query=query,
-                key=key,
-                value=value,
-                mask=mask,
-                seq_len=attn_metadata.seq_lens,
-                scale_value=self.scale,
-                num_heads=self.num_heads,
-                num_kv_heads=self.num_kv_heads,
-                out=output)
+            torch_npu._npu_flash_attention(query=query,
+                                           key=key,
+                                           value=value,
+                                           mask=mask,
+                                           seq_len=attn_metadata.seq_lens,
+                                           scale_value=self.scale,
+                                           num_heads=self.num_heads,
+                                           num_kv_heads=self.num_kv_heads,
+                                           out=output)
         elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
             block_tables = attn_metadata.block_tables
             torch_npu._npu_paged_attention(
Lines changed: 37 additions & 12 deletions
@@ -1,5 +1,22 @@
-from dataclasses import dataclass, asdict
-from typing import Union, Type
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from dataclasses import asdict, dataclass
+from typing import Type, Union

 from vllm.config import SchedulerConfig

@@ -9,34 +26,42 @@ class AscendSchedulerConfig(SchedulerConfig):
     enable_chunked_prefill: bool = False
     policy: str = "fcfs"
     num_scheduler_steps: int = 1
-    scheduler_cls: Union[str, Type[object]] = "vllm_ascend.core.scheduler.AscendScheduler"
-
+    scheduler_cls: Union[
+        str, Type[object]] = "vllm_ascend.core.scheduler.AscendScheduler"

     @classmethod
-    def initialize_from_config(cls, vllm_scheduler_config: SchedulerConfig, ascend_scheduler_config: dict):
+    def initialize_from_config(cls, vllm_scheduler_config: SchedulerConfig,
+                               ascend_scheduler_config: dict):
         scheduler_config = asdict(vllm_scheduler_config)
         # Override default values into original SchedulerConfig
         scheduler_config["enable_chunked_prefill"] = False
         scheduler_config["policy"] = "fcfs"
         scheduler_config["num_scheduler_steps"] = 1
-        scheduler_config["scheduler_cls"] = "vllm_ascend.core.scheduler.AscendScheduler"
+        scheduler_config[
+            "scheduler_cls"] = "vllm_ascend.core.scheduler.AscendScheduler"
         # Override params in original SchedulerConfig with params in additional_config.ascend_scheduler_config
         for k, v in ascend_scheduler_config.items():
             scheduler_config[k] = v
         # The "chunked_prefill_enabled" param of vllm's SchedulerConfig can't be initialized.
         scheduler_config.pop("chunked_prefill_enabled")
         return cls(**scheduler_config)

-
     def __post_init__(self) -> None:
         self.chunked_prefill_enabled = self.enable_chunked_prefill
         if self.policy != "fcfs":
-            raise NotImplementedError(f"currently AscendScheduler only supports fcfs policy, got {self.policy}")
+            raise NotImplementedError(
+                f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
+            )
         if self.is_multimodal_model:
-            raise NotImplementedError(f"currently AscendScheduler only supports LLM modles.")
+            raise NotImplementedError(
+                "currently AscendScheduler only supports LLM modles.")
         if self.num_scheduler_steps > 1:
-            raise NotImplementedError(f"currently AscendScheduler doesn't support multi-step.")
+            raise NotImplementedError(
+                "currently AscendScheduler doesn't support multi-step.")
         if self.send_delta_data:
-            raise NotImplementedError(f"currently AscendScheduler doesn't support send_delta_data.")
+            raise NotImplementedError(
+                "currently AscendScheduler doesn't support send_delta_data.")
         if self.delay_factor > 0:
-            raise NotImplementedError(f"currently AscendScheduler doesn't support scheduler_delay_factor.")
+            raise NotImplementedError(
+                "currently AscendScheduler doesn't support scheduler_delay_factor."
+            )
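
Note on the scheduler-config hunk above: initialize_from_config builds the Ascend config by dumping the stock SchedulerConfig with asdict(), forcing the Ascend defaults, and then letting keys from additional_config.ascend_scheduler_config override both. A minimal standalone sketch of that merge order follows; BaseConfig and the max_num_seqs key are illustrative assumptions, not code from this commit.

# Sketch only: the same dump-then-override pattern as initialize_from_config,
# reduced to a local dataclass so it runs without vLLM. Later update() calls win.
from dataclasses import asdict, dataclass


@dataclass
class BaseConfig:  # stand-in for vLLM's SchedulerConfig (fields are assumed)
    enable_chunked_prefill: bool = True
    policy: str = "priority"
    max_num_seqs: int = 256


merged = asdict(BaseConfig())  # stock values as a plain dict
merged.update({"enable_chunked_prefill": False, "policy": "fcfs"})  # Ascend defaults
merged.update({"max_num_seqs": 128})  # user overrides from additional_config
print(merged)  # {'enable_chunked_prefill': False, 'policy': 'fcfs', 'max_num_seqs': 128}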

0 commit comments
