4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -86,8 +86,8 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
locale_dirs = ['locale/']
gettext_compact = False
locale_dirs = ['locale/']
gettext_compact = False
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
4 changes: 2 additions & 2 deletions docs/source/tutorials/large_scale_ep.md
@@ -345,7 +345,7 @@ for process in processes:

:::::

Note that the prefiller nodes and the decoder nodes may have differenet configurations. In this example, each prefiller node deployed as master node independently, but all decoder nodes take the first node as the master node. So it leads to differents in 'dp_size_local' and 'dp_rank_start'
Note that the prefiller nodes and the decoder nodes may have different configurations. In this example, each prefiller node is deployed independently as its own master node, while all decoder nodes take the first node as the master node. This leads to differences in 'dp_size_local' and 'dp_rank_start', as illustrated in the sketch below.
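
As a rough illustration only (the node names and numbers below are hypothetical and not taken from the tutorial's launch script), the two parameters could look like this for prefiller nodes that are each their own master and decoder nodes that share the first decoder node as master:

```python
# Hypothetical sketch: how 'dp_size_local' and 'dp_rank_start' might differ
# between prefiller and decoder nodes, assuming 8 DP ranks per node.

# Each prefiller node is deployed as its own master, so its DP group is
# purely local and its rank offset always starts at 0.
prefiller_nodes = [
    {"node": "prefiller-0", "dp_size_local": 8, "dp_rank_start": 0},
    {"node": "prefiller-1", "dp_size_local": 8, "dp_rank_start": 0},
]

# All decoder nodes join a single DP group whose master is the first decoder
# node, so each node's ranks are offsets into that shared group.
decoder_nodes = [
    {"node": "decoder-0", "dp_size_local": 8, "dp_rank_start": 0},
    {"node": "decoder-1", "dp_size_local": 8, "dp_rank_start": 8},
]

for cfg in prefiller_nodes + decoder_nodes:
    print(cfg)
```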

## Example proxy for Distributed DP Server

@@ -395,7 +395,7 @@ python load_balance_proxy_server_example.py \

You can get the proxy program in the repository's examples, [load\_balance\_proxy\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/v0.9.1-dev/examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py)

## Benckmark
## Benchmark

We recommend using the [aisbench](https://gitee.com/aisbench/benchmark) tool to assess performance. Execute the following commands to install aisbench:

2 changes: 2 additions & 0 deletions tests/conftest.py
@@ -35,6 +35,7 @@

from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs)
from vllm_ascend.ascend_config import clear_ascend_config
# TODO: remove this part after the patch is merged into vllm. If we do not
# explicitly patch here, some of the patches might be ineffective
# in the pytest scenario
@@ -348,6 +349,7 @@ def __enter__(self):

def __exit__(self, exc_type, exc_value, traceback):
del self.model
clear_ascend_config()
cleanup_dist_env_and_memory()


4 changes: 4 additions & 0 deletions tests/multicard/test_model_qwen3_w4a8.py
@@ -25,6 +25,8 @@
from modelscope import snapshot_download # type: ignore
from vllm import LLM, SamplingParams

from vllm_ascend.ascend_config import clear_ascend_config

MODELS = ["vllm-ascend/Qwen3-8B-W4A8"]
PROMPTS = [
"Hello, my name is",
@@ -38,6 +40,7 @@
@pytest.mark.parametrize("max_tokens", [16])
def test_qwen3_model_with_w4a8_linear_method(model: str,
max_tokens: int) -> None:
clear_ascend_config()
messages = [[{"role": "user", "content": prompt}] for prompt in PROMPTS]
sampling_params = SamplingParams(
max_tokens=max_tokens,
@@ -63,3 +66,4 @@ def test_qwen3_model_with_w4a8_linear_method(model: str,
for vllm_output, golden_output in zip(vllm_outputs, golden_outputs):
assert vllm_output.outputs[0].text == golden_output
print(f"Generated text: {vllm_output.outputs[0].text!r}")
clear_ascend_config()
5 changes: 3 additions & 2 deletions tests/singlecard/core/test_ascend_scheduler.py
@@ -22,7 +22,7 @@


def create_scheduler(
model: str = "Qwen/Qwen2.5-0.5B-Instruct",
model: str = "deepseek-ai/DeepSeek-V2-Lite",
max_num_seqs: int = 16,
max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None,
@@ -60,6 +60,7 @@ def create_scheduler(
)
model_config = ModelConfig(
model=model,
enforce_eager=True,
task="auto",
tokenizer=model,
tokenizer_mode="auto",
@@ -227,7 +228,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):

"""
scheduler = create_scheduler(
model="facebook/opt-125m",
model="deepseek-ai/DeepSeek-V2-Lite",
max_num_batched_tokens=1024,
long_prefill_token_threshold=400,
enable_prefix_caching=enable_prefix_caching,
7 changes: 5 additions & 2 deletions tests/singlecard/test_ascend_config.py
@@ -41,7 +41,9 @@ def test_run_without_ascend_config():
assert not ascend_config.torchair_graph_config.use_cached_graph
assert ascend_config.torchair_graph_config.graph_batch_sizes == []
assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
assert not ascend_config.ascend_scheduler_config.enabled
# Non-MLA LLMs forcibly disable the chunked prefill feature
# and use AscendScheduler
assert ascend_config.ascend_scheduler_config.enabled


@_clean_up_ascend_config
Expand Down Expand Up @@ -81,7 +83,8 @@ def test_run_with_ascend_config():
assert not ascend_config.torchair_graph_config.enable_multistream_moe
assert not ascend_config.torchair_graph_config.enable_view_optimize
assert ascend_config.ascend_scheduler_config.enabled
assert ascend_config.ascend_scheduler_config.enable_chunked_prefill
# Non-MLA LLMs forcibly disable the chunked prefill feature
assert not ascend_config.ascend_scheduler_config.enable_chunked_prefill


@_clean_up_ascend_config
28 changes: 28 additions & 0 deletions vllm_ascend/platform.py
@@ -127,6 +127,34 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
model_config = vllm_config.model_config
parallel_config = vllm_config.parallel_config
cache_config = vllm_config.cache_config
decoding_config = vllm_config.decoding_config
scheduler_config = vllm_config.scheduler_config
ascend_scheduler_config = ascend_config.ascend_scheduler_config

if model_config is not None and not model_config.use_mla:
logger.info(
"Non-MLA LLMs forcibly disable the chunked prefill feature,"
"as the performance of operators supporting this feature "
"functionality is currently suboptimal.")
if not envs.VLLM_USE_V1:
scheduler_config.enable_chunked_prefill = False
scheduler_config.chunked_prefill_enabled = False
if envs.VLLM_USE_V1 and \
not model_config.is_multimodal_model and \
decoding_config.backend == "auto" and \
not scheduler_config.delay_factor > 0 and \
not scheduler_config.send_delta_data and \
scheduler_config.policy == "fcfs" and \
scheduler_config.num_scheduler_steps == 1:
scheduler_config.enable_chunked_prefill = False
scheduler_config.chunked_prefill_enabled = False
ascend_scheduler_config.enabled = True
if hasattr(ascend_scheduler_config, "enable_chunked_prefill"):
ascend_scheduler_config.enable_chunked_prefill = False
if (scheduler_config.max_num_batched_tokens <
scheduler_config.max_model_len
and not scheduler_config.chunked_prefill_enabled):
scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len

if parallel_config:
if parallel_config.enable_expert_parallel: