13 changes: 11 additions & 2 deletions .github/workflows/vllm_ascend_test.yaml
@@ -112,7 +112,13 @@ jobs:
pytest -sv tests/singlecard/test_scheduler.py
# guided decoding doesn't work, fix it later
# pytest -sv tests/singlecard/test_guided_decoding.py.py
pytest -sv tests/singlecard/ --ignore=tests/singlecard/test_offline_inference.py --ignore=tests/singlecard/test_scheduler.py --ignore=tests/singlecard/test_guided_decoding.py
# test_ascend_config.py should be run separately because it will regenerate the global config many times.
pytest -sv tests/singlecard/test_ascend_config.py
pytest -sv tests/singlecard/ \
--ignore=tests/singlecard/test_offline_inference.py \
--ignore=tests/singlecard/test_scheduler.py \
--ignore=tests/singlecard/test_guided_decoding.py \
--ignore=tests/singlecard/test_ascend_config.py
else
pytest -sv tests/multicard/test_ilama_lora_tp2.py
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py
@@ -128,11 +134,14 @@ jobs:
# guided decoding doesn't work, fix it later
# pytest -sv tests/singlecard/test_guided_decoding.py.py
pytest -sv tests/singlecard/test_camem.py
# test_ascend_config.py should be run separately because it will regenerate the global config many times.
pytest -sv tests/singlecard/test_ascend_config.py
pytest -sv tests/singlecard/ \
--ignore=tests/singlecard/test_offline_inference.py \
--ignore=tests/singlecard/test_scheduler.py \
--ignore=tests/singlecard/test_guided_decoding.py \
--ignore=tests/singlecard/test_camem.py
--ignore=tests/singlecard/test_camem.py \
--ignore=tests/singlecard/test_ascend_config.py
else
pytest -sv tests/multicard/test_ilama_lora_tp2.py
# Fixme: running VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise an error.
1 change: 1 addition & 0 deletions docs/source/index.md
@@ -46,6 +46,7 @@ faqs
user_guide/suppoted_features
user_guide/supported_models
user_guide/env_vars
user_guide/additional_config
user_guide/release_notes
:::

70 changes: 70 additions & 0 deletions docs/source/user_guide/additional_config.md
@@ -0,0 +1,70 @@
# Additional Configuration

Additional configuration is a mechanism provided by vLLM that allows plugins to control their own internal behavior. vLLM Ascend uses this mechanism to make the project more flexible.

## How to use

Additional configuration can be used in both online and offline modes. Take Qwen3 as an example:

**Online mode**:

```bash
vllm serve Qwen/Qwen3-8B --additional-config='{"config_key":"config_value"}'
```

**Offline mode**:

```python
from vllm import LLM

LLM(model="Qwen/Qwen3-8B", additional_config={"config_key":"config_value"})
```

### Configuration options

The following table lists the additional configuration options available in vLLM Ascend:

| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
| `expert_tensor_parallel_size` | str | `1` | The expert tensor parallel size for the model to use. |

The details of each config option are as follows:

**torchair_graph_config**

| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `enabled` | bool | `False` | Whether to enable torchair graph mode |
| `use_cached_graph` | bool | `False` | Whether to use cached graph |
| `graph_batch_sizes` | list[int] | `[]` | The batch sizes for the torchair graph cache |
| `graph_batch_sizes_init` | bool | `False` | Initialize the graph batch sizes dynamically if `graph_batch_sizes` is empty |
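
As a minimal offline-mode sketch (not taken from the upstream examples), torchair graph mode can be enabled through `additional_config`. The model name below is a placeholder: per the current limitation noted in the tests, torchair graph mode only works with DeepSeek models.

```python
from vllm import LLM

# Minimal sketch: enable torchair graph mode via additional_config.
# The model name is a placeholder; torchair graph mode currently only
# works with DeepSeek-family models.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder DeepSeek checkpoint
    additional_config={
        "torchair_graph_config": {
            "enabled": True,
            "use_cached_graph": True,
            "graph_batch_sizes": [1, 2, 4, 8],
        },
    },
)
```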

**ascend_scheduler_config**

| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `enabled` | bool | `False` | Whether to enable the ascend scheduler for the V1 engine |

`ascend_scheduler_config` also supports options from the [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: true` to `ascend_scheduler_config` as well.

### Example

A full example of additional configuration is as follows:

```json
{
    "torchair_graph_config": {
        "enabled": true,
        "use_cached_graph": true,
        "graph_batch_sizes": [1, 2, 4, 8],
        "graph_batch_sizes_init": false
    },
    "ascend_scheduler_config": {
        "enabled": true,
        "enable_chunked_prefill": true
    },
    "expert_tensor_parallel_size": 1
}
```
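
As an offline-mode sketch of a similar configuration (the model name is only an example, and torchair graph mode is omitted here because it is limited to DeepSeek models), the same options can be passed through `additional_config`:

```python
from vllm import LLM

# Sketch: pass an additional_config similar to the full example above,
# enabling the ascend scheduler and forwarding a vLLM scheduler option.
llm = LLM(
    model="Qwen/Qwen3-8B",
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            "enable_chunked_prefill": True,
        },
        "expert_tensor_parallel_size": 1,
    },
)
```
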
4 changes: 3 additions & 1 deletion examples/dp_offline/data_parallel.py
@@ -62,7 +62,9 @@ def main():
max_num_seqs=num_seqs,
additional_config={
'expert_tensor_parallel_size': etp_size,
'enable_graph_mode': False,
'torchair_graph_config': {
'enabled': False,
},
})

outputs = llm.generate(prompts, sampling_params)
10 changes: 5 additions & 5 deletions tests/long_term/spec_decode/e2e/conftest.py
@@ -167,17 +167,17 @@ def run_equality_correctness_test(

# TODO: the current torchair graph mode needs the torchair cache to be cleaned;
# if it is not cleaned, an error will be raised
additional_config = common_llm_kwargs.get("additional_config")
enable_graph_mode = additional_config.get(
"enable_graph_mode") if additional_config else False
torchair_graph_enabled = common_llm_kwargs.get(
"additional_config", {}).get("torchair_graph_config",
{}).get("enabled", False)

with vllm_runner(**org_args) as vllm_model:
if enable_graph_mode:
if torchair_graph_enabled:
_clean_torchair_cache()
org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)

with vllm_runner(**sd_args) as vllm_model:
if enable_graph_mode:
if torchair_graph_enabled:
_clean_torchair_cache()
if ensure_all_accepted or expected_acceptance_rate is not None:
# Force log interval to be 0 to catch all metrics.
8 changes: 6 additions & 2 deletions tests/long_term/spec_decode/e2e/test_mtp_correctness.py
@@ -218,7 +218,9 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
"common_llm_kwargs",
[{
"additional_config": {
'enable_graph_mode': True,
'torchair_graph_config': {
"enabled": True,
},
},

# Print spec metrics.
@@ -262,7 +264,9 @@ def test_mtp_e2e_greedy_correctness_torchair_graph(
"common_llm_kwargs",
[{
"additional_config": {
'enable_graph_mode': True,
'torchair_graph_config': {
"enabled": True,
},
},

# Print spec metrics.
5 changes: 0 additions & 5 deletions tests/multicard/test_dynamic_npugraph_batchsize.py
@@ -18,8 +18,6 @@
import torch
from vllm import LLM, SamplingParams

from vllm_ascend.utils import vllm_version_is

MODELS = [
"Qwen/Qwen2.5-0.5B-Instruct",
]
@@ -32,9 +30,6 @@
]


@pytest.mark.skipif(
(vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("max_tokens", [64])
8 changes: 2 additions & 6 deletions tests/multicard/test_offline_inference_distributed.py
@@ -31,9 +31,7 @@

def test_models_distributed_QwQ():
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
"Hello, my name is",
]
dtype = "half"
max_tokens = 5
@@ -48,9 +46,7 @@ def test_models_distributed_QwQ():

def test_models_distributed_DeepSeek():
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
"Hello, my name is",
]
dtype = "half"
max_tokens = 5
@@ -28,16 +28,12 @@

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal
from vllm_ascend.utils import vllm_version_is

MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
reason="aclgraph only support on v1")
@pytest.mark.skipif(
(vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
@@ -88,9 +84,6 @@ def test_models(

@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
reason="aclgraph only support on v1")
@pytest.mark.skipif(
(vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_USE_MODELSCOPE", "True")
118 changes: 118 additions & 0 deletions tests/singlecard/test_ascend_config.py
@@ -0,0 +1,118 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest

from tests.conftest import VllmRunner
from vllm_ascend.ascend_config import clear_ascend_config, get_ascend_config


def _clean_up_ascend_config(func):

def wrapper(*args, **kwargs):
clear_ascend_config()
func(*args, **kwargs)
clear_ascend_config()

return wrapper


@_clean_up_ascend_config
def test_run_without_ascend_config():
with VllmRunner("facebook/opt-125m"):
ascend_config = get_ascend_config()

assert not ascend_config.torchair_graph_config.enabled
assert not ascend_config.torchair_graph_config.use_cached_graph
assert ascend_config.torchair_graph_config.graph_batch_sizes == []
assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
assert not ascend_config.ascend_scheduler_config.enabled
assert ascend_config.expert_tensor_parallel_size == 1


@_clean_up_ascend_config
def test_run_with_ascend_config():
input_additional_config = {
"torchair_graph_config": {
# torchair graph mode only works with deepseek. The e2e test should be
# added to the multicard tests with deepseek models.
"enabled": False,
"use_cached_graph": True,
"graph_batch_sizes": [1, 2, 4, 8],
"graph_batch_sizes_init": False,
},
"ascend_scheduler_config": {
"enabled": True,
"enable_chunked_prefill": True,
},
"expert_tensor_parallel_size": 1
}
with VllmRunner("facebook/opt-125m",
additional_config=input_additional_config):
ascend_config = get_ascend_config()

assert not ascend_config.torchair_graph_config.enabled
assert ascend_config.torchair_graph_config.use_cached_graph
assert ascend_config.torchair_graph_config.graph_batch_sizes == [
1, 2, 4, 8
]
assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
assert ascend_config.ascend_scheduler_config.enabled
assert ascend_config.ascend_scheduler_config.enable_chunked_prefill
assert ascend_config.expert_tensor_parallel_size == 1


@_clean_up_ascend_config
def test_ascend_config_init_error():
# ascend_config should be initialized first
with pytest.raises(RuntimeError):
_ = get_ascend_config()


@_clean_up_ascend_config
def test_ascend_config_load_error():
# graph_batch_sizes should be a list.
with pytest.raises(TypeError):
input_additional_config_fake_1 = {
"torchair_graph_config": {
"graph_batch_sizes": "fake_size",
},
}
with VllmRunner("facebook/opt-125m",
additional_config=input_additional_config_fake_1):
pass

# graph_batch_sizes_init should not be True when graph_batch_sizes is not empty.
with pytest.raises(ValueError):
input_additional_config_fake_2 = {
"torchair_graph_config": {
"graph_batch_sizes": [1, 2, 4, 8],
"graph_batch_sizes_init": True,
},
}
with VllmRunner("facebook/opt-125m",
additional_config=input_additional_config_fake_2):
pass

# torchair graph only works with deepseek.
with pytest.raises(NotImplementedError):
input_additional_config_fake_2 = {
"torchair_graph_config": {
"enabled": True,
},
}
with VllmRunner("facebook/opt-125m",
additional_config=input_additional_config_fake_2):
pass