Commit c4ff1ab

Merge branch 'main' into main
2 parents: c8cd500 + 5cd5d64

64 files changed, +4378 -519 lines

.github/workflows/vllm_ascend_test.yaml

Lines changed: 13 additions & 3 deletions
@@ -114,14 +114,20 @@ jobs:
           # pytest -sv tests/singlecard/test_guided_decoding.py.py
           # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
           pytest -sv tests/singlecard/test_ascend_config.py
+          pytest -sv tests/singlecard/test_camem.py
           pytest -sv tests/singlecard/ \
             --ignore=tests/singlecard/test_offline_inference.py \
             --ignore=tests/singlecard/test_scheduler.py \
             --ignore=tests/singlecard/test_guided_decoding.py \
-            --ignore=tests/singlecard/test_ascend_config.py
+            --ignore=tests/singlecard/test_ascend_config.py \
+            --ignore=tests/singlecard/test_camem.py
         else
           pytest -sv tests/multicard/test_ilama_lora_tp2.py
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py
+          # To avoid oom, we need to run the test in a single process.
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
         fi

       - name: Run vllm-project/vllm-ascend test on V0 engine
@@ -136,16 +142,20 @@ jobs:
           pytest -sv tests/singlecard/test_camem.py
           # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
           pytest -sv tests/singlecard/test_ascend_config.py
+          pytest -sv tests/singlecard/test_prompt_embedding.py
           pytest -sv tests/singlecard/ \
             --ignore=tests/singlecard/test_offline_inference.py \
             --ignore=tests/singlecard/test_scheduler.py \
             --ignore=tests/singlecard/test_guided_decoding.py \
             --ignore=tests/singlecard/test_camem.py \
-            --ignore=tests/singlecard/test_ascend_config.py
+            --ignore=tests/singlecard/test_ascend_config.py \
+            --ignore=tests/singlecard/test_prompt_embedding.py
         else
           pytest -sv tests/multicard/test_ilama_lora_tp2.py
           # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
+          # To avoid oom, we need to run the test in a single process.
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
         fi

docs/source/community/contributors.md

Lines changed: 8 additions & 0 deletions
@@ -1,3 +1,11 @@
+# Maintainers
+
+| Name | Github ID | Date |
+|:-----------:|:-----:|:-----:|
+| Xiyuan Wang| [@wangxiyuan](https://github.com/wangxiyuan) | 2025/01 |
+| Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 |
+| Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 |
+
 # Contributors

 vLLM Ascend every release would not have been possible without the following contributors:

docs/source/community/governance.md

Lines changed: 17 additions & 0 deletions
@@ -29,3 +29,20 @@ vLLM Ascend is an open-source project under the vLLM community, where the author
 Requires approval from existing Maintainers. The vLLM community has the final decision-making authority.

 Maintainer will be empowered [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) Github repo write permissions (`Can read, clone, and push to this repository. Can also manage issues and pull requests`).
+
+## Nominating and Removing Maintainers
+
+### The Principles
+
+- Membership in vLLM Ascend is given to individuals on a merit basis after they have demonstrated strong expertise in vLLM / vLLM Ascend through contributions, reviews and discussions.
+
+- For membership in the maintainer group, the individual has to demonstrate strong and continued alignment with the overall vLLM / vLLM Ascend principles.
+
+- There are light criteria for moving module maintainers to 'emeritus' status if they do not actively participate over long periods of time.
+
+- The membership is for an individual, not a company.
+
+### Nomination and Removal
+
+- Nomination: Anyone can nominate someone to become a maintainer (including self-nomination). All existing maintainers are responsible for evaluating the nomination. The nominator should provide information on the strength of the candidate as a maintainer, including but not limited to review quality, contribution quality and community involvement.
+- Removal: Anyone can nominate a person to be removed from the maintainer position (including self-nomination). All existing maintainers are responsible for evaluating the nomination. The nominator should provide information on the nominee, including but not limited to lack of activity, conflict with the overall direction and other information that makes them unfit to be a maintainer.

docs/source/conf.py

Lines changed: 4 additions & 4 deletions
@@ -64,15 +64,15 @@
     # the branch of vllm, used in vllm clone
     # - main branch: 'main'
     # - vX.Y.Z branch: 'vX.Y.Z'
-    'vllm_version': 'v0.8.5.post1',
+    'vllm_version': 'v0.9.0',
     # the branch of vllm-ascend, used in vllm-ascend clone and image tag
     # - main branch: 'main'
     # - vX.Y.Z branch: latest vllm-ascend release tag
-    'vllm_ascend_version': 'v0.8.5rc1',
+    'vllm_ascend_version': 'v0.9.0rc1',
     # the newest release version of vllm-ascend and matched vLLM, used in pip install.
     # This value should be updated when cut down release.
-    'pip_vllm_ascend_version': "0.8.5rc1",
-    'pip_vllm_version': "0.8.5.post1",
+    'pip_vllm_ascend_version': "0.9.0rc1",
+    'pip_vllm_version': "0.9.0",
     # CANN image tag
     'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10",
 }

docs/source/developer_guide/versioning_policy.md

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:

 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo |
 |-------------|--------------|------------------|-------------|--------------------|--------------|
+| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
 | v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
 | v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | |
 | v0.7.3.post1| v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | 2.0rc1 |

@@ -33,6 +34,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:

 | Date | Event |
 |------------|-------------------------------------------|
+| 2025.06.09 | Release candidates, v0.9.0rc1 |
 | 2025.05.29 | v0.7.x post release, v0.7.3.post1 |
 | 2025.05.08 | v0.7.x Final release, v0.7.3 |
 | 2025.05.06 | Release candidates, v0.8.5rc1 |

docs/source/faqs.md

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 ## Version Specific FAQs

 - [[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1007)
-- [[v0.8.5rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/754)
+- [[v0.9.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115)

 ## General FAQs

docs/source/index.md

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ user_guide/suppoted_features
 user_guide/supported_models
 user_guide/env_vars
 user_guide/additional_config
+user_guide/graph_mode.md
 user_guide/release_notes
 :::

docs/source/user_guide/additional_config.md

Lines changed: 10 additions & 6 deletions
@@ -24,11 +24,13 @@ LLM(model="Qwen/Qwen3-8B", additional_config={"config_key":"config_value"})

 The following table lists the additional configuration options available in vLLM Ascend:

-| Name | Type | Default | Description |
-| ---- | ---- | ------- | ----------- |
-| `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
-| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
-| `expert_tensor_parallel_size` | str | `1` | Expert tensor parallel size the model to use. |
+| Name | Type | Default | Description |
+|-------------------------------| ---- |------|-----------------------------------------------------------------------------------------------|
+| `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
+| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
+| `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. |
+| `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf case. |
+| `expert_map_path` | str | None | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |

 The details of each config option are as follows:

@@ -37,6 +39,7 @@ The details of each config option are as follows:
 | Name | Type | Default | Description |
 | ---- | ---- | ------- | ----------- |
 | `enabled` | bool | `False` | Whether to enable torchair graph mode |
+| `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization |
 | `use_cached_graph` | bool | `False` | Whether to use cached graph |
 | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
 | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |

@@ -69,6 +72,7 @@ A full example of additional configuration is as follows:
         "enabled": true,
         "chunked_prefill_enabled": true,
     },
-    "expert_tensor_parallel_size": 1
+    "expert_tensor_parallel_size": 1,
+    "refresh": false,
 }
 ```
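For orientation, here is a minimal offline sketch (not part of the diff above) of how the options documented in this file could be passed through `additional_config`, reusing the `Qwen/Qwen3-8B` example from the same page; the specific values are illustrative only.

```python
# Minimal sketch only: passing the documented options to an offline LLM instance.
# Assumes Ascend hardware with vllm-ascend installed and the Qwen/Qwen3-8B weights reachable.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen3-8B",
    additional_config={
        "torchair_graph_config": {"enabled": False},   # options from the torchair table above
        "ascend_scheduler_config": {"enabled": True},  # enable the ascend scheduler
        "expert_tensor_parallel_size": 1,
        "refresh": False,                              # refresh global ascend config (RLHF use case)
    },
)
print(llm.generate("Hello, how are you?"))
```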
docs/source/user_guide/graph_mode.md

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+# Graph Mode Guide
+
+
+This feature is currently experimental. In future versions, there may be behavioral changes around configuration, coverage, and performance.
+
+This guide provides instructions for using Ascend graph mode with vLLM Ascend. Please note that graph mode is only available with the V1 Engine, and only the Qwen and DeepSeek series models are well tested in 0.9.0rc1. We'll make it stable and more general in the next release.
+
+## Getting Started
+
+From v0.9.0rc1 with the V1 Engine, vLLM Ascend runs models in graph mode by default to keep the same behavior as vLLM. If you hit any issues, please feel free to open an issue on GitHub and fall back to eager mode temporarily by setting `enforce_eager=True` when initializing the model.
+
+There are two kinds of graph mode supported by vLLM Ascend:
+- **ACLGraph**: This is the default graph mode supported by vLLM Ascend. In v0.9.0rc1, only Qwen series models are well tested.
+- **TorchAirGraph**: This is the GE graph mode. In v0.9.0rc1, only DeepSeek series models are supported.
+
+## Using ACLGraph
+ACLGraph is enabled by default. Taking Qwen series models as an example, enabling the V1 Engine is enough.
+
+offline example:
+
+```python
+import os
+
+from vllm import LLM
+
+os.environ["VLLM_USE_V1"] = "1"
+
+model = LLM(model="Qwen/Qwen2-7B-Instruct")
+outputs = model.generate("Hello, how are you?")
+```
+
+online example:
+
+```shell
+vllm serve Qwen/Qwen2-7B-Instruct
+```
+
+## Using TorchAirGraph
+
+If you want to run DeepSeek series models with graph mode, you should use [TorchAirGraph](https://www.hiascend.com/document/detail/zh/Pytorch/700/modthirdparty/torchairuseguide/torchair_0002.html). In this case, additional config is required.
+
+offline example:
+
+```python
+import os
+from vllm import LLM
+
+os.environ["VLLM_USE_V1"] = "1"
+
+model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}})
+outputs = model.generate("Hello, how are you?")
+```
+
+online example:
+
+```shell
+vllm serve deepseek-ai/DeepSeek-R1-0528 --additional-config='{"torchair_graph_config": {"enabled": true}}'
+```
+
+You can find more details about additional config [here](./additional_config.md).
+
+## Fallback to Eager Mode
+
+If both `ACLGraph` and `TorchAirGraph` fail to run, you should fall back to eager mode.
+
+offline example:
+
+```python
+import os
+from vllm import LLM
+
+os.environ["VLLM_USE_V1"] = "1"
+
+model = LLM(model="someother_model_weight", enforce_eager=True)
+outputs = model.generate("Hello, how are you?")
+```
+
+online example:
+
+```shell
+vllm serve Qwen/Qwen2-7B-Instruct --enforce-eager
+```

docs/source/user_guide/release_notes.md

Lines changed: 38 additions & 0 deletions
@@ -1,5 +1,43 @@
 # Release note

+## v0.9.0rc1 - 2025.06.09
+
+This is the first release candidate of v0.9.0 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this release, the V1 Engine is recommended. The code of the V0 Engine is frozen and will no longer be maintained. Please set the environment variable `VLLM_USE_V1=1` to enable the V1 Engine.
+
+### Highlights
+
+- DeepSeek works with graph mode now. Follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/user_guide/graph_mode.html) to give it a try. [#789](https://github.com/vllm-project/vllm-ascend/pull/789)
+- Qwen series models work with graph mode now. It is enabled by default with the V1 Engine. Please note that in this release, only Qwen series models are well tested with graph mode. We'll make it stable and more general in the next release. If you hit any issues, please feel free to open an issue on GitHub and fall back to eager mode temporarily by setting `enforce_eager=True` when initializing the model.
+
+### Core
+
+- The performance of the multi-step scheduler has been improved. Thanks to the contribution from China Merchants Bank. [#814](https://github.com/vllm-project/vllm-ascend/pull/814)
+- LoRA, Multi-LoRA and dynamic serving are supported with the V1 Engine now. Thanks to the contribution from China Merchants Bank. [#893](https://github.com/vllm-project/vllm-ascend/pull/893)
+- The prefix cache and chunked prefill features work now. [#782](https://github.com/vllm-project/vllm-ascend/pull/782) [#844](https://github.com/vllm-project/vllm-ascend/pull/844)
+- Spec decode and MTP features work with the V1 Engine now. [#874](https://github.com/vllm-project/vllm-ascend/pull/874) [#890](https://github.com/vllm-project/vllm-ascend/pull/890)
+- The DP (data parallel) feature works with DeepSeek now. [#1012](https://github.com/vllm-project/vllm-ascend/pull/1012)
+- The input embedding feature works with the V0 Engine now. [#916](https://github.com/vllm-project/vllm-ascend/pull/916)
+- The sleep mode feature works with the V1 Engine now. [#1084](https://github.com/vllm-project/vllm-ascend/pull/1084)
+
+### Model
+
+- Qwen2.5 VL works with the V1 Engine now. [#736](https://github.com/vllm-project/vllm-ascend/pull/736)
+- Llama4 works now. [#740](https://github.com/vllm-project/vllm-ascend/pull/740)
+- Support for dual-batch overlap (DBO) with DeepSeek is added. Please set `VLLM_ASCEND_ENABLE_DBO=1` to use it. [#941](https://github.com/vllm-project/vllm-ascend/pull/941)
+
+### Other
+
+- Online serving with Ascend quantization works now. [#877](https://github.com/vllm-project/vllm-ascend/pull/877)
+- A batch of bugs for graph mode and MoE models have been fixed. [#773](https://github.com/vllm-project/vllm-ascend/pull/773) [#771](https://github.com/vllm-project/vllm-ascend/pull/771) [#774](https://github.com/vllm-project/vllm-ascend/pull/774) [#816](https://github.com/vllm-project/vllm-ascend/pull/816) [#817](https://github.com/vllm-project/vllm-ascend/pull/817) [#819](https://github.com/vllm-project/vllm-ascend/pull/819) [#912](https://github.com/vllm-project/vllm-ascend/pull/912) [#897](https://github.com/vllm-project/vllm-ascend/pull/897) [#961](https://github.com/vllm-project/vllm-ascend/pull/961) [#958](https://github.com/vllm-project/vllm-ascend/pull/958) [#913](https://github.com/vllm-project/vllm-ascend/pull/913) [#905](https://github.com/vllm-project/vllm-ascend/pull/905)
+- A batch of performance improvement PRs have been merged. [#784](https://github.com/vllm-project/vllm-ascend/pull/784) [#803](https://github.com/vllm-project/vllm-ascend/pull/803) [#966](https://github.com/vllm-project/vllm-ascend/pull/966) [#839](https://github.com/vllm-project/vllm-ascend/pull/839) [#970](https://github.com/vllm-project/vllm-ascend/pull/970) [#947](https://github.com/vllm-project/vllm-ascend/pull/947) [#987](https://github.com/vllm-project/vllm-ascend/pull/987) [#1085](https://github.com/vllm-project/vllm-ascend/pull/1085)
+- From this release, a binary wheel package will be released as well. [#775](https://github.com/vllm-project/vllm-ascend/pull/775)
+- The contributors doc page has been [added](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html).
+
+### Known Issues
+
+- In some cases, the vLLM process may crash with ACLGraph enabled. We're working on this issue and it will be fixed in the next release.
+- Multi-node data parallel doesn't work with this release. This is a known issue in vLLM and has been fixed on the main branch. [#18981](https://github.com/vllm-project/vllm/pull/18981)
+
 ## v0.7.3.post1 - 2025.05.29

 This is the first post release of 0.7.3. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey. It includes the following changes:
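As a quick orientation for the v0.9.0rc1 notes above, here is a minimal offline sketch (not part of the commit) that opts into the V1 Engine and the DeepSeek dual-batch overlap switch mentioned in the release notes; the checkpoint name is borrowed from the graph mode guide and the exact setup is illustrative only.

```python
# Minimal sketch only: enable the V1 Engine and DBO as described in the release notes above.
# Assumes Ascend hardware, vllm-ascend v0.9.0rc1, and enough devices to host the DeepSeek checkpoint.
import os

os.environ["VLLM_USE_V1"] = "1"              # V1 Engine is recommended from this release
os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"   # opt into dual-batch overlap for DeepSeek

from vllm import LLM  # import after setting the flags so they are guaranteed to be seen

llm = LLM(model="deepseek-ai/DeepSeek-R1-0528")
outputs = llm.generate("Hello, how are you?")
print(outputs)
```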
