diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md
index d2d4234d77..df39789429 100644
--- a/docs/source/user_guide/additional_config.md
+++ b/docs/source/user_guide/additional_config.md
@@ -28,7 +28,8 @@ The following table lists the additional configuration options available in vLLM
 | ---- | ---- | ------- | ----------- |
 | `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
 | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
-| `expert_tensor_parallel_size` | str | `1` | Expert tensor parallel size the model to use. |
+| `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size for the model to use. |
+| `refresh` | bool | `false` | Whether to refresh the global ascend config content. This is usually needed in RLHF scenarios. |
 
 The details of each config option are as follows:
 
@@ -40,6 +41,7 @@ The details of each config option are as follows:
 | `use_cached_graph` | bool | `False` | Whether to use cached graph |
 | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
 | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
+| `enable_multistream_shared_expert` | bool | `False` | Whether to enable multistream shared expert |
 
 **ascend_scheduler_config**
 
@@ -59,12 +61,14 @@ A full example of additional configuration is as follows:
         "enabled": true,
         "use_cached_graph": true,
         "graph_batch_sizes": [1, 2, 4, 8],
-        "graph_batch_sizes_init": true
+        "graph_batch_sizes_init": false,
+        "enable_multistream_shared_expert": false
     },
     "ascend_scheduler_config": {
         "enabled": true,
         "chunked_prefill_enabled": true,
     },
-    "expert_tensor_parallel_size": 1
+    "expert_tensor_parallel_size": 1,
+    "refresh": false,
 }
 ```
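A usage sketch for the options documented above follows; it is illustrative rather than part of the patch. It assumes an Ascend node with vllm-ascend installed and that vLLM's offline `LLM` entrypoint forwards `additional_config` to the platform plugin; the model name and the concrete values are placeholders.

```python
# Illustrative only: pass the documented options through `additional_config`
# when constructing the engine. Model name and values are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder, not prescribed by this patch
    enforce_eager=False,
    additional_config={
        "torchair_graph_config": {
            "enabled": False,
            "use_cached_graph": True,
            "graph_batch_sizes": [1, 2, 4, 8],
            "graph_batch_sizes_init": False,
            "enable_multistream_shared_expert": False,
        },
        "ascend_scheduler_config": {
            "enabled": True,
            "chunked_prefill_enabled": True,
        },
        "expert_tensor_parallel_size": 1,
        "refresh": False,
    },
)
outputs = llm.generate(["Hello, Ascend!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```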
diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py
index 4433538cd1..484fe5f702 100644
--- a/tests/singlecard/test_ascend_config.py
+++ b/tests/singlecard/test_ascend_config.py
@@ -13,10 +13,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+
 import pytest
 
 from tests.conftest import VllmRunner
-from vllm_ascend.ascend_config import clear_ascend_config, get_ascend_config
+from vllm_ascend.ascend_config import (clear_ascend_config, get_ascend_config,
+                                       init_ascend_config)
 
 
 def _clean_up_ascend_config(func):
@@ -39,12 +42,15 @@ def test_run_without_ascend_config():
         assert ascend_config.torchair_graph_config.graph_batch_sizes == []
         assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
         assert not ascend_config.ascend_scheduler_config.enabled
-        assert ascend_config.expert_tensor_parallel_size == 1
+        assert ascend_config.expert_tensor_parallel_size == 0
 
 
 @_clean_up_ascend_config
 def test_run_with_ascend_config():
-    input_additional_config = {
+    if os.getenv("VLLM_USE_V1") == "0":
+        pytest.skip("graph only works on v1")
+
+    input_additional_config_1 = {
         "torchair_graph_config": {
             # torchair graph only works with deepseek. The e2e test should be added
             # in multicard test with deepseek models.
@@ -52,6 +58,7 @@ def test_run_with_ascend_config():
             "use_cached_graph": True,
             "graph_batch_sizes": [1, 2, 4, 8],
             "graph_batch_sizes_init": False,
+            "enable_multistream_shared_expert": True,
         },
         "ascend_scheduler_config": {
             "enabled": True,
@@ -59,8 +66,11 @@ def test_run_with_ascend_config():
         },
         "expert_tensor_parallel_size": 1
     }
+
+    # check passed with eager mode
     with VllmRunner("facebook/opt-125m",
-                    additional_config=input_additional_config):
+                    enforce_eager=True,
+                    additional_config=input_additional_config_1):
         ascend_config = get_ascend_config()
 
         assert not ascend_config.torchair_graph_config.enabled
@@ -69,6 +79,7 @@ def test_run_with_ascend_config():
             1, 2, 4, 8
         ]
         assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
+        assert ascend_config.torchair_graph_config.enable_multistream_shared_expert
         assert ascend_config.ascend_scheduler_config.enabled
         assert ascend_config.ascend_scheduler_config.enable_chunked_prefill
         assert ascend_config.expert_tensor_parallel_size == 1
@@ -83,6 +94,8 @@ def test_ascend_config_init_error():
 
 @_clean_up_ascend_config
 def test_ascend_config_load_error():
+    if os.getenv("VLLM_USE_V1") == "0":
+        pytest.skip("graph only works on v1")
     # graph_batch_sizes should be list.
     with pytest.raises(TypeError):
         input_additional_config_fake_1 = {
@@ -117,3 +130,60 @@ def test_ascend_config_load_error():
                         enforce_eager=False,
                         additional_config=input_additional_config_fake_2):
             pass
+
+    # torchair graph should not be enabled with eager mode
+    with pytest.raises(RuntimeError):
+        input_additional_config_fake_3 = {
+            "torchair_graph_config": {
+                "enabled": True,
+            },
+        }
+        with VllmRunner("facebook/opt-125m",
+                        enforce_eager=True,
+                        additional_config=input_additional_config_fake_3):
+            pass
+
+
+@_clean_up_ascend_config
+def test_check_ascend_config_v0():
+    if os.getenv("VLLM_USE_V1") == "1":
+        pytest.skip("graph only works on v1, this is the test for v0")
+    with pytest.raises(NotImplementedError):
+        input_additional_config_fake_1 = {
+            "torchair_graph_config": {
+                "enabled": True,
+            },
+        }
+        with VllmRunner("facebook/opt-125m",
+                        additional_config=input_additional_config_fake_1):
+            pass
+
+
+@_clean_up_ascend_config
+def test_ascend_config_refresh():
+    from vllm.config import get_current_vllm_config
+    vllm_config = get_current_vllm_config()
+    # set additional_config with none
+    init_ascend_config(vllm_config)
+
+    input_additional_config = {
+        "torchair_graph_config": {
+            "enabled": False,
+            "use_cached_graph": True,
+            "graph_batch_sizes": [1, 2, 4, 8],
+            "graph_batch_sizes_init": False,
+        },
+        "refresh": True,
+    }
+
+    # refresh ascend config
+    with VllmRunner("facebook/opt-125m",
+                    additional_config=input_additional_config):
+        ascend_config = get_ascend_config()
+
+        assert not ascend_config.torchair_graph_config.enabled
+        assert ascend_config.torchair_graph_config.use_cached_graph
+        assert ascend_config.torchair_graph_config.graph_batch_sizes == [
+            1, 2, 4, 8
+        ]
+        assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
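The new `test_ascend_config_refresh` above exercises the `refresh` flag through the full engine. The minimal sketch below shows the same semantics in isolation; a `SimpleNamespace` stands in for `VllmConfig` only because the patched helpers read nothing but its `additional_config` attribute, which is an assumption of this sketch rather than a supported API.

```python
# Minimal sketch of the refresh semantics (assumption: only `additional_config`
# is read from the config object by init_ascend_config/AscendConfig).
from types import SimpleNamespace

from vllm_ascend.ascend_config import (clear_ascend_config, get_ascend_config,
                                       init_ascend_config)

clear_ascend_config()
first = init_ascend_config(SimpleNamespace(additional_config=None))
cached = init_ascend_config(
    SimpleNamespace(additional_config={"expert_tensor_parallel_size": 4}))
assert cached is first  # without "refresh": true the cached config is returned

refreshed = init_ascend_config(
    SimpleNamespace(additional_config={
        "expert_tensor_parallel_size": 4,
        "refresh": True,
    }))
assert refreshed is get_ascend_config()  # refresh rebuilds the global config
assert refreshed.expert_tensor_parallel_size == 4
clear_ascend_config()
```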
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index 2e7d744408..41ebbde9fd 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -37,7 +37,7 @@ def __init__(self, vllm_config):
             ascend_scheduler_config)
 
         self.expert_tensor_parallel_size = int(
-            additional_config.get("expert_tensor_parallel_size", 1))
+            additional_config.get("expert_tensor_parallel_size", 0))
 
 
 class TorchairGraphConfig:
@@ -82,8 +82,11 @@ def __init__(self, ascend_scheduler_config: dict):
 
 
 def init_ascend_config(vllm_config):
+    additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
+    refresh = additional_config.get("refresh",
+                                    False) if additional_config else False
     global _ASCEND_CONFIG
-    if _ASCEND_CONFIG is not None:
+    if _ASCEND_CONFIG is not None and not refresh:
         return _ASCEND_CONFIG
     _ASCEND_CONFIG = AscendConfig(vllm_config)
     return _ASCEND_CONFIG
@@ -106,35 +109,52 @@ def get_ascend_config():
 
 def check_ascend_config(vllm_config, enforce_eager):
     ascend_config = get_ascend_config()
-    # Both for V0 and V1 Engine, torchair_graph cannot be enabled with eager mode.
-    if ascend_config.torchair_graph_config.enabled and enforce_eager:
-        raise RuntimeError(
-            "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
-        )
-
-    # torchair_graph only work with deepseek model and mla enabled.
-    if ascend_config.torchair_graph_config.enabled:
-        if envs.VLLM_MLA_DISABLE:
-            logger.warning(
-                "Torchair graph mode is still experimental and not supported for V1 without mla currently, "
-                "it has been disabled automatically.")
-            ascend_config.ascend_scheduler_config.enabled = False
-        if vllm_config.model_config:
-            model_type = vllm_config.model_config.hf_config.model_type
-            if "deepseek" not in model_type:
-                raise NotImplementedError(
-                    "Torchair graph mode only works with deepseek model.")
-
-    # for V1 Engine, aclgraph doesn't work with deepseek model and only qwen model is well tested.
-    if envs.VLLM_USE_V1 and vllm_config.model_config is not None and not enforce_eager:
-        model_type = vllm_config.model_config.hf_config.model_type
-        if "deepseek" in model_type:
+    # for v0 engine
+    if not envs.VLLM_USE_V1:
+        if ascend_config.torchair_graph_config.enabled:
+            raise NotImplementedError(
+                "Torchair graph mode is only supported for V1 Engine.")
+        if ascend_config.ascend_scheduler_config.enabled:
             raise NotImplementedError(
-                "ACL Graph does not support deepseek. Please "
-                "try torchair graph mode to serve deepseek models on vllm-ascend."
-                " Or set `enforce_eager=True` to use eager mode.")
-        if "qwen" not in model_type:
-            logger.warning(
-                "ACL Graph is currently experimental. Please "
-                "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
-                " if you encourage any Error")
+                "Ascend scheduler is only supported for V1 Engine.")
+    # for v1 engine
+    else:
+        # for eager mode
+        if enforce_eager:
+            # torchair_graph cannot be enabled with eager mode.
+            if ascend_config.torchair_graph_config.enabled:
+                raise RuntimeError(
+                    "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
+                )
+        # for graph mode
+        else:
+            # torchair_graph case
+            if ascend_config.torchair_graph_config.enabled:
+                # torchair_graph is not supported for V1 without mla currently.
+                if envs.VLLM_MLA_DISABLE:
+                    logger.warning(
+                        "Torchair graph mode is still experimental and not supported for V1 without mla currently, "
+                        "it has been disabled automatically.")
+                    ascend_config.torchair_graph_config.enabled = False
+                # torchair_graph is supported for deepseek model only currently.
+                if vllm_config.model_config:
+                    model_type = vllm_config.model_config.hf_config.model_type
+                    if "deepseek" not in model_type:
+                        raise NotImplementedError(
+                            "Torchair graph mode only works with deepseek model."
+                        )
+            # aclgraph case
+            else:
+                # aclgraph doesn't work with deepseek model and only qwen model is well tested.
+                if vllm_config.model_config:
+                    model_type = vllm_config.model_config.hf_config.model_type
+                    if "deepseek" in model_type:
+                        raise NotImplementedError(
+                            "ACL Graph does not support deepseek. Please "
+                            "try torchair graph mode to serve deepseek models on vllm-ascend."
+                            " Or set `enforce_eager=True` to use eager mode.")
+                    if "qwen" not in model_type:
+                        logger.warning(
+                            "ACL Graph is currently experimental. Please "
+                            "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
+                            " if you encounter any error.")
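To summarize the reworked `check_ascend_config` decision tree: the V0 engine rejects both torchair graph mode and the ascend scheduler; on V1, eager mode rejects torchair graph mode, and graph mode dispatches either to the torchair path (deepseek only, auto-disabled without MLA) or to the ACL graph path (deepseek rejected, only qwen well tested). The sketch below is an illustrative sanity check of two of those branches; it assumes the V1 engine (`VLLM_USE_V1=1`) and reuses the `SimpleNamespace` stand-in, which is not a supported API.

```python
# Illustrative sanity check of the V1 branches (assumes VLLM_USE_V1=1; only
# `model_config.hf_config.model_type` and `additional_config` are read from
# the stand-in config objects).
from types import SimpleNamespace

import pytest

from vllm_ascend.ascend_config import (check_ascend_config, clear_ascend_config,
                                       init_ascend_config)


def fake_vllm_config(model_type, additional_config=None):
    hf_config = SimpleNamespace(model_type=model_type)
    return SimpleNamespace(model_config=SimpleNamespace(hf_config=hf_config),
                           additional_config=additional_config)


clear_ascend_config()
init_ascend_config(
    fake_vllm_config("qwen2", {"torchair_graph_config": {"enabled": True}}))

# torchair graph mode and eager mode are mutually exclusive on V1:
with pytest.raises(RuntimeError):
    check_ascend_config(fake_vllm_config("qwen2"), enforce_eager=True)

# in graph mode, torchair only accepts deepseek models:
with pytest.raises(NotImplementedError):
    check_ascend_config(fake_vllm_config("qwen2"), enforce_eager=False)

clear_ascend_config()
```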
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 647fefbe0e..96d4a9bbb6 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -133,7 +133,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         # NOTE: When enable_expert_parallel is True, we follow vLLM convention:
         # ep_size = world_size, which means expert_tensor_parallel_size must be 1
-        if ascend_config.expert_tensor_parallel_size > 1 and not parallel_config.enable_expert_parallel:
+        if ascend_config.expert_tensor_parallel_size > 0 and not parallel_config.enable_expert_parallel:
             parallel_config.expert_tensor_parallel_size = ascend_config.expert_tensor_parallel_size
 
         # Calculate expert parallel size based on world size
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 07ea679312..269767ffcd 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -323,7 +323,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
 
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled and self.vllm_config.model_config.use_mla
-        self.torchair_graph_use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
+        self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
        self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes
 
         if ascend_config.torchair_graph_config.graph_batch_sizes_init:
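The `platform.py` hunk above relies on the new default of `0` meaning "not set": only an explicitly configured `expert_tensor_parallel_size` is copied into the parallel config, and never when expert parallelism is enabled. Below is a simplified, self-contained sketch of that gate; the dataclass is a stand-in, not vLLM's real `ParallelConfig`.

```python
# Simplified stand-in for the gate in check_and_update_config; FakeParallelConfig
# is not vLLM's ParallelConfig, it only carries the two fields the gate touches.
from dataclasses import dataclass


@dataclass
class FakeParallelConfig:
    enable_expert_parallel: bool = False
    expert_tensor_parallel_size: int = 1


def apply_expert_tp(parallel_config: FakeParallelConfig, configured_etp: int) -> None:
    # Mirrors: `if ascend_config.expert_tensor_parallel_size > 0
    #           and not parallel_config.enable_expert_parallel`
    if configured_etp > 0 and not parallel_config.enable_expert_parallel:
        parallel_config.expert_tensor_parallel_size = configured_etp


cfg = FakeParallelConfig()
apply_expert_tp(cfg, 0)  # new default 0 -> "not set", nothing is overridden
assert cfg.expert_tensor_parallel_size == 1
apply_expert_tp(cfg, 4)  # an explicit value from additional_config is honoured
assert cfg.expert_tensor_parallel_size == 4

ep_cfg = FakeParallelConfig(enable_expert_parallel=True)
apply_expert_tp(ep_cfg, 4)  # with expert parallel enabled, etp must stay 1
assert ep_cfg.expert_tensor_parallel_size == 1
```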