[Fix] Fix update_aclgraph_sizes when running MoE models #913
```diff
@@ -119,6 +119,26 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         from vllm.config import CompilationLevel  # noqa: E402
         compilation_config = vllm_config.compilation_config
         model_config = vllm_config.model_config
+        additional_config = vllm_config.additional_config
+        parallel_config = vllm_config.parallel_config
+        cache_config = vllm_config.cache_config
+
+        if parallel_config:
+            # Default value for expert tensor parallel size
+            parallel_config.expert_tensor_parallel_size = parallel_config.tensor_parallel_size
+
+            # NOTE: When enable_expert_parallel is True, we follow vLLM convention:
+            # ep_size = world_size, which means expert_tensor_parallel_size must be 1
+            if (additional_config
+                    and "expert_tensor_parallel_size" in additional_config
+                    and not parallel_config.enable_expert_parallel):
+                parallel_config.expert_tensor_parallel_size = int(
+                    additional_config["expert_tensor_parallel_size"])
+
+            # Calculate expert parallel size based on world size
+            parallel_config.expert_parallel_size = (
+                parallel_config.world_size //
+                parallel_config.expert_tensor_parallel_size)
```
Collaborator: QQ: does this mean the etp size differs from 1 only when ep is disabled and etp is enabled? Then why do we set the ep size here if ep is disabled?
Collaborator (Author): As I mentioned above, this is more of an enhancement.
Comment on lines +139 to +141:
Collaborator: Sorry, I'm still confused. Is this the right reading: when ep is disabled and etp is enabled, the expected etp size is set, but an unexpected ep size is set as well? Or should the ep size be set only when ep is enabled?
Collaborator (Author): We can discuss this in detail later. You are right; perhaps we need to give careful consideration to the configuration logic here to avoid any unwanted confusion.
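For readers following the thread, the resolution logic in the hunk above can be traced with a small standalone sketch. `ToyParallelConfig` and `resolve_expert_sizes` are illustrative stand-ins, not real vLLM APIs; only the field names mirror the diff:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ToyParallelConfig:
    """Illustrative stand-in for vLLM's ParallelConfig (not a real vLLM type)."""
    world_size: int
    tensor_parallel_size: int
    enable_expert_parallel: bool = False
    expert_tensor_parallel_size: int = 1
    expert_parallel_size: int = 1


def resolve_expert_sizes(pc: ToyParallelConfig,
                         additional_config: Optional[dict]) -> None:
    # Default: expert weights are sharded the same way as the dense weights.
    pc.expert_tensor_parallel_size = pc.tensor_parallel_size
    # A user-supplied etp size is honored only while expert parallelism is
    # disabled; with enable_expert_parallel, ep_size = world_size and etp
    # must stay 1, per the vLLM convention noted in the diff.
    if (additional_config
            and "expert_tensor_parallel_size" in additional_config
            and not pc.enable_expert_parallel):
        pc.expert_tensor_parallel_size = int(
            additional_config["expert_tensor_parallel_size"])
    # The ep size falls out of the world size and the etp size.
    pc.expert_parallel_size = (pc.world_size //
                               pc.expert_tensor_parallel_size)


pc = ToyParallelConfig(world_size=16, tensor_parallel_size=8)
resolve_expert_sizes(pc, {"expert_tensor_parallel_size": 4})
print(pc.expert_tensor_parallel_size, pc.expert_parallel_size)  # -> 4 4
```

Note that with `enable_expert_parallel=True` this path still yields etp = tp and ep = world_size // tp rather than ep = world_size, which is exactly the behavior questioned in the thread above.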
```diff
+
         if model_config is None:
             logger.warning("Model config is missing. This may indicate "
@@ -127,9 +147,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         else:
             enforce_eager = getattr(model_config, "enforce_eager", False)
 
-        if vllm_config.additional_config is not None:
-            enable_graph_mode = vllm_config.additional_config.get(
-                "enable_graph_mode", False)
+        if additional_config is not None:
+            enable_graph_mode = additional_config.get("enable_graph_mode",
+                                                      False)
             if enable_graph_mode:
                 if enforce_eager:
                     raise RuntimeError(
```
```diff
@@ -139,7 +159,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                     logger.warning(
                         "NPU graph mode is still experimental and not supported for V1 without mla currently, "
                         "it has been disabled automatically.")
-                    vllm_config.additional_config["enable_graph_mode"] = False
+                    additional_config["enable_graph_mode"] = False
             if model_config:
                 model_type = model_config.hf_config.model_type
                 if "deepseek" not in model_type:
```
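Read together, the two graph-mode hunks amount to roughly the following guard. This is a sketch of the implied control flow, not the actual vllm_ascend implementation; the diff is truncated after the `"deepseek"` check, so the model-type branch is omitted here:

```python
from typing import Optional


def resolve_graph_mode(additional_config: Optional[dict],
                       enforce_eager: bool,
                       is_v1_without_mla: bool) -> bool:
    """Hedged restatement of the guard implied by the hunks above."""
    if not additional_config:
        return False
    if not additional_config.get("enable_graph_mode", False):
        return False
    if enforce_eager:
        # Graph capture cannot be combined with eager-only execution.
        raise RuntimeError("enable_graph_mode conflicts with enforce_eager")
    if is_v1_without_mla:
        # Experimental case: disabled automatically, mirroring the warning.
        additional_config["enable_graph_mode"] = False
        return False
    return True
```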
```diff
@@ -178,7 +198,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 ["vllm.unified_ascend_attention_with_output"])
             update_aclgraph_sizes(vllm_config)
 
-        parallel_config = vllm_config.parallel_config
         if parallel_config and parallel_config.worker_cls == "auto":
             if envs.VLLM_USE_V1:
                 parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
@@ -190,7 +209,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             else:
                 parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
 
-        cache_config = vllm_config.cache_config
         if cache_config:
             if cache_config.block_size is None:
                 cache_config.block_size = 128
```
```diff
@@ -202,11 +220,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         if envs.VLLM_USE_V1:
             # Activate custom ops for v1.
-            vllm_config.compilation_config.custom_ops = ["all"]
+            compilation_config.custom_ops = ["all"]
             # If ascend_scheduler_config exists in additional_config,
             # extents original scheduler_config to use AscendScheduler.
-            additional_config = vllm_config.additional_config
             if additional_config and additional_config.get(
                     "ascend_scheduler_config", None) is not None:
                 additional_scheduler_config = additional_config.get(
```
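For context, here is a hedged usage sketch of how the `additional_config` keys touched by this PR might be supplied. It assumes a vLLM build whose engine arguments accept `additional_config`; the model name, parallel sizes, and inner scheduler keys are placeholders, not values from this PR:

```python
from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model choice
    tensor_parallel_size=8,
    additional_config={
        # Honored only when enable_expert_parallel is off (first hunk).
        "expert_tensor_parallel_size": 4,
        "enable_graph_mode": False,
        # Any non-None value switches scheduling to AscendScheduler.
        "ascend_scheduler_config": {},
    },
)
```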
Collaborator: It seems there is no `expert_tensor_parallel_size` in `ParallelConfig`? Same for `parallel_config.expert_parallel_size`.

Collaborator (Author): vLLM does not support tensor parallelism for experts' weights, so I added this attribute. This change primarily addresses scenarios where the number of devices significantly exceeds the number of experts.
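A back-of-the-envelope illustration of that scenario, with made-up device and expert counts:

```python
num_experts = 64    # experts in one MoE layer (hypothetical)
world_size = 256    # NPU devices (hypothetical)

# Pure expert parallelism sets ep_size = world_size; with fewer experts
# than ranks there is less than one expert per rank.
print(num_experts / world_size)   # 0.25 experts per rank

# With expert tensor parallelism (etp_size = 4): ep_size = 256 // 4 = 64
# expert groups, one expert per group, each expert's weights split 4 ways.
etp_size = 4
ep_size = world_size // etp_size
print(num_experts / ep_size)      # 1.0 expert per group
```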