-
Notifications
You must be signed in to change notification settings - Fork 544
[EPLB]: Correct local expert number calculation with redundant experts && add e2e test #1223
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
16bbd53
[EPLB]: Correct local expert number calculation with redundant experts
ZhengWG 30852a6
[CI]: add e2e test case for eplb
ZhengWG 0d2d7f2
reformat
ZhengWG d1032fb
change test model
ZhengWG cb0917f
[CI]: adjust device num
ZhengWG d74bd9f
[CI]: enable ci run
ZhengWG File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,166 @@ | ||
| import json | ||
| import os | ||
| import random | ||
| import signal | ||
| import subprocess | ||
| import time | ||
|
|
||
| import psutil | ||
| import pytest | ||
| import requests | ||
|
|
||
|
|
||
def kill_process_and_children(pid):
    """Terminate the process *pid* together with all of its descendants.

    Descendants are killed first (collected recursively via psutil),
    then the parent itself.  A process that has already exited is
    silently ignored.
    """
    try:
        proc = psutil.Process(pid)
        for descendant in proc.children(recursive=True):
            print(f"Killing child process {descendant.pid}")
            descendant.kill()
        print(f"Killing parent process {pid}")
        proc.kill()
    except psutil.NoSuchProcess:
        # Target vanished on its own — nothing left to do.
        pass
|
|
||
|
|
||
def kill_all_vllm_related():
    """Best-effort sweep that kills every vllm/proxy/engine_worker process.

    Scans all visible processes (skipping the current one) and kills any
    whose command line mentions "vllm", "proxy" or "engine_worker",
    along with its children.  Per-process errors (races, permission
    denied) are deliberately swallowed so the sweep always finishes.
    """
    current_pid = os.getpid()

    for proc in psutil.process_iter(['pid', 'cmdline']):
        try:
            if proc.pid == current_pid:
                continue
            # cmdline can be None (or []) for zombies, kernel threads or
            # access-denied processes; treat that as "no match" instead of
            # relying on the broad except to swallow the TypeError from
            # ' '.join(None).
            cmd = ' '.join(proc.info['cmdline'] or [])
            if "vllm" in cmd or "proxy" in cmd or "engine_worker" in cmd:
                kill_process_and_children(proc.pid)
        except Exception:
            # Best effort: never let one bad process abort the cleanup.
            continue
|
|
||
|
|
||
def build_expert_map(expert_map_path,
                     num_redundant_expert=0,
                     num_layer=2,
                     num_device=4,
                     num_original_expert=256,
                     random_seed=42):
    """Write a randomized static expert-placement map to *expert_map_path*.

    Builds the expert id pool (optionally extended with randomly chosen
    redundant experts), then for every MoE layer shuffles the pool and
    deals it out evenly across *num_device* devices.  The result is a
    JSON document with "moe_layer_count", "device_count" and
    "layer_list" keys.  Output is deterministic for a given seed.
    """
    random.seed(random_seed)
    experts = list(range(num_original_expert))
    if num_redundant_expert > 0:
        # Redundant slots duplicate randomly picked original experts.
        experts += random.choices(experts, k=num_redundant_expert)
    per_device = len(experts) // num_device

    layers = []
    for layer_id in range(num_layer):
        # Fresh random placement per layer.
        random.shuffle(experts)
        devices = [{
            "device_id": dev,
            "device_expert": experts[dev * per_device:(dev + 1) * per_device],
        } for dev in range(num_device)]
        layers.append({
            "layer_id": layer_id,
            "device_count": num_device,
            "device_list": devices,
        })

    with open(expert_map_path, "w") as fp:
        json.dump({
            "moe_layer_count": num_layer,
            "device_count": num_device,
            "layer_list": layers,
        }, fp)
|
|
||
|
|
||
def is_port_in_use(port):
    """Return True when something is accepting TCP connections on *port*.

    Probes 127.0.0.1 with connect_ex; a zero errno means the connect
    succeeded and therefore the port is occupied.
    """
    import socket
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        return probe.connect_ex(("127.0.0.1", port)) == 0
    finally:
        probe.close()
|
|
||
|
|
||
def ensure_port_available(port, timeout=30):
    """Poll until *port* is free, giving up after *timeout* seconds.

    Returns True as soon as nothing is listening on the port, False if
    the deadline passes while the port is still occupied.  Re-checks
    every two seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not is_port_in_use(port):
            return True
        print(f"Port {port} is still in use, waiting...")
        time.sleep(2)
    return False
|
|
||
|
|
||
def wait_for_port(port, timeout=30):
    """Block until a TCP server answers on 127.0.0.1:*port*.

    Re-probes once per second and returns True on the first successful
    connect.  Raises TimeoutError when *timeout* seconds elapse without
    the port becoming reachable.
    """
    import socket
    deadline = time.time() + timeout
    while time.time() < deadline:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            reachable = probe.connect_ex(("127.0.0.1", port)) == 0
        if reachable:
            return True
        time.sleep(1)
    raise TimeoutError(f"Port {port} not ready after {timeout}s")
|
|
||
|
|
||
# Launcher script that brings up the serving stack under test.
SCRIPT_PATH = os.path.abspath("./tests/e2e/run_eplb.sh")
# Port the proxy/server listens on; must stay in sync with the
# REGISTER_PORT used by run_eplb.sh.
PROXY_PORT = 10102
# Path where build_expert_map() writes the static expert placement
# consumed by the launched server.
EXPERT_MAP_PATH = "./tests/e2e/eplb/expert_map.json"
|
|
||
|
|
||
@pytest.mark.parametrize("num_redundant_expert", [0, 4])
def test_eplb_with_redundant_expert(num_redundant_expert):
    """End-to-end smoke test of static EPLB expert placement.

    Generates an expert map (optionally with redundant experts),
    launches the serving stack via run_eplb.sh, sends one completion
    request through the proxy port and asserts a non-empty generation
    comes back.  All spawned processes and the generated map file are
    cleaned up regardless of outcome.
    """
    # Ensure the proxy port is free before starting; a previous test's
    # server may still be shutting down.
    port_wait_secs = 300
    if is_port_in_use(PROXY_PORT):
        print(
            f"Port {PROXY_PORT} is still in use from previous test, waiting for it to become available..."
        )
        if not ensure_port_available(PROXY_PORT, timeout=port_wait_secs):
            # Bug fix: message previously claimed "60 seconds" while the
            # actual wait was 300 — report the real timeout.
            pytest.skip(
                f"Port {PROXY_PORT} is still in use after waiting {port_wait_secs} seconds"
            )

    print("Launching bash script to run eplb setup...")
    build_expert_map(EXPERT_MAP_PATH,
                     num_redundant_expert=num_redundant_expert)
    proc = subprocess.Popen(["bash", SCRIPT_PATH, str(num_redundant_expert)])
    try:
        print("Waiting for proxy port to be available...")
        # Model load / graph capture can be slow; allow up to 10 minutes.
        wait_for_port(PROXY_PORT, timeout=600)

        # Single request through the OpenAI-compatible completions API.
        payload = {
            "model": "Deepseek",
            "prompt": "The future of AI is",
            "max_tokens": 64,
            "temperature": 0,
        }
        response = requests.post(
            f"http://localhost:{PROXY_PORT}/v1/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=10)
        assert response.status_code == 200, f"HTTP failed: {response.status_code}"
        result = response.json()
        print("Response:", result)
        assert "text" in result["choices"][0]
        assert len(result["choices"][0]["text"].strip()) > 0

    finally:
        # clean up subprocesses
        print("Cleaning up subprocess...")
        proc.send_signal(signal.SIGINT)
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
        if os.path.exists(EXPERT_MAP_PATH):
            os.remove(EXPERT_MAP_PATH)
        kill_all_vllm_related()

        # Give the OS time to fully release the port before the next test.
        print("Waiting for port to be fully released...")
        time.sleep(3)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,50 @@ | ||
#!/bin/bash

# NOTE(review): `-o errexit` is the long form of `-e`, so this is
# redundant (though harmless); `set -eo pipefail` may have been the
# intent — confirm with the author.
set -eo errexit

# Shared helpers (e.g. _info) live next to this script.
. $(dirname "$0")/common.sh

export VLLM_ENABLE_MC2=1
export VLLM_USE_V1=1
export TASK_QUEUE_ENABLE=1
# FIXME: unset HCCL_OP_EXPANSION_MODE to avoid the torch_air bug
unset HCCL_OP_EXPANSION_MODE

MODEL_NAME="vllm-ascend/DeepSeek-V3-Pruning"
TP_SIZE=2
DP_SIZE=2
# Must match PROXY_PORT (10102) in the Python test driver.
REGISTER_PORT=10102
ASCEND_VISIBLE_DEVICES=0,1,2,3
# First CLI argument: number of redundant experts in the expert map.
NUM_REDUNDANT_EXPERT=$1
|
|
||
|
|
||
# Launch a single `vllm serve` instance configured for static EPLB.
# Args: model_name tp_size dp_size register_port num_redundant_expert
function run_eplb_instance() {
    local model_name=$1
    local tp_size=$2
    local dp_size=$3
    local register_port=$4
    local num_redundant_expert=$5

    _info "====> Test model: $model_name"
    _info "====> TP size: $tp_size"
    _info "====> DP size: $dp_size"
    _info "====> Register port: $register_port"
    _info "====> Expert map path: ./tests/e2e/eplb/expert_map.json"
    _info "====> Num redundant expert: $num_redundant_expert"

    # expert_map_path in --additional-config must match EXPERT_MAP_PATH
    # in the Python driver, which generates the map before launch.
    ASCEND_RT_VISIBLE_DEVICES=$ASCEND_VISIBLE_DEVICES vllm serve $model_name \
        --host 0.0.0.0 \
        --port $register_port \
        --tensor-parallel-size $tp_size \
        --data-parallel-size $dp_size \
        --enable-expert-parallel \
        --served-model-name Deepseek \
        --max-model-len 8192 \
        --max-num-seqs 24 \
        --trust-remote-code \
        --additional-config '{"torchair_graph_config": {"enabled": true, "graph_batch_sizes": [24]}, "ascend_scheduler_config": {"enabled": true}, "expert_map_path": "./tests/e2e/eplb/expert_map.json"}'
}
|
|
||
|
|
||
# Entry point: run the static-EPLB serving instance with the configured
# parallelism and the redundant-expert count passed on the command line.
# Typo fix: "staic_eplb" -> "static_eplb"; arguments quoted for safety.
_info "====> Start static_eplb test"
run_eplb_instance "$MODEL_NAME" "$TP_SIZE" "$DP_SIZE" "$REGISTER_PORT" "$NUM_REDUNDANT_EXPERT"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not just use the
`self.local_num_experts` value when `log2phy` is None? It's already set by `determine_expert_map`.
(vllm-ascend/vllm_ascend/ops/fused_moe.py, lines 1065 to 1067 in fe0da59)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it should return the same value. The current implementation intentionally preserves the original logic.