1+ import json
12import os
2- import time
3- import subprocess
3+ import random
44import signal
5+ import subprocess
6+ import time
7+
58import psutil
6- import requests
79import pytest
8- import random
9- import json
10+ import requests
1011
1112
1213def kill_process_and_children (pid ):
@@ -36,13 +37,17 @@ def kill_all_vllm_related():
3637 continue
3738
3839
39- def build_expert_map (expert_map_path , num_redundant_expert = 0 ,
40- num_layer = 58 , num_device = 16 , num_original_expert = 256 ,
40+ def build_expert_map (expert_map_path ,
41+ num_redundant_expert = 0 ,
42+ num_layer = 58 ,
43+ num_device = 16 ,
44+ num_original_expert = 256 ,
4145 random_seed = 42 ):
4246 expert_num_list = list (range (num_original_expert ))
4347 random .seed (random_seed )
4448 if num_redundant_expert > 0 :
45- expert_num_list = expert_num_list + random .choices (expert_num_list , k = num_redundant_expert )
49+ expert_num_list = expert_num_list + random .choices (
50+ expert_num_list , k = num_redundant_expert )
4651 local_num_expert = len (expert_num_list ) // num_device
4752
4853 expert_map = {
@@ -52,16 +57,21 @@ def build_expert_map(expert_map_path, num_redundant_expert=0,
5257 }
5358 for layer_id in range (num_layer ):
5459 random .shuffle (expert_num_list )
55- current_expert_distribution = [expert_num_list [i * local_num_expert :(i + 1 )* local_num_expert ] for i in range (num_device )]
60+ current_expert_distribution = [
61+ expert_num_list [i * local_num_expert :(i + 1 ) * local_num_expert ]
62+ for i in range (num_device )
63+ ]
5664 layer_info = {
5765 "layer_id" : layer_id ,
5866 "device_count" : num_device ,
5967 "device_list" : []
6068 }
6169 for device_id in range (num_device ):
6270 layer_info ["device_list" ].append ({
63- "device_id" : device_id ,
64- "device_expert" : current_expert_distribution [device_id ]
71+ "device_id" :
72+ device_id ,
73+ "device_expert" :
74+ current_expert_distribution [device_id ]
6575 })
6676 expert_map ["layer_list" ].append (layer_info )
6777 with open (expert_map_path , "w" ) as f :
@@ -77,7 +87,6 @@ def is_port_in_use(port):
7787
7888def ensure_port_available (port , timeout = 30 ):
7989 """Wait for a port to become available."""
80- import socket
8190 start = time .time ()
8291 while time .time () - start < timeout :
8392 if not is_port_in_use (port ):
@@ -107,12 +116,16 @@ def wait_for_port(port, timeout=30):
107116def test_eplb_with_redundant_expert (num_redundant_expert ):
108117 # Ensure port is available before starting the test
109118 if is_port_in_use (PROXY_PORT ):
110- print (f"Port { PROXY_PORT } is still in use from previous test, waiting for it to become available..." )
119+ print (
120+ f"Port { PROXY_PORT } is still in use from previous test, waiting for it to become available..."
121+ )
111122 if not ensure_port_available (PROXY_PORT , timeout = 300 ):
112- pytest .skip (f"Port { PROXY_PORT } is still in use after waiting 60 seconds" )
113-
123+ pytest .skip (
124+ f"Port { PROXY_PORT } is still in use after waiting 60 seconds" )
125+
114126 print ("Launching bash script to run eplb setup..." )
115- build_expert_map (EXPERT_MAP_PATH , num_redundant_expert = num_redundant_expert )
127+ build_expert_map (EXPERT_MAP_PATH ,
128+ num_redundant_expert = num_redundant_expert )
116129 proc = subprocess .Popen (["bash" , SCRIPT_PATH , str (num_redundant_expert )])
117130 try :
118131 print ("Waiting for proxy port to be available..." )
@@ -135,7 +148,7 @@ def test_eplb_with_redundant_expert(num_redundant_expert):
135148 print ("Response:" , result )
136149 assert "text" in result ["choices" ][0 ]
137150 assert len (result ["choices" ][0 ]["text" ].strip ()) > 0
138-
151+
139152 finally :
140153 # clean up subprocesses
141154 print ("Cleaning up subprocess..." )
@@ -147,7 +160,7 @@ def test_eplb_with_redundant_expert(num_redundant_expert):
147160 if os .path .exists (EXPERT_MAP_PATH ):
148161 os .remove (EXPERT_MAP_PATH )
149162 kill_all_vllm_related ()
150-
163+
151164 # Wait for port to be fully released
152165 print ("Waiting for port to be fully released..." )
153- time .sleep (3 )
166+ time .sleep (3 )
0 commit comments