@@ -11,7 +11,7 @@
 
 from tests.utils import RemoteOpenAIServer
 from tests.v1.test_utils import check_request_balancing
-from vllm.platforms import Platform
+from vllm.platforms import current_platform
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
 
@@ -96,10 +96,12 @@ def start_server(r: int, sargs: list[str]):
                 sargs,
                 auto_port=False,
                 env_dict={
-                    "CUDA_VISIBLE_DEVICES":
+                    current_platform.device_control_env_var:
                     ",".join(
-                        str(Platform.device_id_to_physical_device_id(
-                            i)) for i in range(r, r + gpus_per_node))
+                        str(
+                            current_platform.
+                            device_id_to_physical_device_id(i))
+                        for i in range(r, r + gpus_per_node))
                 })
             server.__enter__()
             if r == 0:
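Note: this hunk (and the matching env_dict hunk below) replaces the hardcoded "CUDA_VISIBLE_DEVICES" key with current_platform.device_control_env_var, which resolves to the platform's own device-visibility variable, so the test no longer assumes NVIDIA hardware. A minimal standalone sketch of the pattern; the helper name device_env is illustrative and not part of this change:

# Sketch (illustrative, not part of the diff): build the device-pinning
# env dict in a platform-agnostic way, using the same two attributes the
# hunks above rely on.
from vllm.platforms import current_platform

def device_env(rank: int, gpus_per_node: int) -> dict[str, str]:
    # Map logical device indices to physical IDs, then expose them via the
    # platform's visibility variable (e.g. CUDA_VISIBLE_DEVICES on NVIDIA,
    # HIP_VISIBLE_DEVICES on ROCm).
    ids = (str(current_platform.device_id_to_physical_device_id(i))
           for i in range(rank, rank + gpus_per_node))
    return {current_platform.device_control_env_var: ",".join(ids)}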
@@ -219,9 +221,11 @@ def start_engines_server():
                 engines_server_args,
                 auto_port=False,
                 env_dict={
-                    "CUDA_VISIBLE_DEVICES":
+                    current_platform.device_control_env_var:
                     ",".join(
-                        str(Platform.device_id_to_physical_device_id(i))
+                        str(
+                            current_platform.
+                            device_id_to_physical_device_id(i))
                         for i in range(self.dp_size * self.tp_size))
                 })
             server.__enter__()
@@ -330,7 +334,7 @@ async def make_request():
         completion = await client.completions.create(
             model=model_name,
             prompt="Hello, my name is",
-            max_tokens=10,
+            max_tokens=5,
             temperature=1.0)
 
         assert completion.id is not None
@@ -361,8 +365,11 @@ async def make_request():
     await asyncio.sleep(0.5)
 
     # Send multiple requests - internal LB should distribute across DP ranks
-    num_requests = 50
-    all_tasks = [make_request() for _ in range(num_requests)]
+    num_requests = 200
+    all_tasks = []
+    for _ in range(num_requests):
+        all_tasks.append(asyncio.create_task(make_request()))
+        await asyncio.sleep(0.01)
 
     results = await asyncio.gather(*all_tasks)
    assert len(results) == num_requests
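Note: this hunk, and every burst hunk after it, changes the test from gathering bare coroutines to creating tasks with a 10 ms stagger, so the 200 requests arrive as a stream rather than one simultaneous spike and the load balancer has something to balance. A standalone sketch of the pattern, with make_request standing in for the test's request coroutine:

# Sketch (illustrative): the staggered-burst pattern used in these hunks.
import asyncio

async def staggered_burst(make_request, num_requests: int = 200,
                          stagger_s: float = 0.01) -> list:
    # create_task schedules each request as soon as it is created; the short
    # sleep spreads arrivals over roughly num_requests * stagger_s seconds.
    tasks = []
    for _ in range(num_requests):
        tasks.append(asyncio.create_task(make_request()))
        await asyncio.sleep(stagger_s)
    # Wait for all in-flight requests and return their results in order.
    return await asyncio.gather(*tasks)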
@@ -371,7 +378,10 @@ async def make_request():
     await asyncio.sleep(0.5)
 
     # Second burst of requests
-    all_tasks = [make_request() for _ in range(num_requests)]
+    all_tasks = []
+    for _ in range(num_requests):
+        all_tasks.append(asyncio.create_task(make_request()))
+        await asyncio.sleep(0.01)
 
     results = await asyncio.gather(*all_tasks)
     assert len(results) == num_requests
@@ -449,8 +459,11 @@ async def make_streaming_request():
 
     # Send multiple streaming requests - internal LB should distribute across
     # DP ranks
-    num_requests = 50
-    all_tasks = [make_streaming_request() for _ in range(num_requests)]
+    num_requests = 200
+    all_tasks = []
+    for _ in range(num_requests):
+        all_tasks.append(asyncio.create_task(make_streaming_request()))
+        await asyncio.sleep(0.01)
 
     results = await asyncio.gather(*all_tasks)
     assert len(results) == num_requests
@@ -459,7 +472,10 @@ async def make_streaming_request():
     await asyncio.sleep(0.5)
 
     # Second burst of streaming requests
-    all_tasks = [make_streaming_request() for _ in range(num_requests)]
+    all_tasks = []
+    for _ in range(num_requests):
+        all_tasks.append(asyncio.create_task(make_streaming_request()))
+        await asyncio.sleep(0.01)
 
     results = await asyncio.gather(*all_tasks)
     assert len(results) == num_requests
@@ -492,7 +508,7 @@ async def make_request():
         completion = await api_only_client.completions.create(
             model=model_name,
             prompt="Hello, my name is",
-            max_tokens=10,
+            max_tokens=5,
             temperature=1.0)
 
         assert completion.id is not None
@@ -522,8 +538,11 @@ async def make_request():
 
     # Send multiple requests - should be distributed across engines on
     # headless server
-    num_requests = 50
-    all_tasks = [make_request() for _ in range(num_requests)]
+    num_requests = 200
+    all_tasks = []
+    for _ in range(num_requests):
+        all_tasks.append(asyncio.create_task(make_request()))
+        await asyncio.sleep(0.01)
 
     results = await asyncio.gather(*all_tasks)
     assert len(results) == num_requests
@@ -532,7 +551,10 @@ async def make_request():
     await asyncio.sleep(0.5)
 
     # Second burst of requests
-    all_tasks = [make_request() for _ in range(num_requests)]
+    all_tasks = []
+    for _ in range(num_requests):
+        all_tasks.append(asyncio.create_task(make_request()))
+        await asyncio.sleep(0.01)
 
     results = await asyncio.gather(*all_tasks)
     assert len(results) == num_requests
@@ -610,8 +632,11 @@ async def make_streaming_request():
     await asyncio.sleep(0.5)
 
     # Send multiple streaming requests - should be distributed across engines
-    num_requests = 50
-    all_tasks = [make_streaming_request() for _ in range(num_requests)]
+    num_requests = 200
+    all_tasks = []
+    for _ in range(num_requests):
+        all_tasks.append(asyncio.create_task(make_streaming_request()))
+        await asyncio.sleep(0.01)
 
     results = await asyncio.gather(*all_tasks)
     assert len(results) == num_requests
@@ -620,7 +645,10 @@ async def make_streaming_request():
     await asyncio.sleep(0.5)
 
     # Second burst of streaming requests
-    all_tasks = [make_streaming_request() for _ in range(num_requests)]
+    all_tasks = []
+    for _ in range(num_requests):
+        all_tasks.append(asyncio.create_task(make_streaming_request()))
+        await asyncio.sleep(0.01)
 
     results = await asyncio.gather(*all_tasks)
     assert len(results) == num_requests