@@ -76,18 +76,22 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
       seed.
     - Keep max_tokens and max_model_len bounded for speed and memory use.
     """
-    random.seed(12345)
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)
 
     # Allow overrides from environment (useful for CI tuning)
     # "facebook/opt-125m" is too small, doesn't reliably test determinism
     model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
+    #model = os.getenv("VLLM_TEST_MODEL", "ibm-research/PowerMoE-3b")
     num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5"))
-    batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "64"))
-    assert batch_size >= 2, "Batch size should be >= 2 to mix needle."
+    max_batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "128"))
+    min_random_prompt = int(os.getenv("VLLM_MIN_PROMPT", "1024"))
+    max_random_prompt = int(os.getenv("VLLM_MAX_PROMPT", "2048"))
+    assert max_batch_size >= 2, "Batch size should be >= 2 to mix needle."
 
     # Keep GPU memory usage low to avoid startup allocation failures.
-    gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.3"))
-    max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "4096"))
+    gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.4"))
+    max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "5120"))
     swap_space_gb = int(os.getenv("VLLM_SWAP_SPACE_GB", "4"))
 
     # Sampling parameters: longer outputs with a more random-sounding
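Every knob in this hunk is read from the environment, so CI can tune the needle test without touching the code. A minimal sketch of a runner that pins these variables before invoking pytest; the test path and pytest flags are assumptions, while the variable names and defaults come straight from the hunk above:

import os
import subprocess

# Assumed location of this test; adjust to wherever the file lives in the repo.
TEST_PATH = "tests/v1/generation/test_batch_invariance.py"

env = dict(os.environ)
env.update({
    "VLLM_TEST_SEED": "12345",            # seeds batch sizing and needle placement
    "VLLM_TEST_MODEL": "Qwen/Qwen3-1.7B",
    "VLLM_NEEDLE_TRIALS": "5",
    "VLLM_NEEDLE_BATCH_SIZE": "128",      # upper bound; each trial samples [64, 128]
    "VLLM_MIN_PROMPT": "1024",
    "VLLM_MAX_PROMPT": "2048",
    "VLLM_GPU_MEMORY_UTILIZATION": "0.4",
    "VLLM_MAX_MODEL_LEN": "5120",
})
subprocess.run(["pytest", "-s", TEST_PATH], env=env, check=True)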
@@ -111,7 +115,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
         # Engine with bs=1 behavior
         llm_bs1 = LLM_with_max_seqs(
             model=model,
-            max_num_seqs=1,
+            max_num_seqs=128,
             gpu_memory_utilization=gpu_mem_util,
             max_model_len=max_model_len,
             swap_space=swap_space_gb,
@@ -126,7 +130,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
         # Engine with larger batch limit (e.g., 64)
         llm_bsN = LLM_with_max_seqs(
             model=model,
-            max_num_seqs=batch_size,
+            max_num_seqs=128,
             gpu_memory_utilization=gpu_mem_util,
             max_model_len=max_model_len,
             swap_space=swap_space_gb,
@@ -135,15 +139,17 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
         mismatches = 0
 
         for trial in range(num_trials):
-            # Create a batch of size `batch_size` and insert the needle at
+            # Create a batch of size `max_batch_size` and insert the needle at
             # a random index
             prompts: list[str] = []
+            batch_size = random.randint(max_batch_size // 2, max_batch_size)
             needle_pos = random.randint(0, batch_size - 1)
             for i in range(batch_size):
                 if i == needle_pos:
                     prompts.append(needle_prompt)
                 else:
-                    prompts.append(_random_prompt())
+                    prompts.append(
+                        _random_prompt(min_random_prompt, max_random_prompt))
 
             # Generate with the larger-batch engine
             outputs = llm_bsN.generate(prompts, sampling)
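The loop above now draws each filler prompt from `_random_prompt(min_random_prompt, max_random_prompt)`; that helper sits earlier in the file and is not part of this diff. A rough sketch of what such a helper could look like, assuming it only needs to produce a prompt whose word count falls inside the given range (the word list and prefix are made up here):

import random

_FILLER_WORDS = ("alpha", "bravo", "charlie", "delta", "echo", "foxtrot",
                 "golf", "hotel", "india", "juliet", "kilo", "lima")

def _random_prompt(min_words: int = 1024, max_words: int = 2048) -> str:
    # Hypothetical stand-in: build a filler prompt whose length in words is
    # uniformly sampled from [min_words, max_words].
    n_words = random.randint(min_words, max_words)
    body = " ".join(random.choice(_FILLER_WORDS) for _ in range(n_words))
    return "Continue the story: " + body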
@@ -154,17 +160,19 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
             text = needle_output.outputs[0].text
 
             if text != baseline_text:
+                print(
+                    f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n")
                 mismatches += 1
 
         passes = num_trials - mismatches
         # Dump how many passed vs failed
         print(f"[determinism] total={num_trials}, passed={passes}, "
-              f"failed={mismatches}, batch_size={batch_size}")
+              f"failed={mismatches}, max_batch_size={max_batch_size}")
 
         if mismatches > 0:
             pytest.fail(
                 f"Nondeterministic outputs detected: {mismatches} failed out "
-                f"of {num_trials} trials (batch_size={batch_size}).")
+                f"of {num_trials} trials (max_batch_size={max_batch_size}).")
 
     finally:
         # Ensure engines are shutdown to free GPU/VRAM across test sessions
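Both engines in this test come from `LLM_with_max_seqs`, a small factory defined above the hunks shown here. Based on the call sites, it presumably just forwards these arguments to vLLM's `LLM` constructor while pinning `max_num_seqs`; a sketch under that assumption:

from vllm import LLM

def LLM_with_max_seqs(model: str, max_num_seqs: int,
                      gpu_memory_utilization: float, max_model_len: int,
                      swap_space: int) -> LLM:
    # Assumed wrapper: the only engine knob that differs between the bs=1 and
    # bs=N engines is max_num_seqs (the scheduler's max in-flight batch size).
    return LLM(
        model=model,
        max_num_seqs=max_num_seqs,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
        swap_space=swap_space,
    )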
@@ -196,9 +204,10 @@ def _extract_step_logprobs(request_output):
     not torch.cuda.is_available(),
     reason="Requires CUDA to match production inference path.",
 )
-def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():
+def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN():
 
-    #model_name = os.getenv("VLLM_TEST_MODEL", "facebook/opt-125m")
+    seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
+    random.seed(seed)
     model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
     tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
 
@@ -212,10 +221,15 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():
     prompts = [
         "The capital of France is",
         "The capital of Germany is",
+        _random_prompt(10, 10),
+        _random_prompt(10, 10),
+        _random_prompt(10, 10),
+        _random_prompt(10, 10),
+        _random_prompt(10, 10),
     ]
 
     sp = SamplingParams(
-        temperature=0.0,
+        temperature=0.6,
         top_p=1.0,
         max_tokens=8,
         # Seed shouldn't matter at temperature=0, but keeping it stable anyway.
@@ -234,25 +248,25 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():
234248 "enable logprobs return to run this test." )
235249 bs1_logprobs_per_prompt .append (step_logprobs )
236250
237- # BS=2 : run prompts in a batch and collect logprobs per step for each
251+ # BS=N : run prompts in a batch and collect logprobs per step for each
238252 # prompt.
239253 outs_batched = llm .generate (prompts , sp , use_tqdm = False )
240254 assert len (outs_batched ) == len (prompts )
241- bs2_logprobs_per_prompt = []
255+ bsN_logprobs_per_prompt = []
242256 for o in outs_batched :
243257 step_logprobs = _extract_step_logprobs (o )
244258 if step_logprobs is None :
245259 pytest .skip ("Logits are not available on RequestOutput; "
246260 "enable logprobs return to run this test." )
247- bs2_logprobs_per_prompt .append (step_logprobs )
261+ bsN_logprobs_per_prompt .append (step_logprobs )
248262
249- # Compare step-by-step logprobs for each prompt between BS=1 and BS=2 runs.
250- for i , (logprobs_bs1 , logprobs_bs2 ) in enumerate (
251- zip (bs1_logprobs_per_prompt , bs2_logprobs_per_prompt )):
252- assert len (logprobs_bs1 ) == len (logprobs_bs2 ), (
263+ # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs.
264+ for i , (logprobs_bs1 , logprobs_bsN ) in enumerate (
265+ zip (bs1_logprobs_per_prompt , bsN_logprobs_per_prompt )):
266+ assert len (logprobs_bs1 ) == len (logprobs_bsN ), (
253267 f"Different number of generation steps for prompt index { i } : "
254- f"{ len (logprobs_bs1 )} (BS=1) vs { len (logprobs_bs2 )} (BS=2 )" )
255- for t , (a , b ) in enumerate (zip (logprobs_bs1 , logprobs_bs2 )):
268+ f"{ len (logprobs_bs1 )} (BS=1) vs { len (logprobs_bsN )} (BS=N )" )
269+ for t , (a , b ) in enumerate (zip (logprobs_bs1 , logprobs_bsN )):
256270 assert a .shape == b .shape , (
257271 f"Logits shape mismatch at prompt { i } , step { t } : "
258272 f"{ a .shape } vs { b .shape } " )