We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 01b6113 commit bd7599d (Copy full SHA for bd7599d)
vllm/v1/worker/tpu_model_runner.py
@@ -862,7 +862,9 @@ def capture_model(self) -> None:
862
out = self.model.sample_from_hidden(dummy_hidden,
863
sampling_meta)
864
out = out.cpu()
865
- if num_reqs_to_sample >= self.max_num_reqs:
+ # Requests can't be more than tokens. But do compile for the
866
+ # next bigger value in case num_tokens uses bucketed padding.
867
+ if num_reqs_to_sample >= min(num_tokens, self.max_num_reqs):
868
break
869
# Make sure to compile the `max_num_reqs` upper-limit case
870
num_reqs_to_sample = _get_padded_num_reqs_with_upper_limit(
0 commit comments