
Commit 54a1d4b

[Frontend] Add a random prefix to client-provided request IDs
Since vllm-project#9550 and vllm-project#10968 we support clients supplying a custom request ID. The motivation for this is that it can be very helpful when you need to correlate vLLM logs with the logs of a related service.

Since the request ID is used ubiquitously across vLLM as a unique key, it is obviously problematic if we ever have multiple in-flight requests using the same client-provided request ID. We saw this happening recently when `vllm serve bench` started including a request ID and the request IDs from multiple concurrent instances caused collisions. See vllm-project#27723.

We currently try to guard against request ID collisions in the frontend, in OutputProcessor:

```
def add_request(...):
    if request_id in self.request_states:
        raise ValueError(f"Request id {request_id} already running.")
```

However, this is not always effective:

1) We can have abort race conditions where a request is no longer tracked by the frontend but still not completed in the engine. See vllm-project#15326 for an attempt to fix this.

2) With P/D, a request will continue to be tracked by the prefill engine long after the prefill request has been completed in the frontend, while we wait for the decode side to fetch the KV blocks.

Let's instead ensure we use a unique request ID internally, even when a client provides a custom request ID. We can do this simply by prepending a short random prefix, given that we already add a prefix to the client-provided ID.

A full 32-character random UUID would be overkill as a prefix, so how many random characters would be sufficient? 8 hex characters give us 32 bits of entropy, or 16^8 possible prefixes. Using the collision probability approximation from https://preshing.com/20110504/hash-collision-probabilities: with N = 16^8 and k the number of generated prefixes, the probability of collision is approximately k^2 / (2N). So if a client somehow caused vLLM to hold 10k requests that reuse the same client-provided ID, there would be a 1.16% chance of collision:

```
>>> N = 16**8
>>> k = 10_000
>>> (k**2) / (2*N)
0.011641532182693481
```

That seems [super good enough](https://hownot2.com/products/hownot2-super-good-enough-t-shirt).

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
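As a sanity check on that back-of-envelope estimate, the same birthday-bound approximation can be evaluated for a few other request counts. This snippet is not part of the commit, and the values of k are illustrative assumptions:

```
# Birthday-bound approximation p ≈ k^2 / (2N) for an 8-hex-character prefix.
# Only meaningful while k^2 is small relative to 2N.
N = 16**8  # 2^32 possible 8-character hex prefixes
for k in (1_000, 10_000, 50_000):
    print(f"{k:>6} requests reusing one client ID -> ~{k**2 / (2 * N):.4%} collision chance")
```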
1 parent: d200a33

9 files changed (+28, -21 lines)


vllm/entrypoints/openai/serving_chat.py

Lines changed: 2 additions & 2 deletions
@@ -256,8 +256,8 @@ async def create_chat_completion(
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")

-        request_id = (
-            f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
+        request_id = self._internal_request_id(
+            raw_request, "chatcmpl", request.request_id
         )

         request_metadata = RequestResponseMetadata(request_id=request_id)

vllm/entrypoints/openai/serving_classification.py

Lines changed: 1 addition & 1 deletion
@@ -145,7 +145,7 @@ async def create_classify(
         raw_request: Request,
     ) -> ClassificationResponse | ErrorResponse:
         model_name = self.models.model_name()
-        request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, self.request_id_prefix)

         ctx = ClassificationServeContext(
             request=request,

vllm/entrypoints/openai/serving_completion.py

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ async def create_completion(
                 "prompt_logprobs is not compatible with prompt embeds."
             )

-        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
+        request_id = self._internal_request_id(raw_request, "cmpl", request.request_id)
         created_time = int(time.time())

         request_metadata = RequestResponseMetadata(request_id=request_id)

vllm/entrypoints/openai/serving_embedding.py

Lines changed: 2 additions & 3 deletions
@@ -648,9 +648,8 @@ async def create_embedding(
         for the API specification. This API mimics the OpenAI Embedding API.
         """
         model_name = self.models.model_name()
-        request_id = (
-            f"{self.request_id_prefix}-"
-            f"{self._base_request_id(raw_request, request.request_id)}"
+        request_id = self._internal_request_id(
+            raw_request, self.request_id_prefix, request.request_id
         )

         ctx = EmbeddingServeContext(

vllm/entrypoints/openai/serving_engine.py

Lines changed: 16 additions & 8 deletions
@@ -1280,15 +1280,23 @@ async def _get_trace_headers(
         return None

     @staticmethod
-    def _base_request_id(
-        raw_request: Request | None, default: str | None = None
-    ) -> str | None:
-        """Pulls the request id to use from a header, if provided"""
-        default = default or random_uuid()
-        if raw_request is None:
-            return default
+    def _internal_request_id(
+        raw_request: Request | None, prefix: str, client_request_id: str | None = None
+    ) -> str:
+        """Construct an internal request ID.

-        return raw_request.headers.get("X-Request-Id", default)
+        If the client provides a request ID - either via an explicit field
+        in the request (e.g. CompletionRequest.request_id) or via an
+        X-Request-Id header, we include this in the internal request ID
+        along with a per-endpoint prefix and 8 random characters to ensure
+        uniqueness.
+        """
+        if raw_request is not None and "X-Request-Id" in raw_request.headers:
+            client_request_id = raw_request.headers["X-Request-Id"]
+        if client_request_id:
+            return f"{prefix}-{random_uuid()[:8]}-{client_request_id}"
+        else:
+            return f"{prefix}-{random_uuid()}"

     @staticmethod
     def _get_data_parallel_rank(raw_request: Request | None) -> int | None:
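To make the new helper's behaviour concrete, here is a small self-contained sketch that mirrors the logic above using the standard `uuid` module. The function name and the way the header value is handled are simplifications for illustration, and it assumes `random_uuid()` is equivalent to `uuid.uuid4().hex`:

```
import uuid


def internal_request_id(prefix: str, client_request_id: str | None = None) -> str:
    # Mirrors the diff above: with a client-provided ID, prepend the endpoint
    # prefix plus 8 random hex characters; otherwise use a full random UUID.
    if client_request_id:
        return f"{prefix}-{uuid.uuid4().hex[:8]}-{client_request_id}"
    return f"{prefix}-{uuid.uuid4().hex}"


print(internal_request_id("chatcmpl"))                 # chatcmpl-<32 hex chars>
print(internal_request_id("chatcmpl", "my-trace-id"))  # chatcmpl-<8 hex chars>-my-trace-id
```

Two concurrent requests that reuse the same client-provided ID therefore still map to distinct internal keys, while the client's ID remains greppable in the logs.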

vllm/entrypoints/openai/serving_pooling.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ async def create_pooling(

         model_name = self.models.model_name()

-        request_id = f"pool-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "pool")
         created_time = int(time.time())

         is_io_processor_request = isinstance(request, IOProcessorRequest)

vllm/entrypoints/openai/serving_score.py

Lines changed: 2 additions & 2 deletions
@@ -350,7 +350,7 @@ async def create_score(
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"score-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "score")
         created_time = int(time.time())

         try:
@@ -392,7 +392,7 @@ async def do_rerank(
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"rerank-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "rerank")
         documents = request.documents
         top_n = (
             request.top_n

vllm/entrypoints/openai/serving_tokenization.py

Lines changed: 2 additions & 2 deletions
@@ -59,7 +59,7 @@ async def create_tokenize(
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"tokn-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "tokn")

         try:
             lora_request = self._maybe_get_adapters(request)
@@ -134,7 +134,7 @@ async def create_detokenize(
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"tokn-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "tokn")

         lora_request = self._maybe_get_adapters(request)

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ async def _create_speech_to_text(
                 "Currently only support response_format `text` or `json`"
             )

-        request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, self.task_type)

         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
