
Commit 54a1d4b

[Frontend] Add a random prefix to client-provided request IDs
Since vllm-project#9550 and vllm-project#10968 we support clients supplying a custom request ID. The motivation for this is that it can be very helpful when you need to correlate vLLM logs with the logs of a related service.

Since the request ID is used ubiquitously across vLLM as a unique key, it is obviously problematic if we ever have multiple in-flight requests using the same client-provided request ID. We saw this happening recently when `vllm serve bench` started including a request ID and the request IDs from multiple concurrent instances caused collisions. See vllm-project#27723.

We currently try to guard against request ID collisions in the frontend, in OutputProcessor:

```
def add_request(...):
    if request_id in self.request_states:
        raise ValueError(f"Request id {request_id} already running.")
```

However, this is not always effective:

1) We can have abort race conditions where a request is no longer tracked by the frontend but still not completed in the engine. See vllm-project#15326 for an attempt to fix this.

2) With P/D, a request will continue to be tracked by the prefill engine long after the prefill request has been completed in the frontend, while we wait for the decode side to fetch the KV blocks.

Let's instead ensure we use a unique request ID internally, even when a client provides a custom request ID. We can do this simply by prepending a short random prefix, given that we already add a prefix to the client-provided ID.

A full 32-character random UUID would be overkill as a prefix, so how many random characters would be sufficient? 8 hex characters give us 32 bits of entropy, or 16^8 possible prefixes. Using the collision probability approximation from https://preshing.com/20110504/hash-collision-probabilities: with N = 16^8 and k the number of generated prefixes, the probability of collision is approximately k^2 / (2N). So if a client somehow caused vLLM to hold 10k requests that reuse the same client-provided ID, there would be a 1.16% chance of collision:

```
>>> N = 16**8
>>> k = 10_000
>>> (k**2) / (2*N)
0.011641532182693481
```

That seems [super good enough](https://hownot2.com/products/hownot2-super-good-enough-t-shirt).

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
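As a sanity check on that back-of-envelope estimate, the same birthday-bound approximation can be evaluated for a few other request counts. This snippet is not part of the commit, and the values of k are illustrative assumptions:

```
# Birthday-bound approximation p ≈ k^2 / (2N) for an 8-hex-character prefix.
# Only meaningful while k^2 is small relative to 2N.
N = 16**8  # 2^32 possible 8-character hex prefixes
for k in (1_000, 10_000, 50_000):
    print(f"{k:>6} requests reusing one client ID -> ~{k**2 / (2 * N):.4%} collision chance")
```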
1 parent: d200a33

9 files changed (+28, -21 lines)


vllm/entrypoints/openai/serving_chat.py

Lines changed: 2 additions & 2 deletions
@@ -256,8 +256,8 @@ async def create_chat_completion(
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")

-        request_id = (
-            f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
+        request_id = self._internal_request_id(
+            raw_request, "chatcmpl", request.request_id
         )

         request_metadata = RequestResponseMetadata(request_id=request_id)

vllm/entrypoints/openai/serving_classification.py

Lines changed: 1 addition & 1 deletion
@@ -145,7 +145,7 @@ async def create_classify(
         raw_request: Request,
     ) -> ClassificationResponse | ErrorResponse:
         model_name = self.models.model_name()
-        request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, self.request_id_prefix)

         ctx = ClassificationServeContext(
             request=request,

vllm/entrypoints/openai/serving_completion.py

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ async def create_completion(
                 "prompt_logprobs is not compatible with prompt embeds."
             )

-        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
+        request_id = self._internal_request_id(raw_request, "cmpl", request.request_id)
         created_time = int(time.time())

         request_metadata = RequestResponseMetadata(request_id=request_id)

vllm/entrypoints/openai/serving_embedding.py

Lines changed: 2 additions & 3 deletions
@@ -648,9 +648,8 @@ async def create_embedding(
         for the API specification. This API mimics the OpenAI Embedding API.
         """
         model_name = self.models.model_name()
-        request_id = (
-            f"{self.request_id_prefix}-"
-            f"{self._base_request_id(raw_request, request.request_id)}"
+        request_id = self._internal_request_id(
+            raw_request, self.request_id_prefix, request.request_id
         )

         ctx = EmbeddingServeContext(

vllm/entrypoints/openai/serving_engine.py

Lines changed: 16 additions & 8 deletions
@@ -1280,15 +1280,23 @@ async def _get_trace_headers(
         return None

     @staticmethod
-    def _base_request_id(
-        raw_request: Request | None, default: str | None = None
-    ) -> str | None:
-        """Pulls the request id to use from a header, if provided"""
-        default = default or random_uuid()
-        if raw_request is None:
-            return default
+    def _internal_request_id(
+        raw_request: Request | None, prefix: str, client_request_id: str | None = None
+    ) -> str:
+        """Construct an internal request ID.

-        return raw_request.headers.get("X-Request-Id", default)
+        If the client provides a request ID - either via an explicit field
+        in the request (e.g. CompletionRequest.request_id) or via an
+        X-Request-Id header, we include this in the internal request ID
+        along with a per-endpoint prefix and 8 random characters to ensure
+        uniqueness.
+        """
+        if raw_request is not None and "X-Request-Id" in raw_request.headers:
+            client_request_id = raw_request.headers["X-Request-Id"]
+        if client_request_id:
+            return f"{prefix}-{random_uuid()[:8]}-{client_request_id}"
+        else:
+            return f"{prefix}-{random_uuid()}"

     @staticmethod
     def _get_data_parallel_rank(raw_request: Request | None) -> int | None:
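To make the new helper's behaviour concrete, here is a small self-contained sketch that mirrors the logic above using the standard `uuid` module. The function name and the way the header value is handled are simplifications for illustration, and it assumes `random_uuid()` is equivalent to `uuid.uuid4().hex`:

```
import uuid


def internal_request_id(prefix: str, client_request_id: str | None = None) -> str:
    # Mirrors the diff above: with a client-provided ID, prepend the endpoint
    # prefix plus 8 random hex characters; otherwise use a full random UUID.
    if client_request_id:
        return f"{prefix}-{uuid.uuid4().hex[:8]}-{client_request_id}"
    return f"{prefix}-{uuid.uuid4().hex}"


print(internal_request_id("chatcmpl"))                 # chatcmpl-<32 hex chars>
print(internal_request_id("chatcmpl", "my-trace-id"))  # chatcmpl-<8 hex chars>-my-trace-id
```

Two concurrent requests that reuse the same client-provided ID therefore still map to distinct internal keys, while the client's ID remains greppable in the logs.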

vllm/entrypoints/openai/serving_pooling.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ async def create_pooling(

         model_name = self.models.model_name()

-        request_id = f"pool-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "pool")
         created_time = int(time.time())

         is_io_processor_request = isinstance(request, IOProcessorRequest)

vllm/entrypoints/openai/serving_score.py

Lines changed: 2 additions & 2 deletions
@@ -350,7 +350,7 @@ async def create_score(
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"score-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "score")
         created_time = int(time.time())

         try:
@@ -392,7 +392,7 @@ async def do_rerank(
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"rerank-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "rerank")
         documents = request.documents
         top_n = (
             request.top_n

vllm/entrypoints/openai/serving_tokenization.py

Lines changed: 2 additions & 2 deletions
@@ -59,7 +59,7 @@ async def create_tokenize(
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"tokn-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "tokn")

         try:
             lora_request = self._maybe_get_adapters(request)
@@ -134,7 +134,7 @@ async def create_detokenize(
         if error_check_ret is not None:
             return error_check_ret

-        request_id = f"tokn-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, "tokn")

         lora_request = self._maybe_get_adapters(request)

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ async def _create_speech_to_text(
                 "Currently only support response_format `text` or `json`"
             )

-        request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"
+        request_id = self._internal_request_id(raw_request, self.task_type)

         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
