diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index da6c774f439a..f8e213b9ca48 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -60,8 +60,8 @@ def _run_incremental_decode(tokenizer,
         skip_special_tokens=skip_special_tokens,
         spaces_between_special_tokens=spaces_between_special_tokens,
     )
-    request = EngineCoreRequest("", "", prompt_token_ids, None, None, None,
-                                params, None, 0.0, None)
+    request = EngineCoreRequest("", prompt_token_ids, None, None, None, params,
+                                None, 0.0, None)
 
     if fast is None:
         detokenizer = IncrementalDetokenizer.from_new_request(
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index e2f8fd1999c4..8e8cb27d42cf 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -37,7 +37,6 @@ def make_request(request_id,
 
     return Request(
         request_id=request_id,
-        prompt=None,
         prompt_token_ids=prompt_token_ids,
         multi_modal_inputs=multi_modal_inputs,
         multi_modal_hashes=mm_hashes,
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index 1b238d47c03a..42b769e125c0 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -29,7 +29,6 @@ def make_request(request_id,
 
     return Request(
         request_id=request_id,
-        prompt=None,
         prompt_token_ids=prompt_token_ids,
         multi_modal_inputs=multi_modal_inputs,
         multi_modal_hashes=mm_hashes,
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 591284ec8541..ee4e95856f23 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -132,7 +132,6 @@ def create_requests(num_requests: int,
             mm_inputs = None
         request = Request(
             request_id=f"{i}",
-            prompt=None,
             prompt_token_ids=[i] * num_tokens,
             sampling_params=sampling_params,
             multi_modal_inputs=mm_inputs,
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 047ff774cadc..30fa9e371ad1 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -31,8 +31,7 @@ def make_request() -> EngineCoreRequest:
     return EngineCoreRequest(
-        request_id=uuid.uuid4(),
-        prompt=PROMPT,
+        request_id=str(uuid.uuid4()),
         prompt_token_ids=PROMPT_TOKENS,
         mm_inputs=None,
         mm_hashes=None,
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index 8ebdaf63b484..8cc36fa163f7 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -35,7 +35,6 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
     return EngineCoreRequest(
         request_id=str(uuid.uuid4()),
-        prompt=PROMPT,
         prompt_token_ids=PROMPT_TOKENS,
         mm_inputs=None,
         mm_hashes=None,
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index f8d96caf1a27..d2bb7d88fef2 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -50,7 +50,6 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
     # Make N requests.
     requests = [
         EngineCoreRequest(request_id=f"request-{idx}",
-                          prompt=prompt,
                           prompt_token_ids=prompt_tokens,
                           arrival_time=0,
                           mm_inputs=None,
@@ -64,14 +63,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
                               output_kind=request_output_kind,
                               stop=[],
                               include_stop_str_in_output=False,
-                          )) for idx, (prompt, prompt_tokens) in enumerate(
-                              zip(dummy_test_vectors.prompt_strings,
-                                  dummy_test_vectors.prompt_tokens))
+                          ))
+        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
     # Add requests to the detokenizer.
-    for request in requests:
-        output_processor.add_request(request)
+    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
+        output_processor.add_request(request, prompt)
 
     gen_strings = {}
     gen_tokens = {}
@@ -398,7 +396,6 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
     ]
     requests = [
         EngineCoreRequest(request_id=request_id_list[idx],
-                          prompt=prompt,
                           prompt_token_ids=prompt_tokens,
                           arrival_time=0,
                           mm_inputs=None,
@@ -414,14 +411,13 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
                               include_stop_str_in_output=False,
                               logprobs=num_sample_logprobs,
                               prompt_logprobs=num_prompt_logprobs,
-                          )) for idx, (prompt, prompt_tokens) in enumerate(
-                              zip(dummy_test_vectors.prompt_strings,
-                                  dummy_test_vectors.prompt_tokens))
+                          ))
+        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
     # Add requests to the detokenizer.
-    for request in requests:
-        output_processor.add_request(request)
+    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
+        output_processor.add_request(request, prompt)
 
     gen_tokens = {}
     gen_logprobs = {}
@@ -562,7 +558,6 @@ def test_stop_token(include_stop_str_in_output: bool,
     request_id = "request-0"
     request = EngineCoreRequest(
         request_id=request_id,
-        prompt=prompt_string,
         prompt_token_ids=prompt_tokens,
         arrival_time=0,
         mm_inputs=None,
@@ -583,7 +578,7 @@ def test_stop_token(include_stop_str_in_output: bool,
         ))
 
     # Add request to the detokenizer.
-    output_processor.add_request(request)
+    output_processor.add_request(request, prompt_string)
 
     # Loop over engine core steps; run output processor
     gen_string = ""
@@ -659,7 +654,6 @@ def test_stop_string(include_stop_str_in_output: bool,
     requests = [
         EngineCoreRequest(
             request_id=request_id_list[idx],
-            prompt=prompt,
             prompt_token_ids=prompt_tokens,
             arrival_time=0,
             mm_inputs=None,
@@ -675,14 +669,13 @@ def test_stop_string(include_stop_str_in_output: bool,
                 include_stop_str_in_output=include_stop_str_in_output,
                 logprobs=num_sample_logprobs,
                 prompt_logprobs=None,
-            )) for idx, (prompt, prompt_tokens) in enumerate(
-                zip(dummy_test_vectors.prompt_strings,
-                    dummy_test_vectors.prompt_tokens))
+            ))
+        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
     # Add requests to the detokenizer.
-    for request in requests:
-        output_processor.add_request(request)
+    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
+        output_processor.add_request(request, prompt)
 
     gen_strings = {}
     gen_tokens = {}
@@ -774,7 +767,6 @@ def test_iteration_stats(dummy_test_vectors):
     requests = [
         EngineCoreRequest(
             request_id=f"request-{idx}",
-            prompt=prompt,
             prompt_token_ids=prompt_tokens,
             arrival_time=0,
             mm_inputs=None,
@@ -783,15 +775,13 @@ def test_iteration_stats(dummy_test_vectors):
             eos_token_id=None,
             lora_request=None,
             sampling_params=SamplingParams(),
-        ) for idx, (prompt, prompt_tokens) in enumerate(
-            zip(dummy_test_vectors.prompt_strings,
-                dummy_test_vectors.prompt_tokens))
+        ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
     ]
 
     # Add all requests except one to the OutputProcessor.
     num_active = len(dummy_test_vectors.generation_tokens) - 1
     for request in requests[:num_active]:
-        output_processor.add_request(request)
+        output_processor.add_request(request, None)
     inactive_request = requests[num_active]
 
     # First iteration has 2 prefills.
@@ -817,7 +807,7 @@ def test_iteration_stats(dummy_test_vectors):
     assert iteration_stats.num_generation_tokens == num_active
 
     # Add a new request - prefill and 2 decodes in this step.
-    output_processor.add_request(inactive_request)
+    output_processor.add_request(inactive_request, None)
     num_active += 1
     outputs = engine_core.get_outputs()[:num_active]
     iteration_stats = IterationStats()
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 5db606093569..319b38b4ca09 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -77,7 +77,6 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
             NewRequestData(
                 req_id=req_id,
                 prompt_token_ids=[1, 2, 3],
-                prompt="test",
                 mm_inputs=[],
                 mm_hashes=[],
                 mm_positions=[],
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 2486c26c6071..915ec2914a82 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -195,7 +195,6 @@ def _construct_cached_request_state(req_id_suffix: int):
     return CachedRequestState(
         req_id=f"req_id_{req_id_suffix}",
         prompt_token_ids=prompt_token_ids,
-        prompt=None,
         sampling_params=_create_sampling_params(),
         mm_inputs=[],
         mm_positions=[],
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index dd95a7f53064..68e34cfacc58 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -50,7 +50,6 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
             NewRequestData(
                 req_id=req_id,
                 prompt_token_ids=[1, 2, 3],
-                prompt="test",
                 mm_inputs=[],
                 mm_hashes=[],
                 mm_positions=[],
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index 1d3f1f41f8fb..928fb231a1f2 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -22,7 +22,6 @@ class NewRequestData:
 
     req_id: str
     prompt_token_ids: list[int]
-    prompt: Optional[str]
     mm_inputs: list[MultiModalKwargs]
     mm_hashes: list[str]
     mm_positions: list[PlaceholderRange]
@@ -40,7 +39,6 @@ def from_request(
         return cls(
             req_id=request.request_id,
             prompt_token_ids=request.prompt_token_ids,
-            prompt=request.prompt,
             mm_inputs=request.mm_inputs,
             mm_hashes=request.mm_hashes,
             mm_positions=request.mm_positions,
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 5f5675b955fa..0474669610cd 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -49,9 +49,6 @@ class EngineCoreRequest(
 
     # due to circular imports and typing we have in data.py
     request_id: str
-    # NOTE(ywang96): original text prompt is needed when a request is added to
-    # Detokenizer, but set to None when it is added to EngineCoreClient.
-    prompt: Optional[str]
     prompt_token_ids: list[int]
     mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]]
    mm_hashes: Optional[list[str]]
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index c33535b3d360..944efcbc123a 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -203,14 +203,12 @@ async def add_request(
         queue = RequestOutputCollector(output_kind=params.output_kind)
 
         # Convert Input --> Request.
-        request = self.processor.process_inputs(request_id, prompt, params,
-                                                arrival_time, lora_request,
-                                                trace_headers,
-                                                prompt_adapter_request,
-                                                priority)
+        prompt_str, request = self.processor.process_inputs(
+            request_id, prompt, params, arrival_time, lora_request,
+            trace_headers, prompt_adapter_request, priority)
 
         if params.n == 1:
-            await self._add_request(request, None, 0, queue)
+            await self._add_request(request, prompt_str, None, 0, queue)
             return queue
 
         # Fan out child requests (for n>1).
@@ -220,15 +218,18 @@ async def add_request(
             child_request = request if idx == params.n - 1 else copy(request)
             child_request.request_id = request_id
             child_request.sampling_params = params
-            await self._add_request(child_request, parent_request, idx, queue)
+            await self._add_request(child_request, prompt_str, parent_request,
+                                    idx, queue)
         return queue
 
     async def _add_request(self, request: EngineCoreRequest,
+                           prompt: Optional[str],
                            parent_req: Optional[ParentRequest], index: int,
                            queue: RequestOutputCollector):
 
         # Add the request to OutputProcessor (this process).
-        self.output_processor.add_request(request, parent_req, index, queue)
+        self.output_processor.add_request(request, prompt, parent_req, index,
+                                          queue)
 
         # Add the EngineCoreRequest to EngineCore (separate process).
         await self.engine_core.add_request_async(request)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 21ad446172a3..2a619bd08eec 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -559,9 +559,6 @@ def call_utility(self, method: str, *args) -> Any:
         return future.result()
 
     def add_request(self, request: EngineCoreRequest) -> None:
-        # NOTE: text prompt is not needed in the core engine as it has been
-        # tokenized.
-        request.prompt = None
         self._send_input(EngineCoreRequestType.ADD, request)
 
     def abort_requests(self, request_ids: list[str]) -> None:
@@ -729,9 +726,6 @@ async def _call_utility_async(self, method: str, *args,
         return await future
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
-        # NOTE: text prompt is not needed in the core engine as it has been
-        # tokenized.
-        request.prompt = None
         await self._send_input(EngineCoreRequestType.ADD, request)
         self._ensure_output_queue_task()
 
@@ -824,9 +818,6 @@ async def call_utility_async(self, method: str, *args) -> Any:
         ]))[0]
 
     async def add_request_async(self, request: EngineCoreRequest) -> None:
-        # NOTE: text prompt is not needed in the core engine as it has been
-        # tokenized.
-        request.prompt = None
         request.current_wave = self.current_wave
 
         chosen_engine = self.get_core_engine_for_request()
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index a07595a552af..ee293d520fad 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -180,17 +180,15 @@ def add_request(
         priority: int = 0,
     ) -> None:
         # Process raw inputs into the request.
-        request = self.processor.process_inputs(request_id, prompt, params,
-                                                arrival_time, lora_request,
-                                                trace_headers,
-                                                prompt_adapter_request,
-                                                priority)
+        prompt_str, request = self.processor.process_inputs(
+            request_id, prompt, params, arrival_time, lora_request,
+            trace_headers, prompt_adapter_request, priority)
 
         n = params.n if isinstance(params, SamplingParams) else 1
 
         if n == 1:
             # Make a new RequestState and queue.
-            self.output_processor.add_request(request, None, 0)
+            self.output_processor.add_request(request, prompt_str, None, 0)
             # Add the request to EngineCore.
             self.engine_core.add_request(request)
             return
@@ -204,7 +202,8 @@ def add_request(
             child_request.sampling_params = params
 
             # Make a new RequestState and queue.
-            self.output_processor.add_request(child_request, parent_req, idx)
+            self.output_processor.add_request(child_request, prompt_str,
+                                              parent_req, idx)
 
             # Add the request to EngineCore.
             self.engine_core.add_request(child_request)
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 9c42d34b8a9a..f76c44cb8bca 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -109,6 +109,7 @@ def from_new_request(
         cls,
         tokenizer: AnyTokenizer,
         request: EngineCoreRequest,
+        prompt: Optional[str],
         parent_req: Optional[ParentRequest],
         request_index: int,
         queue: Optional[RequestOutputCollector],
@@ -123,7 +124,7 @@ def from_new_request(
             lora_name=(request.lora_request.name
                        if request.lora_request is not None else None),
             output_kind=request.sampling_params.output_kind,
-            prompt=request.prompt,
+            prompt=prompt,
             prompt_token_ids=request.prompt_token_ids,
             logprobs_processor=LogprobsProcessor.from_new_request(
                 tokenizer=tokenizer,
@@ -267,6 +268,7 @@ def abort_requests(
     def add_request(
         self,
         request: EngineCoreRequest,
+        prompt: Optional[str],
         parent_req: Optional[ParentRequest] = None,
         request_index: int = 0,
         queue: Optional[RequestOutputCollector] = None,
@@ -278,6 +280,7 @@ def add_request(
         req_state = RequestState.from_new_request(
             tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
             request=request,
+            prompt=prompt,
             parent_req=parent_req,
             request_index=request_index,
             queue=queue,
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index af09a67635a2..fa334302e781 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -202,7 +202,7 @@ def process_inputs(
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> EngineCoreRequest:
+    ) -> tuple[Optional[str], EngineCoreRequest]:
 
         # TODO(woosuk): Support pooling models.
         # TODO(woosuk): Support encoder-decoder models.
@@ -306,9 +306,8 @@ def process_inputs(
         else:
             sorted_mm_inputs = orig_sorted_mm_inputs
 
-        return EngineCoreRequest(
+        return decoder_inputs.get("prompt"), EngineCoreRequest(
             request_id=request_id,
-            prompt=decoder_inputs.get("prompt"),
             prompt_token_ids=decoder_inputs["prompt_token_ids"],
             mm_inputs=sorted_mm_inputs,
             mm_hashes=sorted_mm_hashes,
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 6be72431dde5..3b9b666f936a 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -20,7 +20,6 @@ class Request:
     def __init__(
         self,
         request_id: str,
-        prompt: Optional[str],
         prompt_token_ids: list[int],
         multi_modal_inputs: Optional[list[MultiModalKwargs]],
         multi_modal_hashes: Optional[list[str]],
@@ -46,7 +45,6 @@ def __init__(
         assert sampling_params.max_tokens is not None
         self.max_tokens = sampling_params.max_tokens
 
-        self.prompt = prompt
         self.prompt_token_ids = prompt_token_ids
         self.num_prompt_tokens = len(self.prompt_token_ids)
         self._output_token_ids: list[int] = []
@@ -81,7 +79,6 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
         return cls(
             request_id=request.request_id,
-            prompt=request.prompt,
             prompt_token_ids=request.prompt_token_ids,
             multi_modal_inputs=request.mm_inputs,
             multi_modal_hashes=request.mm_hashes,
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index a64cb97e0123..c00424dfea73 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -24,7 +24,6 @@ class CachedRequestState:
 
     req_id: str
     prompt_token_ids: list[int]
-    prompt: Optional[str]
     mm_inputs: list[MultiModalKwargs]
     mm_positions: list[PlaceholderRange]
     sampling_params: SamplingParams
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7910481762ef..f1fa4f4e1967 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -348,7 +348,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
                 prompt_token_ids=new_req_data.prompt_token_ids,
-                prompt=new_req_data.prompt,
                 mm_inputs=new_req_data.mm_inputs,
                 mm_positions=new_req_data.mm_positions,
                 sampling_params=sampling_params,
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index e9cb0dbe8b5e..98b0ddcccb5d 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -356,7 +356,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool:
             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
                 prompt_token_ids=new_req_data.prompt_token_ids,
-                prompt=new_req_data.prompt,
                 mm_inputs=new_req_data.mm_inputs,
                 mm_positions=new_req_data.mm_positions,
                 sampling_params=sampling_params,