4 changes: 2 additions & 2 deletions tests/tokenization/test_detokenize.py
@@ -60,8 +60,8 @@ def _run_incremental_decode(tokenizer,
skip_special_tokens=skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
)
request = EngineCoreRequest("", "", prompt_token_ids, None, None, None,
params, None, 0.0, None)
request = EngineCoreRequest("", prompt_token_ids, None, None, None, params,
None, 0.0, None)

if fast is None:
detokenizer = IncrementalDetokenizer.from_new_request(
1 change: 0 additions & 1 deletion tests/v1/core/test_kv_cache_utils.py
@@ -37,7 +37,6 @@ def make_request(request_id,

return Request(
request_id=request_id,
prompt=None,
prompt_token_ids=prompt_token_ids,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
1 change: 0 additions & 1 deletion tests/v1/core/test_prefix_caching.py
@@ -29,7 +29,6 @@ def make_request(request_id,

return Request(
request_id=request_id,
prompt=None,
prompt_token_ids=prompt_token_ids,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
1 change: 0 additions & 1 deletion tests/v1/core/test_scheduler.py
@@ -132,7 +132,6 @@ def create_requests(num_requests: int,
mm_inputs = None
request = Request(
request_id=f"{i}",
prompt=None,
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_inputs=mm_inputs,
3 changes: 1 addition & 2 deletions tests/v1/engine/test_engine_core.py
@@ -31,8 +31,7 @@

def make_request() -> EngineCoreRequest:
return EngineCoreRequest(
request_id=uuid.uuid4(),
prompt=PROMPT,
request_id=str(uuid.uuid4()),
prompt_token_ids=PROMPT_TOKENS,
mm_inputs=None,
mm_hashes=None,
1 change: 0 additions & 1 deletion tests/v1/engine/test_engine_core_client.py
@@ -35,7 +35,6 @@
def make_request(params: SamplingParams) -> EngineCoreRequest:
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
prompt=PROMPT,
prompt_token_ids=PROMPT_TOKENS,
mm_inputs=None,
mm_hashes=None,
42 changes: 16 additions & 26 deletions tests/v1/engine/test_output_processor.py
@@ -50,7 +50,6 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
# Make N requests.
requests = [
EngineCoreRequest(request_id=f"request-{idx}",
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -64,14 +63,13 @@
output_kind=request_output_kind,
stop=[],
include_stop_str_in_output=False,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
))
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]

# Add requests to the detokenizer.
for request in requests:
output_processor.add_request(request)
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)

gen_strings = {}
gen_tokens = {}
@@ -398,7 +396,6 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
]
requests = [
EngineCoreRequest(request_id=request_id_list[idx],
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -414,14 +411,13 @@
include_stop_str_in_output=False,
logprobs=num_sample_logprobs,
prompt_logprobs=num_prompt_logprobs,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
))
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]

# Add requests to the detokenizer.
for request in requests:
output_processor.add_request(request)
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)

gen_tokens = {}
gen_logprobs = {}
@@ -562,7 +558,6 @@ def test_stop_token(include_stop_str_in_output: bool,
request_id = "request-0"
request = EngineCoreRequest(
request_id=request_id,
prompt=prompt_string,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -583,7 +578,7 @@
))

# Add request to the detokenizer.
output_processor.add_request(request)
output_processor.add_request(request, prompt_string)

# Loop over engine core steps; run output processor
gen_string = ""
@@ -659,7 +654,6 @@ def test_stop_string(include_stop_str_in_output: bool,
requests = [
EngineCoreRequest(
request_id=request_id_list[idx],
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -675,14 +669,13 @@
include_stop_str_in_output=include_stop_str_in_output,
logprobs=num_sample_logprobs,
prompt_logprobs=None,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
))
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]

# Add requests to the detokenizer.
for request in requests:
output_processor.add_request(request)
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)

gen_strings = {}
gen_tokens = {}
@@ -774,7 +767,6 @@ def test_iteration_stats(dummy_test_vectors):
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -783,15 +775,13 @@
eos_token_id=None,
lora_request=None,
sampling_params=SamplingParams(),
) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]

# Add all requests except one to the OutputProcessor.
num_active = len(dummy_test_vectors.generation_tokens) - 1
for request in requests[:num_active]:
output_processor.add_request(request)
output_processor.add_request(request, None)
inactive_request = requests[num_active]

# First iteration has 2 prefills.
@@ -817,7 +807,7 @@ def test_iteration_stats(dummy_test_vectors):
assert iteration_stats.num_generation_tokens == num_active

# Add a new request - prefill and 2 decodes in this step.
output_processor.add_request(inactive_request)
output_processor.add_request(inactive_request, None)
num_active += 1
outputs = engine_core.get_outputs()[:num_active]
iteration_stats = IterationStats()
1 change: 0 additions & 1 deletion tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -77,7 +77,6 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
NewRequestData(
req_id=req_id,
prompt_token_ids=[1, 2, 3],
prompt="test",
mm_inputs=[],
mm_hashes=[],
mm_positions=[],
1 change: 0 additions & 1 deletion tests/v1/worker/test_gpu_input_batch.py
@@ -195,7 +195,6 @@ def _construct_cached_request_state(req_id_suffix: int):
return CachedRequestState(
req_id=f"req_id_{req_id_suffix}",
prompt_token_ids=prompt_token_ids,
prompt=None,
sampling_params=_create_sampling_params(),
mm_inputs=[],
mm_positions=[],
1 change: 0 additions & 1 deletion tests/v1/worker/test_gpu_model_runner.py
@@ -50,7 +50,6 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
NewRequestData(
req_id=req_id,
prompt_token_ids=[1, 2, 3],
prompt="test",
mm_inputs=[],
mm_hashes=[],
mm_positions=[],
2 changes: 0 additions & 2 deletions vllm/v1/core/sched/output.py
@@ -22,7 +22,6 @@ class NewRequestData:

req_id: str
prompt_token_ids: list[int]
prompt: Optional[str]
mm_inputs: list[MultiModalKwargs]
mm_hashes: list[str]
mm_positions: list[PlaceholderRange]
@@ -40,7 +39,6 @@ def from_request(
return cls(
req_id=request.request_id,
prompt_token_ids=request.prompt_token_ids,
prompt=request.prompt,
mm_inputs=request.mm_inputs,
mm_hashes=request.mm_hashes,
mm_positions=request.mm_positions,
3 changes: 0 additions & 3 deletions vllm/v1/engine/__init__.py
@@ -49,9 +49,6 @@ class EngineCoreRequest(
# due to circular imports and typing we have in data.py

request_id: str
# NOTE(ywang96): original text prompt is needed when a request is added to
# Detokenizer, but set to None when it is added to EngineCoreClient.
prompt: Optional[str]
prompt_token_ids: list[int]
mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]]
mm_hashes: Optional[list[str]]
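With `prompt` dropped from `EngineCoreRequest`, a request is now built from token ids only and the prompt string is kept alongside it by the caller. A minimal sketch of the new construction, mirroring the positional form used in the `tests/tokenization/test_detokenize.py` hunk above (the three `None`s are the multi-modal fields; any ordering beyond what the diff shows is an assumption):

```python
from vllm import SamplingParams
from vllm.v1.engine import EngineCoreRequest

params = SamplingParams()
prompt_str = "Hello, my name is"   # kept on the side for the detokenizer
prompt_token_ids = [1, 2, 3]       # assumed pre-tokenized prompt

# No prompt text on the request itself: only token ids cross the process
# boundary to the core engine.
request = EngineCoreRequest("request-0", prompt_token_ids, None, None, None,
                            params, None, 0.0, None)
```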
17 changes: 9 additions & 8 deletions vllm/v1/engine/async_llm.py
@@ -203,14 +203,12 @@ async def add_request(
queue = RequestOutputCollector(output_kind=params.output_kind)

# Convert Input --> Request.
request = self.processor.process_inputs(request_id, prompt, params,
arrival_time, lora_request,
trace_headers,
prompt_adapter_request,
priority)
prompt_str, request = self.processor.process_inputs(
request_id, prompt, params, arrival_time, lora_request,
trace_headers, prompt_adapter_request, priority)

if params.n == 1:
await self._add_request(request, None, 0, queue)
await self._add_request(request, prompt_str, None, 0, queue)
return queue

# Fan out child requests (for n>1).
@@ -220,15 +218,18 @@
child_request = request if idx == params.n - 1 else copy(request)
child_request.request_id = request_id
child_request.sampling_params = params
await self._add_request(child_request, parent_request, idx, queue)
await self._add_request(child_request, prompt_str, parent_request,
idx, queue)
return queue

async def _add_request(self, request: EngineCoreRequest,
prompt: Optional[str],
parent_req: Optional[ParentRequest], index: int,
queue: RequestOutputCollector):

# Add the request to OutputProcessor (this process).
self.output_processor.add_request(request, parent_req, index, queue)
self.output_processor.add_request(request, prompt, parent_req, index,
queue)

# Add the EngineCoreRequest to EngineCore (separate process).
await self.engine_core.add_request_async(request)
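Putting the async_llm.py hunks together, the `n == 1` path now looks roughly like the sketch below, with the collaborators passed in explicitly so the snippet stands alone; argument values other than those visible in the diff are placeholders. The design point is that the prompt string only reaches the OutputProcessor, which needs it for detokenization, while the EngineCore works purely on token ids, which is why the `request.prompt = None` scrubbing is removed from core_client.py below.

```python
from vllm import SamplingParams


async def add_request_sketch(processor, output_processor, engine_core,
                             request_id: str, prompt: str,
                             params: SamplingParams, queue) -> None:
    # The Processor now returns the prompt text next to the EngineCoreRequest.
    prompt_str, request = processor.process_inputs(
        request_id, prompt, params, None, None, None, None, 0)

    # Detokenization needs the original text, so it travels with the request
    # to the OutputProcessor (same process)...
    output_processor.add_request(request, prompt_str, None, 0, queue)

    # ...while the core engine (separate process) receives token ids only.
    await engine_core.add_request_async(request)
```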
9 changes: 0 additions & 9 deletions vllm/v1/engine/core_client.py
@@ -559,9 +559,6 @@ def call_utility(self, method: str, *args) -> Any:
return future.result()

def add_request(self, request: EngineCoreRequest) -> None:
# NOTE: text prompt is not needed in the core engine as it has been
# tokenized.
request.prompt = None
self._send_input(EngineCoreRequestType.ADD, request)

def abort_requests(self, request_ids: list[str]) -> None:
@@ -729,9 +726,6 @@ async def _call_utility_async(self, method: str, *args,
return await future

async def add_request_async(self, request: EngineCoreRequest) -> None:
# NOTE: text prompt is not needed in the core engine as it has been
# tokenized.
request.prompt = None
await self._send_input(EngineCoreRequestType.ADD, request)
self._ensure_output_queue_task()

@@ -824,9 +818,6 @@ async def call_utility_async(self, method: str, *args) -> Any:
]))[0]

async def add_request_async(self, request: EngineCoreRequest) -> None:
# NOTE: text prompt is not needed in the core engine as it has been
# tokenized.
request.prompt = None
request.current_wave = self.current_wave

chosen_engine = self.get_core_engine_for_request()
13 changes: 6 additions & 7 deletions vllm/v1/engine/llm_engine.py
@@ -180,17 +180,15 @@ def add_request(
priority: int = 0,
) -> None:
# Process raw inputs into the request.
request = self.processor.process_inputs(request_id, prompt, params,
arrival_time, lora_request,
trace_headers,
prompt_adapter_request,
priority)
prompt_str, request = self.processor.process_inputs(
request_id, prompt, params, arrival_time, lora_request,
trace_headers, prompt_adapter_request, priority)

n = params.n if isinstance(params, SamplingParams) else 1

if n == 1:
# Make a new RequestState and queue.
self.output_processor.add_request(request, None, 0)
self.output_processor.add_request(request, prompt_str, None, 0)
# Add the request to EngineCore.
self.engine_core.add_request(request)
return
@@ -204,7 +202,8 @@ def add_request(
child_request.sampling_params = params

# Make a new RequestState and queue.
self.output_processor.add_request(child_request, parent_req, idx)
self.output_processor.add_request(child_request, prompt_str,
parent_req, idx)
# Add the request to EngineCore.
self.engine_core.add_request(child_request)

5 changes: 4 additions & 1 deletion vllm/v1/engine/output_processor.py
@@ -109,6 +109,7 @@ def from_new_request(
cls,
tokenizer: AnyTokenizer,
request: EngineCoreRequest,
prompt: Optional[str],
parent_req: Optional[ParentRequest],
request_index: int,
queue: Optional[RequestOutputCollector],
@@ -123,7 +124,7 @@
lora_name=(request.lora_request.name
if request.lora_request is not None else None),
output_kind=request.sampling_params.output_kind,
prompt=request.prompt,
prompt=prompt,
prompt_token_ids=request.prompt_token_ids,
logprobs_processor=LogprobsProcessor.from_new_request(
tokenizer=tokenizer,
@@ -267,6 +268,7 @@ def abort_requests(
def add_request(
self,
request: EngineCoreRequest,
prompt: Optional[str],
parent_req: Optional[ParentRequest] = None,
request_index: int = 0,
queue: Optional[RequestOutputCollector] = None,
@@ -278,6 +280,7 @@
req_state = RequestState.from_new_request(
tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
request=request,
prompt=prompt,
parent_req=parent_req,
request_index=request_index,
queue=queue,
5 changes: 2 additions & 3 deletions vllm/v1/engine/processor.py
@@ -202,7 +202,7 @@ def process_inputs(
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> EngineCoreRequest:
) -> tuple[Optional[str], EngineCoreRequest]:

# TODO(woosuk): Support pooling models.
# TODO(woosuk): Support encoder-decoder models.
@@ -306,9 +306,8 @@ def process_inputs(
else:
sorted_mm_inputs = orig_sorted_mm_inputs

return EngineCoreRequest(
return decoder_inputs.get("prompt"), EngineCoreRequest(
request_id=request_id,
prompt=decoder_inputs.get("prompt"),
prompt_token_ids=decoder_inputs["prompt_token_ids"],
mm_inputs=sorted_mm_inputs,
mm_hashes=sorted_mm_hashes,