5 changes: 4 additions & 1 deletion vllm/v1/core/sched/scheduler.py
@@ -842,7 +842,6 @@ def update_from_output(
                         spec_token_ids[req_index])
                 else:
                     request.spec_token_ids = spec_token_ids[req_index]
-
             # Get prompt logprobs for this request.
             prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
             if new_token_ids or pooler_output is not None \
@@ -869,6 +868,10 @@ def update_from_output(
 
             if not stopped:
                 new_running.append(request)
+
+            if model_runner_output.finished_dumping is not None:
+                request.succeed_dumped_blocks.extend(model_runner_output.finished_dumping.get(req_id, []))
+
         self.running = new_running
 
         # KV Connector: update state for finished KV Transfers.
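Taken together with the ModelRunnerOutput and Request changes further down in this PR, the new lines above amount to per-step bookkeeping: the worker reports which KV blocks finished dumping for each request, and the scheduler folds that into each running request. A minimal, self-contained sketch of that flow (stand-in classes and made-up ids, not the PR's actual types):

```python
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class StepOutput:  # stand-in for ModelRunnerOutput
    finished_dumping: Optional[dict[str, list[str]]] = None


@dataclass
class Req:  # stand-in for vllm.v1.request.Request
    req_id: str
    succeed_dumped_blocks: list[str] = field(default_factory=list)


def update_from_output(running: list[Req], out: StepOutput) -> None:
    # Mirrors the new scheduler logic: extend each request's list with the
    # block ids the worker reports as successfully dumped this step.
    if out.finished_dumping is None:
        return
    for req in running:
        req.succeed_dumped_blocks.extend(
            out.finished_dumping.get(req.req_id, []))


reqs = [Req("req-0"), Req("req-1")]
update_from_output(reqs, StepOutput(finished_dumping={"req-0": ["blk-a", "blk-b"]}))
assert reqs[0].succeed_dumped_blocks == ["blk-a", "blk-b"]
assert reqs[1].succeed_dumped_blocks == []
```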
1 change: 1 addition & 0 deletions vllm/v1/outputs.py
@@ -107,6 +107,7 @@ class ModelRunnerOutput:
     # [req_ids]
     finished_sending: Optional[set[str]] = None
     finished_recving: Optional[set[str]] = None
+    finished_dumping: Optional[dict[str, list[str]]] = None
 
     # req_id -> num_nans_in_logits
     num_nans_in_logits: Optional[dict[str, int]] = None
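For reference, the new field is a per-step mapping from request id to the identifiers of KV blocks whose dump completed in that step; that reading follows from the field name and the scheduler-side use above, while the exact meaning of the strings is left to the connector. A small illustration with made-up values:

```python
# Illustrative only: the shape ModelRunnerOutput.finished_dumping carries for
# one engine step. The request ids and block ids below are made up.
finished_dumping: dict[str, list[str]] = {
    "cmpl-123": ["block-0009", "block-000a"],  # blocks whose dump completed
    "cmpl-456": [],                            # nothing new for this request
}

# The scheduler reads it defensively: a missing request id means "no blocks
# finished dumping for that request this step".
assert finished_dumping.get("cmpl-789", []) == []
```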
2 changes: 1 addition & 1 deletion vllm/v1/request.py
@@ -102,7 +102,7 @@ def __init__(
         # State
         # The number of tokens with prefix cache hits.
         self.num_cached_tokens = -1
-
+        self.succeed_dumped_blocks: list[str] = []
         # The number of NaNs in logits. A value greater than 0
         # indicates that the output is corrupted
         self.num_nans_in_logits = 0
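One nuance worth noting: finished_dumping is a per-step value on the worker output, while succeed_dumped_blocks lives on the Request and is extended every step, so it accumulates over the request's lifetime. A tiny illustration with made-up block ids:

```python
succeed_dumped_blocks: list[str] = []

succeed_dumped_blocks.extend(["block-01", "block-02"])  # step 1: two blocks dumped
succeed_dumped_blocks.extend([])                        # step 2: nothing new finished
succeed_dumped_blocks.extend(["block-03"])              # step 3: one more block

assert succeed_dumped_blocks == ["block-01", "block-02", "block-03"]
```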
7 changes: 4 additions & 3 deletions vllm/v1/worker/gpu_model_runner.py
@@ -1378,7 +1378,7 @@
                 inputs_embeds=inputs_embeds,
             )
 
-            self.maybe_wait_for_kv_save()
+            finished_dumping = self.maybe_wait_for_kv_save()
             finished_sending, finished_recving = (
                 self.get_finished_kv_transfers(scheduler_output))
 
@@ -1563,6 +1563,7 @@
             finished_sending=finished_sending,
             finished_recving=finished_recving,
             num_nans_in_logits=num_nans_in_logits,
+            finished_dumping=finished_dumping
         )
 
     def propose_draft_token_ids(
@@ -1718,10 +1719,10 @@
         # Do this here to save a collective_rpc.
         kv_connector.start_load_kv(get_forward_context())
 
     @staticmethod
-    def maybe_wait_for_kv_save() -> None:
+    def maybe_wait_for_kv_save() -> Optional[dict[str, list[str]]]:
         if has_kv_transfer_group():
-            get_kv_transfer_group().wait_for_save()
+            return get_kv_transfer_group().wait_for_save()
 
     @staticmethod
     def get_finished_kv_transfers(

GitHub Actions / pre-commit: Missing return statement [return] (check failure reported on line 1722 in vllm/v1/worker/gpu_model_runner.py).
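The pre-commit failure is mypy's [return] check: with the return type now Optional[dict[str, list[str]]], the path where has_kv_transfer_group() is false falls off the end of the function without a return statement. A minimal sketch of one way to satisfy the check (not necessarily the PR's eventual fix), assuming wait_for_save() now returns the per-request mapping of dumped block ids:

```python
from typing import Optional

from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                          has_kv_transfer_group)


# In gpu_model_runner.py this is a @staticmethod on the model-runner class;
# it is shown as a free function here only to keep the sketch self-contained.
def maybe_wait_for_kv_save() -> Optional[dict[str, list[str]]]:
    if has_kv_transfer_group():
        # Assumption: wait_for_save() returns the req_id -> dumped-block-ids
        # mapping, as the new call site in execute_model implies.
        return get_kv_transfer_group().wait_for_save()
    # Explicit fallback so every path returns a value under the new annotation.
    return None
```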