diff --git a/vllm/config.py b/vllm/config.py
index fbd61a332af61..7e0b75eceae5b 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -347,10 +347,10 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        if device_config.device_type != "cuda":
+        if device_config.device_type not in ("cuda", "tpu"):
             logger.warning(
-                "Async output processing is only supported for CUDA."
-                " Disabling it for other platforms.")
+                "Async output processing is only supported for CUDA or TPU. "
+                "Disabling it for other platforms.")
             self.use_async_output_proc = False
             return
 
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index ebb4b89cb4727..a0498315516b8 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -1,6 +1,7 @@
 import time
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple,
+                    Type, Union)
 from unittest.mock import patch
 
 import numpy as np
@@ -51,6 +52,7 @@ class ModelInputForTPU(ModelRunnerInputBase):
     best_of: List[int]
     seq_groups: List[List[int]]
     virtual_engine: int = 0
+    async_callback: Optional[Callable] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -562,6 +564,8 @@ def _execute_model(*args):
                     model_input.attn_metadata, model_input.input_lens[i:i + 1],
                     model_input.t[i:i + 1], model_input.p[i:i + 1],
                     model_input.num_samples, kv_caches)
+                if i == 0 and model_input.async_callback is not None:
+                    model_input.async_callback()
                 # Retrieve the outputs to CPU.
                 next_token_ids += output_token_ids.cpu().tolist()
                 start_idx = end_idx
@@ -572,6 +576,8 @@ def _execute_model(*args):
                 model_input.attn_metadata, model_input.input_lens,
                 model_input.t, model_input.p, model_input.num_samples,
                 kv_caches)
+            if model_input.async_callback is not None:
+                model_input.async_callback()
             # Retrieve the outputs to CPU.
             next_token_ids = output_token_ids.cpu().tolist()
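
For context on how the new `async_callback` hook is intended to be used, below is a minimal, self-contained sketch of the pattern, not vLLM's actual scheduler wiring: `FakeModelInput`, `execute_model`, and `process_previous_outputs` are hypothetical stand-ins. The runner fires the caller-supplied callback right after launching device work and before blocking on the output transfer, so the CPU can post-process the previous step's outputs while the TPU computes the current step.

```python
# Minimal sketch (assumed names, not vLLM's real classes): illustrates why the
# callback is invoked between launching device work and retrieving outputs.
from dataclasses import dataclass
from typing import Callable, List, Optional


@dataclass
class FakeModelInput:  # hypothetical stand-in for ModelInputForTPU
    token_ids: List[int]
    async_callback: Optional[Callable] = None


def execute_model(model_input: FakeModelInput) -> List[int]:
    # 1. Launch the (mock) device computation for this step.
    pending_output = [t + 1 for t in model_input.token_ids]

    # 2. Overlap: let the caller finish processing the previous step's
    #    outputs while the device is (conceptually) still busy.
    if model_input.async_callback is not None:
        model_input.async_callback()

    # 3. Only now block and retrieve the outputs, as the real runner does
    #    with output_token_ids.cpu().
    return pending_output


previous_outputs: List[int] = []


def process_previous_outputs() -> None:  # hypothetical callback
    if previous_outputs:
        print("post-processing previous step:", previous_outputs)


out = execute_model(
    FakeModelInput(token_ids=[1, 2, 3],
                   async_callback=process_previous_outputs))
previous_outputs.extend(out)
```

In the prefill loop of the patched `execute_model`, the callback is only fired on the first iteration (`i == 0`), which matches invoking it once per scheduler step rather than once per prefill.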