vllm-project · xendo · Oct 21, 2024
diff --git a/vllm/config.py b/vllm/config.py
@@ -416,10 +416,10 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
 
         # Reminder: Please update docs/source/serving/compatibility_matrix.rst
         # If the feature combo become valid
-        if device_config.device_type not in ("cuda", "tpu", "xpu"):
+        if device_config.device_type not in ("cuda", "tpu", "xpu", "neuron"):
             logger.warning(
-                "Async output processing is only supported for CUDA, TPU, XPU. "
-                "Disabling it for other platforms.")
+                "Async output processing is only supported for CUDA, TPU, XPU,"
+                " Neuron. Disabling it for other platforms.")
             self.use_async_output_proc = False
             return
 

diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
@@ -1,7 +1,8 @@
 import os
 from dataclasses import dataclass
 from importlib.util import find_spec
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple,
+                    Union)
 
 import torch
 from torch import nn
@@ -35,6 +36,7 @@ class ModelInputForNeuron(ModelRunnerInputBase):
     input_block_ids: Optional[torch.Tensor] = None
     sampling_metadata: Optional["SamplingMetadata"] = None
     multi_modal_kwargs: Optional[BatchedTensorInputs] = None
+    async_callback: Optional[Callable] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -334,6 +336,9 @@ def execute_model(
         else:
             logits = hidden_states
 
+        if model_input.async_callback is not None:
+            model_input.async_callback()
+
         # Sample the next token.
         output = self.model.sample(
             logits=logits,