[HARDWARE] enable async_engine for CPU backend #3992

Closed · wants to merge 1 commit
7 changes: 7 additions & 0 deletions vllm/engine/async_llm_engine.py
@@ -333,6 +333,13 @@ def from_engine_args(
         if engine_config.device_config.device_type == "neuron":
             raise NotImplementedError("Neuron is not supported for "
                                       "async engine yet.")
+        if engine_config.device_config.device_type == "cpu":
+            if (engine_config.parallel_config.worker_use_ray
+                    or engine_args.engine_use_ray):
+                logger.warning("CPU backend does not support ray yet")
+            else:
+                from vllm.executor.cpu_executor import CPUExecutorAsync
+                executor_class = CPUExecutorAsync
         elif (engine_config.parallel_config.worker_use_ray
               or engine_args.engine_use_ray):
             initialize_ray_cluster(engine_config.parallel_config)
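With this hunk, AsyncLLMEngine.from_engine_args selects CPUExecutorAsync when the device type is "cpu" and Ray is not requested (a Ray request only logs a warning). A minimal usage sketch, assuming the AsyncEngineArgs/AsyncLLMEngine API of this vLLM version and using facebook/opt-125m purely as an illustrative model:

import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

async def main() -> None:
    # device="cpu" makes from_engine_args pick CPUExecutorAsync with this patch.
    engine_args = AsyncEngineArgs(model="facebook/opt-125m", device="cpu")
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # Stream outputs for a single request through the async engine.
    async for request_output in engine.generate("Hello, my name is",
                                                SamplingParams(max_tokens=16),
                                                request_id="req-0"):
        print(request_output.outputs[0].text)

asyncio.run(main())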
27 changes: 25 additions & 2 deletions vllm/executor/cpu_executor.py
@@ -5,11 +5,12 @@
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig)
-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)

 logger = init_logger(__name__)
@@ -104,6 +105,28 @@ def check_health(self) -> None:
         return


+class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy)
+        return output
+
+    async def check_health_async(self) -> None:
+        # CPUExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
 def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
     if config.dtype == torch.float16:
         logger.warning("float16 is not supported on CPU, casting to bfloat16.")
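CPUExecutorAsync offloads the blocking driver_worker.execute_model call to a background thread via make_async, so a model step does not stall the asyncio event loop. A rough sketch of what such a helper typically does (the actual vllm.utils.make_async implementation may differ in details):

import asyncio
from functools import partial
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
    """Wrap a blocking callable so awaiting it does not block the event loop."""
    def _async_wrapper(*args, **kwargs) -> Awaitable[T]:
        loop = asyncio.get_event_loop()
        # Run the synchronous call in the default thread-pool executor.
        return loop.run_in_executor(None, partial(func, *args, **kwargs))
    return _async_wrapper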
3 changes: 3 additions & 0 deletions vllm/worker/worker.py
@@ -94,6 +94,9 @@ def init_device(self) -> None:
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             torch.cuda.empty_cache()
             self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+        elif self.device_config.device == "cpu":
+            self.rank = 0
+            self.device = torch.device("cpu")
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")

Review comment from a Contributor on the added `elif self.device_config.device == "cpu":` line: I feel this part should be added in cpu_worker.
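If the reviewer's suggestion were followed, this branch would move into the CPU worker's own init_device rather than the generic worker.py. A hypothetical sketch of that placement (class and method layout assumed to mirror the GPU worker, not taken from the merged code):

# Hypothetical placement in vllm/worker/cpu_worker.py (illustration only).
import torch

class CPUWorker:

    def init_device(self) -> None:
        # The CPU backend has no per-GPU rank or CUDA memory pool to set up.
        self.rank = 0
        self.device = torch.device("cpu")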