[HARDWARE] enable async_engine for CPU backend #3992

Closed · wants to merge 1 commit
7 changes: 7 additions & 0 deletions vllm/engine/async_llm_engine.py
@@ -333,6 +333,13 @@ def from_engine_args(
         if engine_config.device_config.device_type == "neuron":
             raise NotImplementedError("Neuron is not supported for "
                                       "async engine yet.")
+        if engine_config.device_config.device_type == "cpu":
+            if (engine_config.parallel_config.worker_use_ray
+                    or engine_args.engine_use_ray):
+                logger.warning("CPU backend does not support ray yet")
+            else:
+                from vllm.executor.cpu_executor import CPUExecutorAsync
+                executor_class = CPUExecutorAsync
         elif (engine_config.parallel_config.worker_use_ray
               or engine_args.engine_use_ray):
             initialize_ray_cluster(engine_config.parallel_config)
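With this hunk, AsyncLLMEngine.from_engine_args selects CPUExecutorAsync when the device type is "cpu" and Ray is not requested (a Ray request only logs a warning). A minimal usage sketch, assuming the AsyncEngineArgs/AsyncLLMEngine API of this vLLM version and using facebook/opt-125m purely as an illustrative model:

import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

async def main() -> None:
    # device="cpu" makes from_engine_args pick CPUExecutorAsync with this patch.
    engine_args = AsyncEngineArgs(model="facebook/opt-125m", device="cpu")
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # Stream outputs for a single request through the async engine.
    async for request_output in engine.generate("Hello, my name is",
                                                SamplingParams(max_tokens=16),
                                                request_id="req-0"):
        print(request_output.outputs[0].text)

asyncio.run(main())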
27 changes: 25 additions & 2 deletions vllm/executor/cpu_executor.py
@@ -5,11 +5,12 @@
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig)
-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)

 logger = init_logger(__name__)
@@ -104,6 +105,28 @@ def check_health(self) -> None:
         return


+class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list,
+            blocks_to_swap_in=blocks_to_swap_in,
+            blocks_to_swap_out=blocks_to_swap_out,
+            blocks_to_copy=blocks_to_copy)
+        return output
+
+    async def check_health_async(self) -> None:
+        # CPUExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
 def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
     if config.dtype == torch.float16:
         logger.warning("float16 is not supported on CPU, casting to bfloat16.")
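CPUExecutorAsync offloads the blocking driver_worker.execute_model call to a background thread via make_async, so a model step does not stall the asyncio event loop. A rough sketch of what such a helper typically does (the actual vllm.utils.make_async implementation may differ in details):

import asyncio
from functools import partial
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
    """Wrap a blocking callable so awaiting it does not block the event loop."""
    def _async_wrapper(*args, **kwargs) -> Awaitable[T]:
        loop = asyncio.get_event_loop()
        # Run the synchronous call in the default thread-pool executor.
        return loop.run_in_executor(None, partial(func, *args, **kwargs))
    return _async_wrapper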
3 changes: 3 additions & 0 deletions vllm/worker/worker.py
@@ -94,6 +94,9 @@ def init_device(self) -> None:
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             torch.cuda.empty_cache()
             self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+        elif self.device_config.device == "cpu":
+            self.rank = 0
+            self.device = torch.device("cpu")
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")

Review comment from a Contributor on the added `elif self.device_config.device == "cpu":` line: I feel this part should be added in cpu_worker.
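If the reviewer's suggestion were followed, this branch would move into the CPU worker's own init_device rather than the generic worker.py. A hypothetical sketch of that placement (class and method layout assumed to mirror the GPU worker, not taken from the merged code):

# Hypothetical placement in vllm/worker/cpu_worker.py (illustration only).
import torch

class CPUWorker:

    def init_device(self) -> None:
        # The CPU backend has no per-GPU rank or CUDA memory pool to set up.
        self.rank = 0
        self.device = torch.device("cpu")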