@@ -3,7 +3,7 @@
 import dataclasses
 import os
 import time
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
 
 import cloudpickle
@@ -19,15 +19,17 @@
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import (enable_trace_function_call_for_thread,
                         resolve_obj_by_qualname, run_method,
-                        update_environment_variables)
+                        update_environment_variables,
+                        warn_for_unimplemented_methods)
 from vllm.worker.model_runner_base import (BroadcastableModelInput,
                                            ModelRunnerBase,
                                            ModelRunnerInputBase)
 
 logger = init_logger(__name__)
 
 
-class WorkerBase(ABC):
+@warn_for_unimplemented_methods
+class WorkerBase:
     """Worker interface that allows vLLM to cleanly separate implementations for
     different hardware. Also abstracts control plane communication, e.g., to
     communicate request metadata to other workers.
@@ -53,35 +55,31 @@ def __init__(
         from vllm.platforms import current_platform
         self.current_platform = current_platform
 
-    @abstractmethod
     def init_device(self) -> None:
         """Initialize device state, such as loading the model or other on-device
         memory allocations.
         """
         raise NotImplementedError
 
-    @abstractmethod
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available blocks for the GPU KV cache and
-        swappable CPU KV cache.
-
-        The implementation may run profiling or other heuristics to determine
-        the size of caches.
-
-        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
-        are blocks that are "active" on the device and can be appended to.
-        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
-        appended to.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
     def initialize_cache(self, num_gpu_blocks: int,
                          num_cpu_blocks: int) -> None:
         """Initialize the KV cache with the given size in blocks.
         """
         raise NotImplementedError
 
+    def get_model(self) -> nn.Module:
+        raise NotImplementedError
+
+    def load_model(self) -> None:
+        """Load model onto target device."""
+        raise NotImplementedError
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> Optional[List[SamplerOutput]]:
+        raise NotImplementedError
+
     def start_worker_execution_loop(self) -> None:
         """Execute model loop in parallel worker.
 
@@ -94,40 +92,43 @@ def start_worker_execution_loop(self) -> None:
             if output is None:
                 return None
 
-    @abstractmethod
-    def get_model(self) -> nn.Module:
-        raise NotImplementedError
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available blocks for the GPU KV cache and
+        swappable CPU KV cache.
 
-    @abstractmethod
-    def execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> Optional[List[SamplerOutput]]:
+        The implementation may run profiling or other heuristics to determine
+        the size of caches.
+
+        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+        are blocks that are "active" on the device and can be appended to.
+        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
+        appended to.
+        """
         raise NotImplementedError
 
-    @abstractmethod
     def get_cache_block_size_bytes(self) -> int:
         """Return the size of a single cache block, in bytes. Used in
         speculative decoding.
         """
         raise NotImplementedError
 
-    @abstractmethod
     def add_lora(self, lora_request: LoRARequest) -> bool:
         raise NotImplementedError
 
-    @abstractmethod
     def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError
 
-    @abstractmethod
     def pin_lora(self, lora_id: int) -> bool:
         raise NotImplementedError
 
-    @abstractmethod
     def list_loras(self) -> Set[int]:
         raise NotImplementedError
 
+    @property
+    def vocab_size(self) -> int:
+        """Get vocabulary size from model configuration."""
+        return self.model_config.get_vocab_size()
+
 
 class DelegateWorkerBase(WorkerBase):
     """
@@ -156,6 +157,10 @@ def initialize_cache(self, num_gpu_blocks: int,
                          num_cpu_blocks: int) -> None:
         self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
 
+    def load_model(self) -> None:
+        """Load model onto target device."""
+        self.worker.load_model()
+
     def get_model(self) -> nn.Module:
         return self.worker.get_model()
 
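Note on the base-class change above: `WorkerBase` stops being an `ABC` with `@abstractmethod` stubs and is instead wrapped in `warn_for_unimplemented_methods`, imported from `vllm.utils`. That decorator's implementation is not part of this diff, so the following is only a minimal sketch of how such a decorator could behave, assuming it logs a warning at construction time for methods a concrete worker leaves as `NotImplementedError` stubs instead of refusing to instantiate the class the way `ABC` does. Apart from the decorator's name, every detail below is an assumption for illustration.

```python
# Hypothetical sketch; the real warn_for_unimplemented_methods lives in
# vllm.utils and may differ.
import inspect
import logging
from typing import TypeVar

logger = logging.getLogger(__name__)

_C = TypeVar("_C", bound=type)


def warn_for_unimplemented_methods(cls: _C) -> _C:
    """Warn (instead of raising) when a subclass leaves base methods unimplemented."""
    # Methods defined directly on the decorated base class whose body is just
    # a NotImplementedError stub.
    stub_names = {
        name for name, fn in vars(cls).items()
        if inspect.isfunction(fn)
        and "raise NotImplementedError" in inspect.getsource(fn)
    }

    original_init = cls.__init__

    def wrapped_init(self, *args, **kwargs):
        original_init(self, *args, **kwargs)
        for name in stub_names:
            # Still the base-class stub -> the subclass never overrode it.
            if getattr(type(self), name) is vars(cls)[name]:
                logger.warning("%s does not implement %s",
                               type(self).__name__, name)

    cls.__init__ = wrapped_init  # check at construction time, never hard-fail
    return cls
```

Under an `ABC`, a worker that skipped, say, `pin_lora` could not be instantiated at all; with a warning-based decorator it can still be constructed and used on code paths that never call the missing method, which is consistent with the diff removing every `@abstractmethod` marker.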