33import  dataclasses 
44import  os 
55import  time 
6- from  abc  import  ABC ,  abstractmethod 
6+ from  abc  import  abstractmethod 
77from  typing  import  Any , Dict , List , Optional , Set , Tuple , Type , Union 
88
99import  cloudpickle 
1919from  vllm .sequence  import  ExecuteModelRequest , IntermediateTensors 
2020from  vllm .utils  import  (enable_trace_function_call_for_thread ,
2121                        resolve_obj_by_qualname , run_method ,
22-                         update_environment_variables )
22+                         update_environment_variables ,
23+                         warn_for_unimplemented_methods )
2324from  vllm .worker .model_runner_base  import  (BroadcastableModelInput ,
2425                                           ModelRunnerBase ,
2526                                           ModelRunnerInputBase )
2627
2728logger  =  init_logger (__name__ )
2829
2930
30- class  WorkerBase (ABC ):
31+ @warn_for_unimplemented_methods  
32+ class  WorkerBase :
3133    """Worker interface that allows vLLM to cleanly separate implementations for 
3234    different hardware. Also abstracts control plane communication, e.g., to 
3335    communicate request metadata to other workers. 
@@ -53,35 +55,31 @@ def __init__(
5355        from  vllm .platforms  import  current_platform 
5456        self .current_platform  =  current_platform 
5557
56-     @abstractmethod  
5758    def  init_device (self ) ->  None :
5859        """Initialize device state, such as loading the model or other on-device 
5960        memory allocations. 
6061        """ 
6162        raise  NotImplementedError 
6263
63-     @abstractmethod  
64-     def  determine_num_available_blocks (self ) ->  Tuple [int , int ]:
65-         """Determine the number of available blocks for the GPU KV cache and 
66-         swappable CPU KV cache. 
67- 
68-         The implementation may run profiling or other heuristics to determine 
69-         the size of caches. 
70- 
71-         Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks 
72-         are blocks that are "active" on the device and can be appended to. 
73-         num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be 
74-         appended to. 
75-         """ 
76-         raise  NotImplementedError 
77- 
78-     @abstractmethod  
7964    def  initialize_cache (self , num_gpu_blocks : int ,
8065                         num_cpu_blocks : int ) ->  None :
8166        """Initialize the KV cache with the given size in blocks. 
8267        """ 
8368        raise  NotImplementedError 
8469
70+     def  get_model (self ) ->  nn .Module :
71+         raise  NotImplementedError 
72+ 
73+     def  load_model (self ) ->  None :
74+         """Load model onto target device.""" 
75+         raise  NotImplementedError 
76+ 
77+     def  execute_model (
78+         self ,
79+         execute_model_req : Optional [ExecuteModelRequest ] =  None 
80+     ) ->  Optional [List [SamplerOutput ]]:
81+         raise  NotImplementedError 
82+ 
8583    def  start_worker_execution_loop (self ) ->  None :
8684        """Execute model loop in parallel worker. 
8785
@@ -94,40 +92,43 @@ def start_worker_execution_loop(self) -> None:
9492                if  output  is  None :
9593                    return  None 
9694
97-     @ abstractmethod 
98-     def   get_model ( self )  ->   nn . Module : 
99-         raise   NotImplementedError 
95+     def   determine_num_available_blocks ( self )  ->   Tuple [ int ,  int ]: 
96+          """Determine the number of available blocks for the GPU KV cache and 
97+         swappable CPU KV cache.  
10098
101-     @abstractmethod  
102-     def  execute_model (
103-         self ,
104-         execute_model_req : Optional [ExecuteModelRequest ] =  None 
105-     ) ->  Optional [List [SamplerOutput ]]:
99+         The implementation may run profiling or other heuristics to determine 
100+         the size of caches. 
101+ 
102+         Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks 
103+         are blocks that are "active" on the device and can be appended to. 
104+         num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be 
105+         appended to. 
106+         """ 
106107        raise  NotImplementedError 
107108
108-     @abstractmethod  
109109    def  get_cache_block_size_bytes (self ) ->  int :
110110        """Return the size of a single cache block, in bytes. Used in 
111111        speculative decoding. 
112112        """ 
113113        raise  NotImplementedError 
114114
115-     @abstractmethod  
116115    def  add_lora (self , lora_request : LoRARequest ) ->  bool :
117116        raise  NotImplementedError 
118117
119-     @abstractmethod  
120118    def  remove_lora (self , lora_id : int ) ->  bool :
121119        raise  NotImplementedError 
122120
123-     @abstractmethod  
124121    def  pin_lora (self , lora_id : int ) ->  bool :
125122        raise  NotImplementedError 
126123
127-     @abstractmethod  
128124    def  list_loras (self ) ->  Set [int ]:
129125        raise  NotImplementedError 
130126
127+     @property  
128+     def  vocab_size (self ) ->  int :
129+         """Get vocabulary size from model configuration.""" 
130+         return  self .model_config .get_vocab_size ()
131+ 
131132
132133class  DelegateWorkerBase (WorkerBase ):
133134    """ 
@@ -156,6 +157,10 @@ def initialize_cache(self, num_gpu_blocks: int,
156157                         num_cpu_blocks : int ) ->  None :
157158        self .worker .initialize_cache (num_gpu_blocks , num_cpu_blocks )
158159
160+     def  load_model (self ) ->  None :
161+         """Load model onto target device.""" 
162+         self .worker .load_model ()
163+ 
159164    def  get_model (self ) ->  nn .Module :
160165        return  self .worker .get_model ()
161166
0 commit comments