@@ -303,10 +303,13 @@ def __init__(
         self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
                                                  dtype=torch.int32)
         self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
-        self.inputs_embeds = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
-            device=self.device)
+        # Because inputs_embeds may be bfloat16 and we don't need a numpy
+        # version of this tensor, avoid a RuntimeError by not creating a
+        # numpy buffer.
+        self.inputs_embeds = self._make_buffer(self.max_num_tokens,
+                                               self.hidden_size,
+                                               dtype=self.dtype,
+                                               numpy=False)
 
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
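
The comment in the hunk above captures the motivation: numpy has no bfloat16 dtype, so materializing a numpy view of a bfloat16 tensor fails. A minimal standalone repro (an illustration only; the exact exception type can vary across PyTorch releases, hence catching both):

import torch

# bfloat16 cannot be represented by numpy, so .numpy() raises.
t = torch.zeros(4, dtype=torch.bfloat16)
try:
    t.numpy()
except (TypeError, RuntimeError) as exc:
    print(f"bfloat16 -> numpy failed: {exc}")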
@@ -374,11 +377,18 @@ def __init__(
374377 device = "cpu" ,
375378 pin_memory = self .pin_memory )
376379
377- def _make_buffer (self , * args , dtype : torch .dtype ) -> CpuGpuBuffer :
378- return CpuGpuBuffer (* args ,
380+ def _make_buffer (self ,
381+ * size : Union [int , torch .SymInt ],
382+ dtype : torch .dtype ,
383+ numpy : bool = True ) -> CpuGpuBuffer :
384+ # Bfloat16 torch tensors cannot be directly cast to a numpy array, so
385+ # if a bfloat16 buffer is needed without a corresponding numpy array,
386+ # don't bother instantiating the numpy array.
387+ return CpuGpuBuffer (* size ,
379388 dtype = dtype ,
380389 device = self .device ,
381- pin_memory = self .pin_memory )
390+ pin_memory = self .pin_memory ,
391+ with_numpy = numpy )
382392
383393 def _init_model_kwargs (self , num_tokens : int ):
384394 model_kwargs = dict [str , Any ]()
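
For context, a rough sketch of what a CpuGpuBuffer-style helper with a with_numpy switch might look like. This is an assumption-laden illustration only: _CpuGpuBufferSketch is a hypothetical stand-in, and the real vllm CpuGpuBuffer may differ in fields and construction.

import torch


class _CpuGpuBufferSketch:
    """Hypothetical approximation of vLLM's CpuGpuBuffer, for illustration."""

    def __init__(self, *size: int, dtype: torch.dtype, device: str,
                 pin_memory: bool = False, with_numpy: bool = True) -> None:
        # Pinned CPU staging tensor plus a device copy.
        self.cpu = torch.zeros(*size, dtype=dtype, pin_memory=pin_memory)
        self.gpu = self.cpu.to(device)
        # Skip the numpy view for dtypes (e.g. bfloat16) that numpy
        # cannot represent.
        self.np = self.cpu.numpy() if with_numpy else None


# Hypothetical usage mirroring the patch: no numpy view for bfloat16.
buf = _CpuGpuBufferSketch(8, 16, dtype=torch.bfloat16,
                          device="cpu", with_numpy=False)
print(buf.gpu.shape, buf.np)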
@@ -1645,11 +1655,11 @@ def execute_model(
             )
 
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_scheduled_tokens].copy_(
+            self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(
                 inputs_embeds_scheduled)
 
             input_ids = None
-            inputs_embeds = self.inputs_embeds[:num_input_tokens]
+            inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
             model_kwargs = {
                 **self._init_model_kwargs(num_scheduled_tokens),
                 **self._extract_mm_kwargs(scheduler_output),
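
Because inputs_embeds is now a buffer object rather than a bare tensor, call sites index its .gpu view. A self-contained sketch of the new access pattern, with SimpleNamespace standing in for the buffer and made-up sizes:

from types import SimpleNamespace

import torch

hidden_size = 16
# Stand-in for self.inputs_embeds after the patch: an object exposing a
# .gpu tensor (kept on CPU here for portability).
inputs_embeds_buf = SimpleNamespace(
    gpu=torch.zeros(32, hidden_size, dtype=torch.bfloat16))

num_scheduled_tokens, num_input_tokens = 4, 8
inputs_embeds_scheduled = torch.randn(num_scheduled_tokens, hidden_size,
                                      dtype=torch.bfloat16)

# New access pattern from the diff: slice the .gpu view, then copy in.
inputs_embeds_buf.gpu[:num_scheduled_tokens].copy_(inputs_embeds_scheduled)
inputs_embeds = inputs_embeds_buf.gpu[:num_input_tokens]
print(inputs_embeds.shape)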
@@ -2484,7 +2494,7 @@ def _dummy_run(
                num_scheduled_tokens, remove_lora):
             if self.supports_mm_inputs:
                 input_ids = None
-                inputs_embeds = self.inputs_embeds[:num_tokens]
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
                 model_kwargs = {
                     **self._init_model_kwargs(num_tokens),
                     **self._dummy_mm_kwargs(num_reqs),