@@ -376,6 +376,63 @@ def __call__(self, text: str, images: Union[Image, list[Image]],
376376 return hf_model
377377
378378
def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Adapt an ``HfRunner`` in place so it can drive SkyworkR1V.

    The runner is mutated and then returned. Four things are patched:

    * ``hf_model.model.img_context_token_id`` is set to the tokenizer id of
      the ``"<IMG_CONTEXT>"`` token.
    * ``hf_model.processor`` is replaced by a ``SkyworkR1VProcessor`` built
      from the runner.
    * ``hf_model.model.get_output_embeddings`` is redirected to the wrapped
      ``language_model``.
    * ``hf_model.model.generate`` is rebound to ``_internvl_generate``.

    Args:
        hf_model: The runner to patch.

    Returns:
        The same ``hf_model`` object, after patching.
    """

    class SkyworkR1VProcessor:
        """Minimal prompt/image processor for SkyworkR1V."""

        def __init__(self, hf_runner: HfRunner):
            """Capture the tokenizer and image settings from ``hf_runner``."""
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer

            # The image tiling parameters are read from the model's config,
            # loaded by name with remote code enabled.
            loaded_config = AutoConfig.from_pretrained(
                hf_runner.model_name,
                trust_remote_code=True,
            )
            self.config = loaded_config
            self.vision_config = loaded_config.vision_config
            self.use_thumbnail = loaded_config.use_thumbnail
            self.min_num = loaded_config.min_dynamic_patch
            self.max_num = loaded_config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

        def __call__(self, text: str, images: Union[Image, list[Image]],
                     **kwargs):
            """Expand image placeholders in ``text`` and tokenize it.

            Args:
                text: Prompt containing ``<image>`` placeholders.
                images: A single image or a list of images.
                **kwargs: Accepted but not used.

            Returns:
                The tokenizer output for the expanded prompt, updated with a
                ``"pixel_values"`` entry holding all images' pixel tensors
                concatenated along dimension 0.
            """
            # Imported lazily, inside the call, as in the original code.
            from vllm.model_executor.models.skyworkr1v import (
                IMG_CONTEXT, IMG_END, IMG_START,
                image_to_pixel_values_skyworkr1v)

            # Normalise a lone image into a one-element list.
            if isinstance(images, Image):
                images = [images]

            per_image_pixels = [
                image_to_pixel_values_skyworkr1v(
                    img,
                    input_size=self.image_size,
                    min_num=self.min_num,
                    max_num=self.max_num,
                    use_thumbnail=self.use_thumbnail,
                ) for img in images
            ]
            # The first dimension of each per-image tensor is taken as that
            # image's patch count.
            patch_counts = [pixels.shape[0] for pixels in per_image_pixels]
            pixel_values = torch.cat(per_image_pixels, dim=0)

            # Replace the placeholders one at a time, left to right, so the
            # i-th '<image>' receives the token span sized for the i-th image.
            for patch_count in patch_counts:
                context_span = (IMG_CONTEXT * self.num_image_token *
                                patch_count)
                replacement = IMG_START + context_span + IMG_END
                text = text.replace('<image>', replacement, 1)

            encoded = self.tokenizer(text, return_tensors="pt")
            encoded.update({"pixel_values": pixel_values})
            return encoded

    def _language_model_output_embeddings():
        # Looked up at call time through hf_model, like the original lambda.
        return hf_model.model.language_model.get_output_embeddings()

    context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
        "<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = context_token_id
    hf_model.processor = SkyworkR1VProcessor(hf_model)
    hf_model.model.get_output_embeddings = _language_model_output_embeddings
    hf_model.model.generate = types.MethodType(_internvl_generate,
                                               hf_model.model)
    return hf_model
434+
435+
379436def internvl_patch_hf_runner (hf_model : HfRunner ) -> HfRunner :
380437 """Patches and returns an instance of the HfRunner to use for InternVL."""
381438
0 commit comments