@@ -289,6 +289,106 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
+    # NOTE: CAUTION! The original LLaVA models were not trained on multi-image
+    # inputs, so they will generate poor responses for multi-image prompts!
+    model_name = "llava-hf/llava-1.5-7b-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=16384,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 
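
Each loader above returns a ModelRequestData that the example script feeds to vLLM. As a minimal sketch of how one of the new loaders might be exercised end to end (this is not part of the diff; it assumes the file's existing ModelRequestData dataclass and fetch_image helper, and the question and image URLs are purely illustrative):

    # Hedged sketch: drive load_llava_onevision through vLLM's offline API.
    from dataclasses import asdict

    from vllm import LLM, SamplingParams

    image_urls = [
        "https://example.com/duck.jpg",  # hypothetical URLs
        "https://example.com/lion.jpg",
    ]
    req_data = load_llava_onevision("What do you see in each image?", image_urls)

    # EngineArgs is a dataclass, so it unpacks directly into the LLM constructor.
    llm = LLM(**asdict(req_data.engine_args))
    sampling_params = SamplingParams(temperature=0.0, max_tokens=256)

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
    )
    print(outputs[0].outputs[0].text)
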
@@ -737,6 +837,9 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
     "kimi_vl": load_kimi_vl,
     "llama4": load_llama4,
+    "llava": load_llava,
+    "llava-next": load_llava_next,
+    "llava-onevision": load_llava_onevision,
     "mistral3": load_mistral3,
     "mllama": load_mllama,
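
With the three entries registered, callers resolve a loader by key. A small sketch of the lookup, assuming model_example_map is the dict shown in the hunk above; get_multi_image_request is a hypothetical helper, not part of the diff:

    def get_multi_image_request(
        model_key: str, question: str, image_urls: list[str]
    ) -> ModelRequestData:
        # Resolve the loader from the registry; fail loudly on typos such as
        # "llava_next" vs. the hyphenated "llava-next" used as the key here.
        try:
            loader = model_example_map[model_key]
        except KeyError:
            raise ValueError(
                f"Unknown model key {model_key!r}; expected one of "
                f"{sorted(model_example_map)}"
            ) from None
        return loader(question, image_urls)
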