@@ -177,6 +177,70 @@ Multi-image input can be extended to perform video captioning. We show this with
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
instead of using multi-image input.
179179
Instead of NumPy arrays, you can also pass `torch.Tensor` instances, as shown in this example using Qwen2.5-VL:
181+
182+ ??? code
183+
184+ ```python
# Offline video captioning with Qwen2.5-VL: the video referenced in the chat
# messages is loaded by `process_vision_info` and handed to vLLM through the
# "video" field of `multi_modal_data`.
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info

# Hugging Face repo ID. It must not end with a slash: "namespace/name/" is not
# a valid repo ID and only resolves if a local directory with that name exists.
model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
# NOTE(review): this looks like a placeholder URL -- point it at a video you
# can actually fetch before running the example.
video_path = "https://content.pexels.com/videos/free-videos.mp4"

llm = LLM(
    model=model_path,
    gpu_memory_utilization=0.8,
    enforce_eager=True,
    # This example sends exactly one video per prompt.
    limit_mm_per_prompt={"video": 1},
)

sampling_params = SamplingParams(max_tokens=1024)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "describe this video."},
            {
                "type": "video",
                "video": video_path,
                # Pixel budgets passed along with the video entry; presumably
                # read by qwen_vl_utils when resizing frames -- confirm against
                # the qwen-vl-utils documentation.
                "total_pixels": 20480 * 28 * 28,
                "min_pixels": 16 * 28 * 28,
            },
        ],
    },
]

# Render the chat messages into the text prompt expected by the model.
processor = AutoProcessor.from_pretrained(model_path)
prompt = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Only the video inputs are needed here; the image inputs are discarded.
_, video_inputs = process_vision_info(messages)
mm_data = {}
if video_inputs is not None:
    mm_data["video"] = video_inputs

llm_inputs = {
    "prompt": prompt,
    "multi_modal_data": mm_data,
}

outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
for output in outputs:
    print(output.outputs[0].text)
239+ ```
240+
!!! note
    `process_vision_info` comes from the `qwen_vl_utils` package and is only applicable to Qwen2.5-VL and similar models.
243+
Full example: <gh-file:examples/offline_inference/vision_language.py>
181245
182246### Audio Inputs
0 commit comments