Merged
@@ -261,6 +261,8 @@ async def _prepare_llm_request(self, row: Dict[str, Any]) -> vLLMEngineRequest:
         else:
             tokenized_prompt = None

+        # Extract image data from preprocessing output
+        # Note: Field name is 'image' (singular) not 'images' (plural).
         if "image" in row:
             image = row.pop("image")
         else:
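
For readers skimming the diff, here is a minimal sketch of the extraction pattern the added comments describe. Only the row.pop("image") call and the singular field name come from the diff; the helper name and the normalization to a list are assumptions for illustration.

    from typing import Any, Dict, List, Optional


    def extract_image(row: Dict[str, Any]) -> Optional[List[Any]]:
        """Pop the 'image' field (singular) from a preprocessed row, if present."""
        if "image" not in row:
            return None
        image = row.pop("image")
        # The updated optional-input description in the hunk below says a single
        # image or a list of images is accepted, so normalize to a list here
        # (an assumption; the diff itself only shows the pop).
        return image if isinstance(image, list) else [image]
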
@@ -690,7 +692,7 @@ def get_optional_input_keys(self) -> Dict[str, str]:
         """The optional input keys of the stage and their descriptions."""
         return {
             "tokenized_prompt": "The tokenized prompt. If provided, the prompt will not be tokenized by the vLLM engine.",
-            "images": "The images to generate text from. If provided, the prompt will be a multimodal prompt.",
+            "image": "The image(s) for multimodal input. Accepts a single image or list of images.",
Contributor (review comment on this line): ok.
"model": "The model to use for this request. If the model is different from the "
"model set in the stage, then this is a LoRA request.",
}
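
As a usage illustration of these optional keys, a row passed to the stage might look like the following. The "prompt" key and the blank PIL images are assumptions for the example; only the "image" and "model" semantics come from this diff.

    from PIL import Image

    # A row with a single image under the singular "image" key.
    single_image_row = {
        "prompt": "Describe this picture.",
        "image": Image.new("RGB", (64, 64)),
    }

    # A row with a list of images, plus a per-row model that differs from the
    # stage model and is therefore treated as a LoRA request.
    multi_image_row = {
        "prompt": "Compare these pictures.",
        "image": [Image.new("RGB", (64, 64)) for _ in range(2)],
        "model": "my-lora-adapter",
    }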