
Commit 2f56cad

DarkLight1337 authored and LeiWang1999 committed
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (vllm-project#7126)
Signed-off-by: LeiWang1999 <leiwang1999@outlook.com>
1 parent e69185d commit 2f56cad

38 files changed: +573 -217 lines

docs/source/dev/input_processing/input_processing_pipeline.rst

Lines changed: 1 addition & 1 deletion
@@ -17,4 +17,4 @@ Input Processing Pipeline

 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.

-   - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model.
+   - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model.
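For context on that step: an input mapper turns raw multi-modal data into the keyword arguments a model forward pass consumes. A deliberately simplified sketch of the idea, using a hypothetical map_image_input helper (vLLM's real mappers delegate to the model's HF image processor; the 336-pixel resolution is illustrative):

import numpy as np
from PIL import Image

def map_image_input(image: Image.Image, size: int = 336) -> dict:
    """Resize, scale to [0, 1], and reorder to CHW with a batch axis."""
    image = image.convert("RGB").resize((size, size))
    pixels = np.asarray(image, dtype=np.float32) / 255.0  # HWC in [0, 1]
    pixels = pixels.transpose(2, 0, 1)                    # HWC -> CHW
    return {"pixel_values": pixels[None]}                 # add batch dim

kwargs = map_image_input(Image.new("RGB", (640, 480)))
print(kwargs["pixel_values"].shape)  # (1, 3, 336, 336)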

docs/source/dev/multimodal/multimodal_index.rst

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ by following :ref:`this guide <adding_multimodal_plugin>`.

 Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.

+..
+  TODO: Add usage of --limit-mm-per-prompt when multi-image input is officially supported
+
 Guides
 ++++++
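Until that documentation lands, the shape of the eventual usage can be sketched. The limit_mm_per_prompt engine argument comes from this commit; the model id below is a placeholder:

from vllm import LLM

# Allow up to two images in a single prompt, instead of the assumed
# default of one item per modality. Placeholder model id.
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    limit_mm_per_prompt={"image": 2},
)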

docs/source/models/enabling_multimodal_inputs.rst

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ A default mapper is available for each modality in the core vLLM library. This i
 3. Register maximum number of multi-modal tokens
 ------------------------------------------------

-For each modality type that the model accepts as input, calculate the maximum possible number of tokens
+For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data instance
 and register it via :meth:`MULTIMODAL_REGISTRY.register_max_multimodal_tokens <vllm.multimodal.MultiModalRegistry.register_max_multimodal_tokens>`.

 .. code-block:: diff
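The code-block itself is not reproduced in this hunk. As a rough sketch of what such a registration might look like for a hypothetical image model (the decorator names and signatures are assumptions based on the registries referenced above, and 576 stands in for a model-specific count, e.g. a 24x24 patch grid):

import torch.nn as nn

from vllm.model_executor.models.interfaces import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY


@MULTIMODAL_REGISTRY.register_image_input_mapper()
@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("image", 576)
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
    ...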

tests/engine/test_arg_utils.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+import pytest
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+@pytest.mark.parametrize(("arg", "expected"), [
+    (None, None),
+    ("image=16", {
+        "image": 16
+    }),
+    ("image=16,video=2", {
+        "image": 16,
+        "video": 2
+    }),
+])
+def test_limit_mm_per_prompt_parser(arg, expected):
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    if arg is None:
+        args = parser.parse_args([])
+    else:
+        args = parser.parse_args(["--limit-mm-per-prompt", arg])
+
+    assert args.limit_mm_per_prompt == expected
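The parametrized cases pin down the flag's grammar: comma-separated modality=count pairs. A standalone sketch of that parsing rule, for readers who want the behavior without digging into vLLM's CLI plumbing (illustrative, not the engine's actual code path):

def parse_limit_mm_per_prompt(value: str) -> dict:
    """Parse 'image=16,video=2' into {'image': 16, 'video': 2}."""
    limits = {}
    for pair in value.split(","):
        modality, _, count = pair.partition("=")
        limits[modality.strip()] = int(count)
    return limits


assert parse_limit_mm_per_prompt("image=16") == {"image": 16}
assert parse_limit_mm_per_prompt("image=16,video=2") == {"image": 16, "video": 2}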

tests/models/test_blip2.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    MultiModalConfig as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """

tests/models/test_fuyu.py

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ def run_test(
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """

tests/models/test_internvl.py

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ def run_test(
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """

tests/models/test_llava.py

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ def run_test(
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """

tests/models/test_llava_next.py

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """

tests/models/test_minicpmv.py

Lines changed: 3 additions & 2 deletions
@@ -61,7 +61,7 @@ def run_test(
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
@@ -176,7 +176,7 @@ def run_multi_image_test(
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
     For vllm runner, we provide MultiModalDataDict objects
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
@@ -197,6 +197,7 @@ def run_multi_image_test(
     with vllm_runner(model,
                      max_model_len=4096,
                      max_num_seqs=1,
+                     limit_mm_per_prompt={"image": len(images)},
                      dtype=dtype,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
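The last hunk is the heart of the change for multi-image tests: the runner must declare how many images a single prompt may carry. Outside the test harness, the equivalent offline call might look like this sketch (model id, image paths, and prompt text are placeholders; multi-image input was still experimental at this commit):

from PIL import Image

from vllm import LLM

# A prompt carrying two images must be paired with
# limit_mm_per_prompt={"image": 2}; otherwise the engine enforces its
# assumed default of one item per modality per prompt.
llm = LLM(
    model="openbmb/MiniCPM-Llama3-V-2_5",
    max_model_len=4096,
    max_num_seqs=1,
    limit_mm_per_prompt={"image": 2},
)

images = [Image.open("first.jpg"), Image.open("second.jpg")]
outputs = llm.generate({
    # Real prompts need the model's own image placeholder tokens.
    "prompt": "Compare the two images.",
    "multi_modal_data": {"image": images},
})
print(outputs[0].outputs[0].text)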
