Commit 139d155

[Frontend] Use engine argument to control MM cache size (#22441)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent: 8c9da6b

File tree

13 files changed: +101, -47 lines


docs/configuration/conserving_memory.md

Lines changed: 1 addition & 1 deletion

@@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",

 If you run out of CPU RAM, try the following options:

-- (Multi-modal models only) you can set the size of multi-modal processor cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB per API process + 4 GiB per engine core process)
+- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process)
 - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).

 ## Multi-modal input limits
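To make the renamed knob concrete, here is a minimal offline-inference sketch (not part of the commit; the model name simply mirrors the docs examples changed below):

```python
from vllm import LLM

# Minimal sketch: shrink the multi-modal processor cache to conserve CPU RAM.
# The default is 4 GiB per API process plus 4 GiB per engine core process.
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    mm_processor_cache_gb=2,  # the engine argument introduced by this commit
)
```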

docs/configuration/optimization.md

Lines changed: 9 additions & 3 deletions

@@ -161,12 +161,18 @@ By default, the multi-modal processor cache is enabled to avoid repeatedly proce
 the same multi-modal inputs via Hugging Face `AutoProcessor`,
 which commonly occurs in multi-turn conversations.

-You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable
+You can adjust the size of the cache by setting the value of `mm_processor_cache_gb`
 (default 4 GiB per API process + 4 GiB per engine core process).
+If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`.

-If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`:
+Examples:

 ```python
+# Use a larger cache
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          disable_mm_preprocessor_cache=True)
+          mm_processor_cache_gb=8)
+
+# Disable the cache
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          mm_processor_cache_gb=0)
 ```
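The "per API process + per engine core process" wording above follows the formula added to the config docstrings later in this commit, `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. A tiny illustrative check, not part of the commit:

```python
def total_mm_cache_gb(mm_processor_cache_gb: int,
                      api_server_count: int,
                      data_parallel_size: int) -> int:
    """Illustrative only: total memory claimed by the duplicated processor
    caches, per the formula in the new config docstrings."""
    return mm_processor_cache_gb * (api_server_count + data_parallel_size)


# Default 4 GiB with one API process and one engine core process -> 8 GiB,
# matching the "4 GiB per API process + 4 GiB per engine core process" note.
assert total_mm_cache_gb(4, 1, 1) == 8
```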

examples/offline_inference/mistral-small.py

Lines changed: 3 additions & 3 deletions

@@ -68,7 +68,7 @@ def run_simple_demo(args: argparse.Namespace):
         max_model_len=4096,
         max_num_seqs=2,
         tensor_parallel_size=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        mm_processor_cache_gb=0 if args.disable_mm_processor_cache else 4,
     )

     prompt = "Describe this image in one sentence."
@@ -105,7 +105,7 @@ def run_advanced_demo(args: argparse.Namespace):
         limit_mm_per_prompt={"image": max_img_per_msg},
         max_model_len=max_img_per_msg * max_tokens_per_img,
         tensor_parallel_size=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        mm_processor_cache_gb=0 if args.disable_mm_processor_cache else 4,
     )

     prompt = "Describe the following image."
@@ -164,7 +164,7 @@ def parse_args():
     )

     parser.add_argument(
-        "--disable-mm-preprocessor-cache",
+        "--disable-mm-processor-cache",
         action="store_true",
         help="If True, disables caching of multi-modal processor.",
     )
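The example scripts keep their boolean CLI flag (now renamed) and translate it into the new integer engine argument, hard-coding the 4 GiB default. A self-contained sketch of that wiring, for illustration only:

```python
import argparse

# Standalone sketch of the flag wiring used in the example scripts above: the
# renamed --disable-mm-processor-cache flag is mapped onto the new integer
# engine argument, with 4 GiB hard-coded to match the engine default.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-mm-processor-cache",
    action="store_true",
    help="If True, disables caching of multi-modal processor.",
)

args = parser.parse_args(["--disable-mm-processor-cache"])
mm_processor_cache_gb = 0 if args.disable_mm_processor_cache else 4
assert mm_processor_cache_gb == 0
```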

examples/offline_inference/vision_language.py

Lines changed: 2 additions & 2 deletions

@@ -1563,7 +1563,7 @@ def parse_args():
     )

     parser.add_argument(
-        "--disable-mm-preprocessor-cache",
+        "--disable-mm-processor-cache",
         action="store_true",
         help="If True, disables caching of multi-modal processor.",
     )
@@ -1603,7 +1603,7 @@ def main(args):

     engine_args = asdict(req_data.engine_args) | {
         "seed": args.seed,
-        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
+        "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
     }
     llm = LLM(**engine_args)

tests/models/multimodal/generation/vlm_utils/core.py

Lines changed: 1 addition & 3 deletions

@@ -62,9 +62,7 @@ def run_test(
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).

-    vllm_runner_kwargs_: dict[str, Any] = {
-        "disable_mm_preprocessor_cache": True,
-    }
+    vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0}
     if model_info.tokenizer:
         vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
     if model_info.tokenizer_mode:

tests/models/multimodal/processing/test_llama4.py

Lines changed: 3 additions & 3 deletions

@@ -15,22 +15,22 @@
                          ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
 @pytest.mark.parametrize("mm_processor_kwargs", [{}])
 @pytest.mark.parametrize("num_imgs", [1, 5])
-@pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])
+@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
 @pytest.mark.parametrize("tokenized_prompt", [True, False])
 def test_processor_override(
     image_assets: ImageTestAssets,
     model_id: str,
     mm_processor_kwargs: dict,
     num_imgs: int,
-    disable_mm_preprocessor_cache: bool,
+    mm_processor_cache_gb: int,
     tokenized_prompt: bool,
 ):
     """Ensure llama4 processor works properly."""
     ctx = build_model_context(
         model_id,
         mm_processor_kwargs=mm_processor_kwargs,
         limit_mm_per_prompt={"image": num_imgs},
-        disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
+        mm_processor_cache_gb=mm_processor_cache_gb,
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
     config = processor.info.get_hf_config()

tests/models/utils.py

Lines changed: 2 additions & 2 deletions

@@ -261,7 +261,7 @@ def build_model_context(
     model_config_kwargs: Optional[dict[str, Any]] = None,
     mm_processor_kwargs: Optional[dict[str, Any]] = None,
     limit_mm_per_prompt: Optional[dict[str, int]] = None,
-    disable_mm_preprocessor_cache: bool = True,
+    mm_processor_cache_gb: int = 0,
 ):
     """Creates an InputContext for a given model.

@@ -291,7 +291,7 @@ def build_model_context(
         seed=0,
         mm_processor_kwargs=mm_processor_kwargs,
         limit_mm_per_prompt=limit_mm_per_prompt,
-        disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
+        mm_processor_cache_gb=mm_processor_cache_gb,
         hf_overrides=model_info.hf_overrides,
         **model_config_kwargs,
     )

vllm/config.py

Lines changed: 32 additions & 11 deletions

@@ -443,8 +443,15 @@ class ModelConfig:
     from `AutoProcessor.from_pretrained`. The available overrides depend on the
     model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`.
     """
-    disable_mm_preprocessor_cache: bool = False
-    """If `True`, disable caching of the multi-modal processor."""
+    mm_processor_cache_gb: int = 4
+    """The size (in GiB) of the multi-modal processor cache, which is used to
+    avoid re-processing past multi-modal inputs.
+
+    This cache is duplicated for each API process and engine core process,
+    resulting in a total memory usage of
+    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
+
+    Set to `0` to disable this cache completely (not recommended)."""
     override_neuron_config: dict[str, Any] = field(default_factory=dict)
     """Initialize non-default neuron config or override default neuron config
     that are specific to Neuron devices, this argument will be used to
@@ -881,17 +888,16 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
                 limit_per_prompt=self.limit_mm_per_prompt,
                 media_io_kwargs=self.media_io_kwargs,
                 mm_processor_kwargs=self.mm_processor_kwargs,
-                disable_mm_preprocessor_cache=self.
-                disable_mm_preprocessor_cache,
+                mm_processor_cache_gb=self.mm_processor_cache_gb,
                 interleave_mm_strings=self.interleave_mm_strings)

         return None

-    def set_disable_mm_preprocessor_cache(self, value: bool) -> None:
+    def set_mm_processor_cache_gb(self, value: int) -> None:
         mm_config = self.get_multimodal_config()

-        self.disable_mm_preprocessor_cache = value
-        mm_config.disable_mm_preprocessor_cache = value
+        self.mm_processor_cache_gb = value
+        mm_config.mm_processor_cache_gb = value

     def _get_encoder_config(self):
         return get_sentence_transformer_tokenizer_config(
@@ -1698,7 +1704,16 @@ def processor_return_mm_hashes(self) -> bool:
         if mm_config is None:
             return False

-        return not mm_config.disable_mm_preprocessor_cache
+        return mm_config.mm_processor_cache_gb > 0
+
+    @property
+    def enable_mm_processor_cache(self) -> bool:
+        """Whether the multi-modal processor cache should be enabled."""
+        mm_config = self.multimodal_config
+        if mm_config is None:
+            return False
+
+        return mm_config.mm_processor_cache_gb > 0

     @property
     def enable_mm_input_cache(self) -> bool:
@@ -1707,7 +1722,7 @@ def enable_mm_input_cache(self) -> bool:
         if mm_config is None:
             return False

-        return not mm_config.disable_mm_preprocessor_cache
+        return mm_config.mm_processor_cache_gb > 0

     def get_mm_input_cache_gb(self) -> int:
         mm_config = self.multimodal_config
@@ -3391,9 +3406,15 @@ class MultiModalConfig:
     `{"num_crops": 4}`.
     """

-    disable_mm_preprocessor_cache: bool = False
+    mm_processor_cache_gb: int = 4
     """
-    If `True`, disable caching of the multi-modal processor.
+    The size (in GiB) of the multi-modal processor cache, which is used to
+
+    This cache is duplicated for each API process and engine core process,
+    resulting in a total memory usage of
+    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
+
+    Set to `0` to disable this cache completely (not recommended).
     """

     interleave_mm_strings: bool = False
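Taken together, these hunks replace the boolean `disable_mm_preprocessor_cache` with a single size knob: the cache counts as enabled whenever `mm_processor_cache_gb > 0`, and `set_mm_processor_cache_gb` keeps `ModelConfig` and its nested `MultiModalConfig` in sync. A condensed standalone sketch of that behavior (not vLLM code):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class MultiModalConfigSketch:
    mm_processor_cache_gb: int = 4


@dataclass
class ModelConfigSketch:
    mm_processor_cache_gb: int = 4
    multimodal_config: Optional[MultiModalConfigSketch] = None

    def set_mm_processor_cache_gb(self, value: int) -> None:
        # Keep the top-level field and the nested multi-modal config in sync.
        self.mm_processor_cache_gb = value
        if self.multimodal_config is not None:
            self.multimodal_config.mm_processor_cache_gb = value

    @property
    def enable_mm_processor_cache(self) -> bool:
        # Enabled whenever a positive cache size is configured.
        mm_config = self.multimodal_config
        return mm_config is not None and mm_config.mm_processor_cache_gb > 0


cfg = ModelConfigSketch(multimodal_config=MultiModalConfigSketch())
assert cfg.enable_mm_processor_cache        # default 4 GiB -> enabled
cfg.set_mm_processor_cache_gb(0)
assert not cfg.enable_mm_processor_cache    # 0 GiB -> disabled
```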

vllm/engine/arg_utils.py

Lines changed: 27 additions & 7 deletions

@@ -358,8 +358,8 @@ class EngineArgs:
        "media_io_kwargs")
     mm_processor_kwargs: Optional[Dict[str, Any]] = \
         MultiModalConfig.mm_processor_kwargs
-    disable_mm_preprocessor_cache: bool = \
-        MultiModalConfig.disable_mm_preprocessor_cache
+    disable_mm_preprocessor_cache: bool = False  # DEPRECATED
+    mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
     # LoRA fields
     enable_lora: bool = False
     enable_lora_bias: bool = LoRAConfig.bias_enabled
@@ -720,8 +720,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--mm-processor-kwargs",
             **multimodal_kwargs["mm_processor_kwargs"])
         multimodal_group.add_argument(
-            "--disable-mm-preprocessor-cache",
-            **multimodal_kwargs["disable_mm_preprocessor_cache"])
+            "--mm-processor-cache-gb",
+            **multimodal_kwargs["mm_processor_cache_gb"])
+        multimodal_group.add_argument("--disable-mm-preprocessor-cache",
+                                      type=bool,
+                                      deprecated=True)
         multimodal_group.add_argument(
             "--interleave-mm-strings",
             **multimodal_kwargs["interleave_mm_strings"])
@@ -886,6 +889,23 @@ def create_model_config(self) -> ModelConfig:
             self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
             self.load_format = "runai_streamer"

+        if self.disable_mm_preprocessor_cache:
+            logger.warning(
+                "`--disable-mm-preprocessor-cache` is deprecated "
+                "and will be removed in v0.13. "
+                "Please use `--mm-processor-cache-gb 0` instead.", )
+
+            self.mm_processor_cache_gb = 0
+        elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
+            logger.warning(
+                "VLLM_MM_INPUT_CACHE_GIB` is deprecated "
+                "and will be removed in v0.13. "
+                "Please use `--mm-processor-cache-gb %d` instead.",
+                envs.VLLM_MM_INPUT_CACHE_GIB,
+            )
+
+            self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB
+
         return ModelConfig(
             model=self.model,
             hf_config_path=self.hf_config_path,
@@ -922,7 +942,7 @@ def create_model_config(self) -> ModelConfig:
             use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
-            disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
+            mm_processor_cache_gb=self.mm_processor_cache_gb,
             override_neuron_config=self.override_neuron_config,
             override_pooler_config=self.override_pooler_config,
             logits_processor_pattern=self.logits_processor_pattern,
@@ -1234,13 +1254,13 @@ def create_engine_config(
         dp_supports_mm_processor_cache = (self.data_parallel_size == 1
                                           or data_parallel_external_lb)
         if (not dp_supports_mm_processor_cache
-                and not model_config.disable_mm_preprocessor_cache):
+                and model_config.mm_processor_cache_gb > 0):
             logger.warning(
                 "Multi-modal processor cache is disabled because "
                 "it is not compatible with data parallelism when "
                 "there does not exist a one-to-one correspondance "
                 "between API and engine core processes.")
-            model_config.set_disable_mm_preprocessor_cache(True)
+            model_config.set_mm_processor_cache_gb(0)

         speculative_config = self.create_speculative_config(
             target_model_config=model_config,
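The deprecation handling in `create_model_config` gives the old inputs a migration path: the deprecated `--disable-mm-preprocessor-cache` flag forces the cache size to 0, otherwise a non-default `VLLM_MM_INPUT_CACHE_GIB` value is carried over, and only then does the new `--mm-processor-cache-gb` value (default 4) apply. A condensed restatement of that precedence (standalone sketch, not vLLM code):

```python
# Condensed restatement of the precedence implemented in create_model_config()
# above. Deprecated inputs still work during the transition window but are
# funneled into the new integer argument.
def resolve_mm_processor_cache_gb(
    mm_processor_cache_gb: int,           # new --mm-processor-cache-gb (default 4)
    disable_mm_preprocessor_cache: bool,  # deprecated boolean flag
    vllm_mm_input_cache_gib: int,         # deprecated env var (default 4)
) -> int:
    if disable_mm_preprocessor_cache:
        return 0
    if vllm_mm_input_cache_gib != 4:
        return vllm_mm_input_cache_gib
    return mm_processor_cache_gb


assert resolve_mm_processor_cache_gb(4, True, 4) == 0   # old flag wins
assert resolve_mm_processor_cache_gb(4, False, 8) == 8  # old env var carried over
assert resolve_mm_processor_cache_gb(6, False, 4) == 6  # new argument used as-is
```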

vllm/entrypoints/cli/serve.py

Lines changed: 3 additions & 4 deletions

@@ -138,13 +138,13 @@ def run_multi_api_server(args: argparse.Namespace):
     num_api_servers = args.api_server_count
     assert num_api_servers > 0

-    orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache
+    orig_mm_processor_cache_gb = args.mm_processor_cache_gb

     if num_api_servers > 1:
         setup_multiprocess_prometheus()

         # Not compatible with API server scale-out
-        args.disable_mm_preprocessor_cache = True
+        args.mm_processor_cache_gb = 0

     listen_address, sock = setup_server(args)

@@ -161,8 +161,7 @@
         raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used "
                          "with api_server_count > 1")

-    if model_config.is_multimodal_model and not (
-            orig_disable_mm_preprocessor_cache):
+    if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0:
         logger.warning("Multi-modal processor cache is disabled because "
                        "it is not compatible with `api_server_count > 1`.")
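In other words, API-server scale-out zeroes the cache size up front, and the warning fires only when the user had actually requested a non-zero cache for a multi-modal model. A standalone sketch of that guard (illustrative only, not vLLM code):

```python
# Standalone sketch of the scale-out guard above: launching more than one API
# server forces the cache off, and the warning applies only if the user had
# asked for a non-zero cache on a multi-modal model in the first place.
def apply_scale_out_guard(api_server_count: int,
                          requested_cache_gb: int,
                          is_multimodal_model: bool) -> tuple[int, bool]:
    effective_cache_gb = requested_cache_gb
    if api_server_count > 1:
        effective_cache_gb = 0  # not compatible with API server scale-out
    warn = (api_server_count > 1 and is_multimodal_model
            and requested_cache_gb > 0)
    return effective_cache_gb, warn


assert apply_scale_out_guard(4, 4, True) == (0, True)   # cache dropped, warn
assert apply_scale_out_guard(4, 0, True) == (0, False)  # user already disabled it
assert apply_scale_out_guard(1, 4, True) == (4, False)  # single API server keeps cache
```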
