
Commit c8b671a

russellb authored and DarkLight1337 committed
[Frontend] Require flag for loading text and image embeds (vllm-project#27204)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Alberto Perdomo <aperdomo@redhat.com>
1 parent e40252b commit c8b671a

25 files changed (+203 −64 lines)

docs/features/multimodal_inputs.md

Lines changed: 25 additions & 4 deletions
@@ -359,13 +359,19 @@ Full example: [examples/offline_inference/audio_language.py](../../examples/offl
 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
 pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.

+You must enable this feature via `enable_mm_embeds=True`.
+
+!!! warning
+    The vLLM engine may crash if incorrect shape of embeddings is passed.
+    Only enable this flag for trusted users!
+
 ??? code

     ```python
     from vllm import LLM

     # Inference with image embeddings as input
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_mm_embeds=True)

     # Refer to the HuggingFace repo for the correct format to use
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

@@ -397,7 +403,11 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
     image_embeds = torch.load(...)

     # Qwen2-VL
-    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+    llm = LLM(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        limit_mm_per_prompt={"image": 4},
+        enable_mm_embeds=True,
+    )
     mm_data = {
         "image": {
             "image_embeds": image_embeds,

@@ -407,7 +417,12 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
     }

     # MiniCPM-V
-    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
+    llm = LLM(
+        "openbmb/MiniCPM-V-2_6",
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 4},
+        enable_mm_embeds=True,
+    )
     mm_data = {
         "image": {
             "image_embeds": image_embeds,

@@ -732,7 +747,13 @@ Full example: [examples/online_serving/openai_chat_completion_client_for_multimo
 ### Embedding Inputs

 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
-pass a tensor of shape to the corresponding field of the multi-modal dictionary.
+pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
+
+You must enable this feature via the `--enable-mm-embeds` flag in `vllm serve`.
+
+!!! warning
+    The vLLM engine may crash if incorrect shape of embeddings is passed.
+    Only enable this flag for trusted users!

 #### Image Embedding Inputs
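For context, here is a minimal sketch of the behavior this doc change describes when the flag is off. It mirrors the new `test_require_mm_embeds` test further down in this commit; the model name and the dummy tensor are illustrative only:

```python
import torch

from vllm import LLM

# With enable_mm_embeds=False (the default after this change), pre-computed
# embeddings are rejected with a ValueError instead of being forwarded to the model.
llm = LLM(model="llava-hf/llava-1.5-7b-hf", enforce_eager=True, enable_mm_embeds=False)

try:
    llm.generate(
        {
            "prompt": "<image>",
            "multi_modal_data": {"image": torch.empty(1, 1, 1)},
        }
    )
except ValueError as err:
    print(err)  # the message points the user at --enable-mm-embeds
```

Constructing the `LLM` with `enable_mm_embeds=True` accepts the same input, which is why the docs flag it as opt-in for trusted users only.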
docs/features/prompt_embeds.md

Lines changed: 5 additions & 1 deletion
@@ -20,12 +20,16 @@ You can pass prompt embeddings from Hugging Face Transformers models to the `'p

 ## Online Serving

-Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
+Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package and are enabled by the `--enable-prompt-embeds` flag in `vllm serve`.

 When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.

 Prompt embeddings are passed in as base64 encoded torch tensors.

+!!! warning
+    The vLLM engine may crash if incorrect shape of embeddings is passed.
+    Only enable this flag for trusted users!
+
 ### Transformers Inputs via OpenAI Client

 First, launch the OpenAI-compatible server:
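For context, here is a minimal client-side sketch of the base64 `prompt_embeds` flow this hunk describes. It assumes a server already running with `--enable-prompt-embeds`; the model name, embedding shape, and values are placeholders, and the request pattern mirrors the prompt-embeds tests in this commit:

```python
import base64
import io

import torch
from openai import OpenAI

# Placeholder pre-computed prompt embeddings of shape (seq_len, hidden_size);
# hidden_size must match the served model.
prompt_embeds = torch.randn(8, 4096, dtype=torch.float16)

# Serialize with torch.save and base64-encode, as the docs describe.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder model name
    prompt="Hello",  # a text prompt may be mixed with prompt_embeds
    max_tokens=5,
    temperature=0.0,
    extra_body={"prompt_embeds": encoded_embeds},
)
print(completion.choices[0].text)
```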

examples/offline_inference/prithvi_geospatial_mae.py

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ def __init__(self, model):
             dtype="float16",
             enforce_eager=True,
             model_impl="terratorch",
+            enable_mm_embeds=True,
         )

     def run(self, input_data, location_coords):

examples/offline_inference/prithvi_geospatial_mae_io_processor.py

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ def main():
         max_num_seqs=32,
         io_processor_plugin="prithvi_to_tiff",
         model_impl="terratorch",
+        enable_mm_embeds=True,
     )

     pooling_params = PoolingParams(task="token_classify", activation=False)

examples/online_serving/prithvi_geospatial_mae.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@
 # --task embed --trust-remote-code
 # --skip-tokenizer-init --enforce-eager
 # --io-processor-plugin prithvi_to_tiff
+# --enable-mm-embeds


 def main():

tests/entrypoints/llm/test_prompt_validation.py

Lines changed: 16 additions & 1 deletion
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
+import torch

 from vllm import LLM

@@ -12,8 +13,22 @@ def test_empty_prompt():
         llm.generate([""])


-@pytest.mark.skip_v1
 def test_out_of_vocab_token():
     llm = LLM(model="openai-community/gpt2", enforce_eager=True)
     with pytest.raises(ValueError, match="out of vocabulary"):
         llm.generate({"prompt_token_ids": [999999]})
+
+
+def test_require_mm_embeds():
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        enforce_eager=True,
+        enable_mm_embeds=False,
+    )
+    with pytest.raises(ValueError, match="--enable-mm-embeds"):
+        llm.generate(
+            {
+                "prompt": "<image>",
+                "multi_modal_data": {"image": torch.empty(1, 1, 1)},
+            }
+        )

tests/entrypoints/openai/test_completion_with_prompt_embeds.py

Lines changed: 13 additions & 0 deletions
@@ -292,3 +292,16 @@ async def test_prompt_logprobs_raises_error(
             temperature=0.0,
             extra_body={"prompt_embeds": encoded_embeds, "prompt_logprobs": True},
         )
+
+
+@pytest.mark.asyncio
+async def test_empty_prompt_embeds(
+    client_with_prompt_embeds: openai.AsyncOpenAI,
+) -> None:
+    await client_with_prompt_embeds.completions.create(
+        model=MODEL_NAME,
+        prompt="Hello",
+        max_tokens=5,
+        temperature=0.0,
+        extra_body={"prompt_embeds": []},
+    )

tests/entrypoints/openai/test_prompt_validation.py

Lines changed: 27 additions & 2 deletions
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import io
+from unittest.mock import Mock

 # imports for structured outputs tests
 import openai

@@ -10,7 +11,8 @@
 import regex as re
 import torch

-from vllm.entrypoints.renderer import BaseRenderer
+from vllm.config import ModelConfig
+from vllm.entrypoints.renderer import CompletionRenderer

 from ...utils import RemoteOpenAIServer


@@ -59,6 +61,10 @@ async def test_out_of_vocab_token_ids():
 def test_load_prompt_embeds(
     dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
 ):
+    model_config = Mock(spec=ModelConfig)
+    model_config.enable_prompt_embeds = True
+    renderer = CompletionRenderer(model_config, tokenizer=None)
+
     # construct arbitrary tensors of various dtypes, layouts, and sizes.
     # We need to check against different layouts to make sure that if a user
     # uses sparse tensors to reduce the transmission size of prompt embeddings,

@@ -83,11 +89,30 @@
     buffer.seek(0)
     encoded_tensor = pybase64.b64encode(buffer.getvalue())

-    loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
+    loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
     assert len(loaded_prompt_embeds) == 1
     loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
     assert loaded_tensor.device.type == "cpu"
     assert loaded_tensor.layout == torch.strided
     torch.testing.assert_close(
         loaded_tensor, tensor.to("cpu").to_dense(), equal_nan=True
     )
+
+
+@pytest.mark.parametrize("dtype", [torch.float32])
+@pytest.mark.parametrize("seq_len", [2])
+@pytest.mark.parametrize("hidden_size", [2])
+def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
+    model_config = Mock(spec=ModelConfig)
+    model_config.enable_prompt_embeds = False
+    renderer = CompletionRenderer(model_config, tokenizer=None)
+
+    tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
+
+    buffer = io.BytesIO()
+    torch.save(tensor, buffer)
+    buffer.seek(0)
+    encoded_tensor = pybase64.b64encode(buffer.getvalue())
+
+    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
+        renderer.load_prompt_embeds(encoded_tensor)

tests/entrypoints/openai/test_skip_tokenizer.py renamed to tests/entrypoints/openai/test_vision_embeds.py

Lines changed: 29 additions & 31 deletions
@@ -15,30 +15,7 @@
 DTYPE = "float16"


-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--runner",
-        "pooling",
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        DTYPE,
-        "--enforce-eager",
-        "--trust-remote-code",
-        "--skip-tokenizer-init",
-        "--max-num-seqs",
-        "32",
-        "--model-impl",
-        "terratorch",
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_request(server: RemoteOpenAIServer, model_name: str):
+def _terratorch_dummy_inputs(model_name: str):
     pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
     location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)


@@ -54,7 +31,7 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
     binary_data = buffer_coord.read()
     base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")

-    prompt = {
+    return {
         "model": model_name,
         "additional_data": {"prompt_token_ids": [1]},
         "encoding_format": "base64",

@@ -74,12 +51,33 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
         ],
     }

-    # test single pooling
-    response = requests.post(server.url_for("pooling"), json=prompt)
-    response.raise_for_status()

-    output = response.json()["data"][0]["data"]
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_request(model_name: str):
+    args = [
+        "--runner",
+        "pooling",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--max-num-seqs",
+        "32",
+        "--model-impl",
+        "terratorch",
+        "--skip-tokenizer-init",
+        "--enable-mm-embeds",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as server:
+        prompt = _terratorch_dummy_inputs(model_name)
+
+        # test single pooling
+        response = requests.post(server.url_for("pooling"), json=prompt)
+        response.raise_for_status()

-    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+        output = response.json()["data"][0]["data"]

-    assert len(np_response) == 524288
+        np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+        assert len(np_response) == 524288

tests/entrypoints/test_chat_utils.py

Lines changed: 17 additions & 4 deletions
@@ -73,6 +73,19 @@ def phi3v_model_config_mm_interleaved():
     )


+@pytest.fixture(scope="function")
+def phi3v_model_config_image_embeds():
+    return ModelConfig(
+        PHI3V_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+        enable_mm_embeds=True,
+    )
+
+
 @pytest.fixture(scope="module")
 def phi3v_tokenizer():
     return get_tokenizer(PHI3V_MODEL_ID)

@@ -799,7 +812,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(


 def test_parse_chat_messages_empty_image_embeds_with_uuid(
-    phi3v_model_config,
+    phi3v_model_config_image_embeds,
     phi3v_tokenizer,
 ):
     uuid = "abcd"

@@ -813,7 +826,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
                 ],
             }
         ],
-        phi3v_model_config,
+        phi3v_model_config_image_embeds,
         phi3v_tokenizer,
         content_format="string",
     )

@@ -832,7 +845,7 @@

 @pytest.mark.asyncio
 async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
-    phi3v_model_config,
+    phi3v_model_config_image_embeds,
     phi3v_tokenizer,
 ):
     uuid = "abcd"

@@ -846,7 +859,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
                 ],
             }
         ],
-        phi3v_model_config,
+        phi3v_model_config_image_embeds,
         phi3v_tokenizer,
         content_format="string",
     )

0 commit comments