Commit b849709

Remove vision language config.
Replace it with a placeholder mm config.

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
1 parent 9831aec commit b849709

40 files changed: +243 -433 lines changed

docs/source/models/vlm.rst

Lines changed: 1 addition & 6 deletions
@@ -33,12 +33,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 
 .. code-block:: python
 
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        image_feature_size=576,
-    )
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
 
 .. important::
     Currently, you have to specify ``image_feature_size`` to support memory profiling.

examples/llava_example.py

Lines changed: 1 addition & 6 deletions
@@ -10,12 +10,7 @@
 
 
 def run_llava():
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        image_feature_size=576,
-    )
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
 
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 

examples/llava_next_example.py

Lines changed: 1 addition & 7 deletions
@@ -7,13 +7,7 @@
 
 
 def run_llava_next():
-    llm = LLM(
-        model="llava-hf/llava-v1.6-mistral-7b-hf",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        # Use the maximum possible value for memory profiling
-        image_feature_size=2928,
-    )
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
 
     prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
     url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"

examples/openai_vision_api_client.py

Lines changed: 0 additions & 3 deletions
@@ -3,9 +3,6 @@
 Launch the vLLM server with the following command:
 python -m vllm.entrypoints.openai.api_server \
     --model llava-hf/llava-1.5-7b-hf \
-    --image-token-id 32000 \
-    --image-input-shape 1,3,336,336 \
-    --image-feature-size 576 \
     --chat-template template_llava.jinja
 """
 import base64
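
For reference, the trimmed launch command can still be exercised end to end with a stock OpenAI client. The snippet below is a minimal sketch, not part of this commit, assuming the server listens on the default http://localhost:8000/v1 endpoint, accepts a dummy API key, and that the sample image URL from the LLaVA-NeXT example is reachable.

# Minimal client sketch (assumption, not part of this commit): query the
# vLLM OpenAI-compatible server started with the command above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
                },
            },
        ],
    }],
    max_tokens=64,
)
print(response.choices[0].message.content)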

examples/phi3v_example.py

Lines changed: 1 addition & 5 deletions
@@ -19,11 +19,7 @@ def run_phi3v():
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
-        image_token_id=32044,
-        image_input_shape="1,3,1008,1344",
-        # Use the maximum possible value for memory profiling
-        image_feature_size=2653,
-        max_num_seqs=5,
+        max_num_seqs=1,
     )
 
     image = Image.open("images/cherry_blossom.jpg")

tests/distributed/test_multimodal_broadcast.py

Lines changed: 3 additions & 3 deletions
@@ -20,9 +20,9 @@
 model = os.environ["TEST_DIST_MODEL"]
 
 if model.startswith("llava-hf/llava"):
-    from ..models.test_llava import model_and_vl_config, run_test
+    from ..models.test_llava import models, run_test
 elif model.startswith("microsoft/Phi-3-vision"):
-    from ..models.test_phi3v import model_and_vl_config, run_test
+    from ..models.test_phi3v import models, run_test
 else:
     raise NotImplementedError(f"Unsupported model: {model}")
 
@@ -44,7 +44,7 @@ def test_models(hf_runner, vllm_runner, image_assets,
         hf_runner,
         vllm_runner,
         image_assets,
-        model_and_config=model_and_vl_config[0],
+        model=models[0],
         size_factors=[1.0],
         dtype=dtype,
         max_tokens=max_tokens,

tests/models/test_llava.py

Lines changed: 17 additions & 43 deletions
@@ -3,7 +3,6 @@
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.config import VisionLanguageConfig
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
@@ -21,49 +20,27 @@
     "USER: <image>\nWhat's in this image?\nASSISTANT:",
 })
 
+IMAGE_TOKEN_ID = 32000
 
-def iter_llava_configs(model_name: str):
-    image_hw_to_feature_size = {
-        (336, 336): 576,
-    }
-
-    for (h, w), f in image_hw_to_feature_size.items():
-        input_shape = (1, 3, h, w)
-        yield (model_name,
-               VisionLanguageConfig(image_feature_size=f,
-                                    image_token_id=32000,
-                                    image_input_shape=input_shape))
-
-
-model_and_vl_config = [
-    *iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
-]
+models = ["llava-hf/llava-1.5-7b-hf"]
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                          Optional[SampleLogprobs]],
-                      vlm_config: VisionLanguageConfig, model_id: str):
-    """Sanitize vllm output to be comparable with hf output.
-    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
-    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
-    It also reduces `output_str` from "<image><image>bla" to "bla".
-    """
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
-    image_token_id = vlm_config.image_token_id
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id)
+    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id
 
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
-        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
+        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
    ]
 
-    hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, "")
-    assert hf_output_str[0] == " "
-    hf_output_str = hf_output_str[1:]
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
 
@@ -74,7 +51,7 @@ def run_test(
     hf_runner: Type[HfRunner],
     vllm_runner: Type[VllmRunner],
     image_assets: _ImageAssets,
-    model_and_config: Tuple[str, VisionLanguageConfig],
+    model,
     *,
     size_factors: List[float],
     dtype: str,
@@ -92,7 +69,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    model_id, vlm_config = model_and_config
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_image = [(
@@ -106,12 +82,11 @@ def run_test(
     # will hurt multiprocessing backend with fork method (the default method).
 
     # max_model_len should be greater than image_feature_size
-    with vllm_runner(model_id,
+    with vllm_runner(model,
                      dtype=dtype,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
+                     enforce_eager=True) as vllm_model:
         vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
@@ -120,7 +95,7 @@ def run_test(
             for prompts, images in inputs_per_image
         ]
 
-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
@@ -136,15 +111,15 @@ def run_test(
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                vllm_to_hf_output(vllm_output, model)
                 for vllm_output in vllm_outputs
             ],
             name_0="hf",
             name_1="vllm",
         )
 
 
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -161,14 +136,13 @@ def run_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
     run_test(
         hf_runner,
         vllm_runner,
         image_assets,
-        model_and_config,
+        model,
         size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
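
The rewritten vllm_to_hf_output keys the image-token handling on the module-level IMAGE_TOKEN_ID constant instead of the removed VisionLanguageConfig. A standalone sketch of the same collapsing rule follows; the non-image token ids are made up purely for illustration.

# Illustration of the filtering rule in vllm_to_hf_output: an image token is
# dropped only when it directly follows another image token, so a run of
# image tokens collapses to a single one. Ids other than 32000 are placeholders.
IMAGE_TOKEN_ID = 32000

output_ids = [1, 32000, 32000, 32000, 9047, 3974]
hf_output_ids = [
    token_id for idx, token_id in enumerate(output_ids)
    if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
]
assert hf_output_ids == [1, 32000, 9047, 3974]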

tests/models/test_llava_next.py

Lines changed: 13 additions & 41 deletions
@@ -4,7 +4,6 @@
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.config import VisionLanguageConfig
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
@@ -27,46 +26,22 @@
     f"{_PREFACE} USER: <image>\nWhat's in this image? ASSISTANT:",
 })
 
-
-def iter_llava_next_configs(model_name: str):
-    # Need to use the max possible feature size for profile_run
-    image_hw_to_feature_size = {
-        (336, 336): 2928,
-    }
-
-    for (h, w), f in image_hw_to_feature_size.items():
-        input_shape = (1, 3, h, w)
-        yield (model_name,
-               VisionLanguageConfig(
-                   image_feature_size=f,
-                   image_token_id=32000,
-                   image_input_shape=input_shape,
-               ))
-
-
-model_and_vl_config = [
-    *iter_llava_next_configs("llava-hf/llava-v1.6-vicuna-7b-hf"),
-]
+IMAGE_TOKEN_ID = 32000
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                          Optional[SampleLogprobs]],
-                      vlm_config: VisionLanguageConfig, model_id: str):
-    """Sanitize vllm output to be comparable with hf output.
-    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
-    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
-    It also reduces `output_str` from "<image><image>bla" to "bla".
-    """
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
-    image_token_id = vlm_config.image_token_id
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id)
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    image_token_str = tokenizer.decode(IMAGE_TOKEN_ID)
     eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
         token_id for idx, token_id in enumerate(output_ids)
-        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
+        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
     ]
 
     hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
@@ -78,7 +53,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
     return hf_output_ids, hf_output_str, out_logprobs
 
 
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-vicuna-7b-hf"])
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -95,9 +70,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -107,7 +81,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    model_id, vlm_config = model_and_config
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_image = [(
@@ -116,11 +89,10 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # max_model_len should be greater than image_feature_size
-    with vllm_runner(model_id,
+    with vllm_runner(model,
                      dtype=dtype,
                      max_model_len=4096,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
+                     enforce_eager=True) as vllm_model:
         vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
@@ -129,7 +101,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             for prompts, images in inputs_per_image
         ]
 
-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
@@ -145,7 +117,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                vllm_to_hf_output(vllm_output, model)
                 for vllm_output in vllm_outputs
             ],
             name_0="hf",
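
Unlike the LLaVA test, the LLaVA-NeXT variant strips repeated image tokens from the decoded string with a regex rather than by slicing. A small standalone sketch of that cleanup, using an invented output string purely for illustration:

# Illustration of the regex cleanup in the LLaVA-NeXT test: any run of the
# decoded image-token text is removed from the output string. The sample
# string is invented for illustration.
import re

image_token_str = "<image>"
output_str = "<image><image><image> The image shows Big Ben at dusk."
hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
assert hf_output_str == " The image shows Big Ben at dusk."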
