
Commit 04eb88d

Re-submit: Fix: Proper RGBA -> RGB conversion for PIL images. (#18569)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
1 parent 46791e1 commit 04eb88d
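
Background for this change: PIL's Image.convert("RGB") drops the alpha channel outright, so fully transparent pixels keep whatever RGB values happen to sit underneath (frequently black), which distorts multimodal model inputs. The convert_image_mode helper imported below from vllm.multimodal.image is intended to composite RGBA images onto an opaque background before converting; its actual implementation is not among the hunks shown here. A minimal sketch of the idea, assuming a white background as implied by the new test_rgba_to_rgb test (the helper name below is only illustrative):

from PIL import Image


def convert_rgba_to_rgb_sketch(image: Image.Image, mode: str = "RGB") -> Image.Image:
    # Illustrative stand-in, not the vLLM implementation: instead of
    # Image.convert("RGB"), which simply discards alpha, composite the
    # RGBA image onto an opaque white canvas using its own alpha channel
    # as the mask, then drop the alpha.
    if image.mode == mode:
        return image
    if image.mode == "RGBA" and mode == "RGB":
        background = Image.new("RGBA", image.size, (255, 255, 255, 255))
        return Image.alpha_composite(background, image).convert(mode)
    return image.convert(mode)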

File tree

15 files changed (+89, -20 lines changed)

benchmarks/benchmark_dataset.py

Lines changed: 2 additions & 1 deletion
@@ -35,6 +35,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 
 logger = logging.getLogger(__name__)
@@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     if isinstance(image, dict) and "bytes" in image:
         image = Image.open(BytesIO(image["bytes"]))
     if isinstance(image, Image.Image):
-        image = image.convert("RGB")
+        image = convert_image_mode(image, "RGB")
     with io.BytesIO() as image_data:
         image.save(image_data, format="JPEG")
         image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")

examples/offline_inference/qwen2_5_omni/only_thinker.py

Lines changed: 3 additions & 1 deletion
@@ -11,6 +11,7 @@
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
+from vllm.multimodal.image import convert_image_mode
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -45,7 +46,8 @@ def get_mixed_modalities_query() -> QueryResult:
         "audio":
         AudioAsset("mary_had_lamb").audio_and_sample_rate,
         "image":
-        ImageAsset("cherry_blossom").pil_image.convert("RGB"),
+        convert_image_mode(
+            ImageAsset("cherry_blossom").pil_image, "RGB"),
         "video":
         VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
     },

examples/offline_inference/vision_language.py

Lines changed: 3 additions & 2 deletions
@@ -19,6 +19,7 @@
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.lora.request import LoRARequest
+from vllm.multimodal.image import convert_image_mode
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -1096,8 +1097,8 @@ def get_multi_modal_input(args):
     """
     if args.modality == "image":
         # Input image and question
-        image = ImageAsset("cherry_blossom") \
-            .pil_image.convert("RGB")
+        image = convert_image_mode(
+            ImageAsset("cherry_blossom").pil_image, "RGB")
         img_questions = [
             "What is the content of this image?",
             "Describe the content of this image in detail.",

tests/models/multimodal/generation/test_interleaved.py

Lines changed: 4 additions & 2 deletions
@@ -4,6 +4,7 @@
 
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
+from vllm.multimodal.image import convert_image_mode
 
 models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
 
@@ -26,8 +27,9 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
     give the same result.
     """
 
-    image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-    image_stop = ImageAsset("stop_sign").pil_image.convert("RGB")
+    image_cherry = convert_image_mode(
+        ImageAsset("cherry_blossom").pil_image, "RGB")
+    image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
     images = [image_cherry, image_stop]
     video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
 
tests/models/multimodal/generation/test_phi4mm.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.image import convert_image_mode, rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
 
@@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
 
     # use the example speech question so that the model outputs are reasonable
    audio = librosa.load(speech_question, sr=None)
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
 
     inputs_vision_speech = [
         (

tests/models/test_oot_registration.py

Lines changed: 2 additions & 1 deletion
@@ -4,6 +4,7 @@
 
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.multimodal.image import convert_image_mode
 
 from ..utils import create_new_process_for_each_test
 
@@ -58,7 +59,7 @@ def test_oot_registration_embedding(
         assert all(v == 0 for v in output.outputs.embedding)
 
 
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
 
 
 @create_new_process_for_each_test()

tests/multimodal/assets/rgba.png

219 KB

tests/multimodal/test_image.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+
+import numpy as np
+from PIL import Image, ImageChops
+
+from vllm.multimodal.image import convert_image_mode
+
+ASSETS_DIR = Path(__file__).parent / "assets"
+assert ASSETS_DIR.exists()
+
+
+def test_rgb_to_rgb():
+    # Start with an RGB image.
+    original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
+    converted_image = convert_image_mode(original_image, "RGB")
+
+    # RGB to RGB should be a no-op.
+    diff = ImageChops.difference(original_image, converted_image)
+    assert diff.getbbox() is None
+
+
+def test_rgba_to_rgb():
+    original_image = Image.open(ASSETS_DIR / "rgba.png")
+    original_image_numpy = np.array(original_image)
+
+    converted_image = convert_image_mode(original_image, "RGB")
+    converted_image_numpy = np.array(converted_image)
+
+    for i in range(original_image_numpy.shape[0]):
+        for j in range(original_image_numpy.shape[1]):
+            # Verify that all transparent pixels are converted to white.
+            if original_image_numpy[i][j][3] == 0:
+                assert converted_image_numpy[i][j][0] == 255
+                assert converted_image_numpy[i][j][1] == 255
+                assert converted_image_numpy[i][j][2] == 255
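
As an aside, the per-pixel loop in test_rgba_to_rgb can also be written as a single vectorized NumPy assertion. The snippet below is only an illustration (not part of the commit) and reuses the original_image_numpy and converted_image_numpy arrays defined in the test:

# Every pixel that is fully transparent in the RGBA original must be
# pure white in the converted RGB image.
transparent = original_image_numpy[..., 3] == 0
assert (converted_image_numpy[transparent] == 255).all()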

tests/multimodal/test_utils.py

Lines changed: 2 additions & 1 deletion
@@ -10,6 +10,7 @@
 import pytest
 from PIL import Image, ImageChops
 
+from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
                                    merge_and_sort_multimodal_metadata)
@@ -53,7 +54,7 @@ def get_supported_suffixes() -> tuple[str, ...]:
 
 
 def _image_equals(a: Image.Image, b: Image.Image) -> bool:
-    return (np.asarray(a) == np.asarray(b.convert(a.mode))).all()
+    return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all()
 
 
 @pytest.mark.asyncio

vllm/benchmarks/datasets.py

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,6 @@
 TODO: Implement CustomDataset to parse a JSON file and convert its contents into
 SampleRequest instances, similar to the approach used in ShareGPT.
 """
-
 import base64
 import io
 import json
@@ -33,6 +32,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 
 logger = logging.getLogger(__name__)
@@ -259,7 +259,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     if isinstance(image, dict) and 'bytes' in image:
         image = Image.open(BytesIO(image['bytes']))
     if isinstance(image, Image.Image):
-        image = image.convert("RGB")
+        image = convert_image_mode(image, "RGB")
     with io.BytesIO() as image_data:
         image.save(image_data, format="JPEG")
         image_base64 = base64.b64encode(
