From 395605b53676a7d5bc3c09e8fd1d51c9dc5ab77f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 7 Jan 2025 09:10:54 +0800 Subject: [PATCH] [Bugfix] Fix LLaVA-NeXT feature size precision error (for real) (#11772) Signed-off-by: DarkLight1337 --- .../processing/test_llava_next.py | 3 +- .../processing/test_llava_onevision.py | 3 +- vllm/model_executor/models/llava_next.py | 39 ++++++++------- vllm/model_executor/models/llava_onevision.py | 47 ++++++++++--------- 4 files changed, 47 insertions(+), 45 deletions(-) diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py index 6c8d300717de4..37a6d334ee60c 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py @@ -17,7 +17,8 @@ def processor_for_llava_next(): @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183), (198, 176), (176, 198)]) + (488, 183), (198, 176), (176, 198), + (161, 184), (184, 161)]) @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_prompt_replacements( processor_for_llava_next, diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py index 71adde6568a17..ed3e2db799be7 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py @@ -18,7 +18,8 @@ def processor_for_llava_onevision(): @pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183), (198, 176), (176, 198)]) + (488, 183), (198, 176), (176, 198), + (161, 184), (184, 161)]) @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_prompt_replacements( processor_for_llava_onevision, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c76ec164a3087..258352416d4a7 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -121,30 +121,29 @@ def _get_num_unpadded_features( num_patch_height: int, num_patch_width: int, ) -> tuple[int, int]: - current_height = npatches * num_patch_height - current_width = npatches * num_patch_width - # NOTE: Use float32 to remain consistent with HF output - original_aspect_ratio = np.array(original_width / original_height, - dtype=np.float32) - current_aspect_ratio = np.array(current_width / current_height, - dtype=np.float32) + current_height_f = np.float32(npatches * num_patch_height) + current_width_f = np.float32(npatches * num_patch_width) + + original_width_f = np.float32(original_width) + original_height_f = np.float32(original_height) + + original_aspect_ratio = original_width_f / original_height_f + current_aspect_ratio = current_width_f / current_height_f if original_aspect_ratio > current_aspect_ratio: - scale_factor = np.array(current_width / original_width, - dtype=np.float32) - new_height = int(original_height * scale_factor) - padding = (current_height - new_height) // 2 - current_height -= 2 * padding + scale_factor = current_width_f / original_width_f + new_height = int(original_height_f * scale_factor) + padding = (current_height_f - new_height) // 2 + current_height_f -= 2 * padding else: - scale_factor = np.array(current_height / original_height, - dtype=np.float32) - new_width = int(original_width * scale_factor) - padding = (current_width - new_width) // 2 - current_width -= 2 * padding - - unpadded_features = current_height * current_width - newline_features = current_height + scale_factor = current_height_f / original_height_f + new_width = int(original_width_f * scale_factor) + padding = (current_width_f - new_width) // 2 + current_width_f -= 2 * padding + + unpadded_features = int(current_height_f * current_width_f) + newline_features = int(current_height_f) return (unpadded_features, newline_features) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 9117824995021..62dae74e377be 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -107,36 +107,37 @@ def _get_num_unpadded_features( num_patch_height: int, num_patch_width: int, ) -> tuple[int, int]: - current_height = npatches * num_patch_height - current_width = npatches * num_patch_width - # NOTE: Use float32 to remain consistent with HF output - original_aspect_ratio = np.array(original_width / original_height, - dtype=np.float32) - current_aspect_ratio = np.array(current_width / current_height, - dtype=np.float32) + current_height_f = np.float32(npatches * num_patch_height) + current_width_f = np.float32(npatches * num_patch_width) + + original_width_f = np.float32(original_width) + original_height_f = np.float32(original_height) + + original_aspect_ratio = original_width_f / original_height_f + current_aspect_ratio = current_width_f / current_height_f if original_aspect_ratio > current_aspect_ratio: - scale_factor = np.array(current_width / original_width, - dtype=np.float32) - new_height = int(original_height * scale_factor) - padding = (current_height - new_height) // 2 - current_height -= 2 * padding + scale_factor = current_width_f / original_width_f + new_height = int(original_height_f * scale_factor) + padding = (current_height_f - new_height) // 2 + current_height_f -= 2 * padding else: - scale_factor = np.array(current_height / original_height, - dtype=np.float32) - new_width = int(original_width * scale_factor) - padding = (current_width - new_width) // 2 - current_width -= 2 * padding + scale_factor = current_height_f / original_height_f + new_width = int(original_width_f * scale_factor) + padding = (current_width_f - new_width) // 2 + current_width_f -= 2 * padding - unpadded_features = current_height * current_width - newline_features = current_height + unpadded_features = int(current_height_f * current_width_f) + newline_features = int(current_height_f) - ratio = math.sqrt(current_height * current_width / (9 * npatches**2)) + ratio = math.sqrt(current_height_f * current_width_f / + (9 * npatches**2)) if ratio > 1.1: - unpadded_features = int(current_height // ratio) * int( - current_width // ratio) - newline_features = int(current_height // ratio) + height_factor = int(current_height_f // ratio) + width_factor = int(current_width_f // ratio) + unpadded_features = height_factor * width_factor + newline_features = height_factor return (unpadded_features, newline_features)