
Commit 726fea2

WoosukKwon, ywang96, and DarkLight1337 authored and committed
[Model] Add support for Gemma 3 (vllm-project#14660)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent 7276993 commit 726fea2

File tree

10 files changed: +1071 additions, -9 deletions


docs/source/models/supported_models.md

Lines changed: 39 additions & 2 deletions

@@ -263,10 +263,15 @@ See [this page](#generative-models) for more information on how to use generativ
   * ✅︎
   * ✅︎
 - * `Gemma2ForCausalLM`
-  * Gemma2
+  * Gemma 2
   * `google/gemma-2-9b`, `google/gemma-2-27b`, etc.
   * ✅︎
   * ✅︎
+- * `Gemma3ForCausalLM`
+  * Gemma 3
+  * `google/gemma-3-1b-it`, etc.
+  * ✅︎
+  * ✅︎
 - * `GlmForCausalLM`
   * GLM-4
   * `THUDM/glm-4-9b-chat-hf`, etc.
@@ -504,7 +509,7 @@ you should explicitly specify the task type to ensure that the model is used in
   *
   *
 - * `Gemma2Model`
-  * Gemma2-based
+  * Gemma 2-based
   * `BAAI/bge-multilingual-gemma2`, etc.
   *
   * ✅︎
@@ -752,6 +757,13 @@ See [this page](#generative-models) for more information on how to use generativ
   *
   * ✅︎
   * ✅︎
+- * `Gemma3ForConditionalGeneration`
+  * Gemma 3
+  * T + I<sup>+</sup>
+  * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
+  * ✅︎
+  * ✅︎
+  * ✅︎\*
 - * `GLM4VForCausalLM`<sup>^</sup>
   * GLM-4V
   * T + I
@@ -937,6 +949,31 @@ For more details, please see: <gh-pr:4087#issuecomment-2250397630>
 To use Qwen2.5-VL series models, you have to install Hugging Face Transformers library from source via `pip install git+https://github.com/huggingface/transformers`.
 :::

+:::{note}
+To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
+`pip install git+https://github.com/huggingface/transformers`.
+The earliest commit that supports this is [`50d3530aa04e7a7d003e6b255a98f79fd0447357`](https://github.com/huggingface/transformers/commit/50d3530aa04e7a7d003e6b255a98f79fd0447357).
+
+Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
+However, there are differences in how they handle text + image inputs:
+
+V0 correctly implements the model's attention pattern:
+- Uses bidirectional attention between the image tokens corresponding to the same image
+- Uses causal attention for other tokens
+- Implemented via (naive) PyTorch SDPA with masking tensors
+- Note: May use significant memory for long prompts with image
+
+V1 currently uses a simplified attention pattern:
+- Uses causal attention for all tokens, including image tokens
+- Generates reasonable outputs but does not match the original model's attention for text + image inputs
+- Will be updated in the future to support the correct behavior
+
+This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
+
+Additionally, vLLM's current Gemma 3 implementation does not support the pan-and-scan image pre-processing algorithm, which helps handle images with skewed aspect ratios by intelligently cropping them into multiple views.
+Without this feature, model performance may degrade when processing images that deviate significantly from square dimensions.
+:::
+
 ### Pooling Models

 See [this page](pooling-models) for more information on how to use pooling models.
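
To make the V0 attention description in the note above concrete, here is a minimal, illustrative sketch of how such a mask can be built and passed to PyTorch SDPA. This is not vLLM's actual implementation; `image_ids` (the image index of each token, with -1 marking text tokens) and the toy sequence are assumptions made purely for illustration.

```python
# Illustrative sketch only -- not vLLM's implementation. Assumes `image_ids`
# marks each position with the index of the image it belongs to (-1 = text).
import torch
import torch.nn.functional as F


def build_gemma3_style_mask(image_ids: torch.Tensor) -> torch.Tensor:
    """Boolean mask where True means attention is allowed."""
    seq_len = image_ids.shape[0]
    # Start from a causal mask: each token attends to itself and the past.
    mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
    # Additionally allow bidirectional attention between tokens of the
    # same image (text tokens, marked -1, are excluded).
    is_image = image_ids >= 0
    same_image = ((image_ids[:, None] == image_ids[None, :])
                  & is_image[:, None] & is_image[None, :])
    return mask | same_image


# Toy example: 2 text tokens, a 3-token image, then 2 more text tokens.
image_ids = torch.tensor([-1, -1, 0, 0, 0, -1, -1])
mask = build_gemma3_style_mask(image_ids)

q = k = v = torch.randn(1, 1, 7, 8)  # (batch, heads, seq_len, head_dim)
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
print(mask.int())
```

Because the causal part always keeps the diagonal, no row of the mask is fully masked, so SDPA never produces NaN rows even for prompts with no image tokens.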

examples/offline_inference/vision_language.py

Lines changed: 19 additions & 1 deletion

@@ -118,6 +118,23 @@ def run_fuyu(questions: list[str], modality: str):
     return llm, prompts, stop_token_ids


+# Gemma 3
+def run_gemma3(questions: list[str], modality: str):
+    assert modality == "image"
+    model_name = "google/gemma-3-4b-it"
+
+    llm = LLM(model=model_name,
+              max_model_len=2048,
+              max_num_seqs=2,
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+
+    prompts = [("<bos><start_of_turn>user\n"
+                f"<start_of_image>{question}<end_of_turn>\n"
+                "<start_of_turn>model\n") for question in questions]
+    stop_token_ids = None
+    return llm, prompts, stop_token_ids
+
+
 # GLM-4v
 def run_glm4v(questions: list[str], modality: str):
     assert modality == "image"
@@ -405,7 +422,7 @@ def run_mllama(questions: list[str], modality: str):
             "type": "image"
         }, {
             "type": "text",
-            "text": f"{question}"
+            "text": question
         }]
     }] for question in questions]
     prompts = tokenizer.apply_chat_template(messages,
@@ -664,6 +681,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str):
     "deepseek_vl_v2": run_deepseek_vl2,
     "florence2": run_florence2,
     "fuyu": run_fuyu,
+    "gemma3": run_gemma3,
     "glm4v": run_glm4v,
     "h2ovl_chat": run_h2ovl,
     "idefics3": run_idefics3,

examples/offline_inference/vision_language_multi_image.py

Lines changed: 37 additions & 0 deletions

@@ -80,6 +80,42 @@ def load_deepseek_vl2(question: str, image_urls: list[str]):
     )


+def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
+    model_name = "google/gemma-3-4b-it"
+
+    llm = LLM(model=model_name,
+              max_model_len=8192,
+              max_num_seqs=2,
+              limit_mm_per_prompt={"image": len(image_urls)})
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
 def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "h2oai/h2ovl-mississippi-800m"

@@ -496,6 +532,7 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
 model_example_map = {
     "aria": load_aria,
     "deepseek_vl_v2": load_deepseek_vl2,
+    "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions

@@ -162,6 +162,7 @@ def _test_processing_correctness(
     "deepseek-ai/deepseek-vl2-tiny",
     "microsoft/Florence-2-base",
     "adept/fuyu-8b",
+    "google/gemma-3-4b-it",
     "THUDM/glm-4v-9b",
     "h2oai/h2ovl-mississippi-800m",
     "OpenGVLab/InternVL2-1B",

tests/models/registry.py

Lines changed: 4 additions & 0 deletions

@@ -124,6 +124,8 @@ def check_available_online(
     "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
     "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"),
     "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
+    "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it",
+                                         min_transformers_version="4.50"),
     "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
     "GPT2LMHeadModel": _HfExamplesInfo("gpt2"),
     "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"),
@@ -241,6 +243,8 @@ def check_available_online(
     "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
                                                hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
     "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
+    "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it",
+                                                      min_transformers_version="4.50"),
     "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
                                         trust_remote_code=True,
                                         hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501

vllm/config.py

Lines changed: 9 additions & 6 deletions

@@ -350,10 +350,11 @@ def __init__(
         if self.enforce_eager is None:
             self.enforce_eager = False

+        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"]
         sliding_window = getattr(self.hf_text_config, "sliding_window", None)
         has_interleaved_attention = (sliding_window is not None) and (
             isinstance(sliding_window, list) or
-            (self.hf_text_config.model_type in ["gemma2", "cohere2"]))
+            (self.hf_text_config.model_type in interleaved_attn_models))

         if (not self.disable_sliding_window and has_interleaved_attention):
             if (backend :=
@@ -2503,11 +2504,11 @@ def _get_and_verify_dtype(
         dtype = dtype.lower()
         if dtype == "auto":
             if config_dtype == torch.float32:
-                if config.model_type == "gemma2":
+                if config.model_type in ("gemma2", "gemma3", "gemma3_text"):
                     logger.info(
-                        "For Gemma 2, we downcast float32 to bfloat16 instead "
-                        "of float16 by default. Please specify `dtype` if you "
-                        "want to use float16.")
+                        "For Gemma 2 and 3, we downcast float32 to bfloat16 "
+                        "instead of float16 by default. Please specify `dtype` "
+                        "if you want to use float16.")
                     torch_dtype = torch.bfloat16
                 else:
                     # Following the common practice, we use float16 for float32
@@ -2639,7 +2640,9 @@ def _get_and_verify_max_len(
         derived_max_model_len = default_max_len

     rope_scaling = getattr(hf_config, "rope_scaling", None)
-    if rope_scaling is not None:
+    # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE
+    # scaling, so we skip applying the scaling factor again.
+    if rope_scaling is not None and "gemma3" not in hf_config.model_type:
         # No need to consider "type" key because of patch_rope_scaling when
         # loading HF config
         rope_type = rope_scaling["rope_type"]
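
A practical consequence of the dtype change above: a float32 Gemma 2 or Gemma 3 checkpoint is loaded in bfloat16 unless `dtype` is passed explicitly. A minimal sketch, using a model name from this PR:

```python
from vllm import LLM

# With dtype="auto" (the default), a float32 Gemma 2/3 config is downcast to
# bfloat16, as the logger.info message above describes. Pass dtype explicitly
# to use float16 instead.
llm = LLM(model="google/gemma-3-1b-it", dtype="float16")
```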

vllm/entrypoints/chat_utils.py

Lines changed: 2 additions & 0 deletions

@@ -433,6 +433,8 @@ def _placeholder_str(self, modality: ModalityStr,
                 return "<image>"
             if model_type == "aria":
                 return "<|fim_prefix|><|img|><|fim_suffix|>"
+            if model_type == "gemma3":
+                return "<start_of_image>"

             raise TypeError(f"Unknown {modality} model type: {model_type}")
         elif modality == "audio":
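
With the `<start_of_image>` placeholder registered above, vLLM's chat entrypoints can build Gemma 3 prompts from OpenAI-style messages without the caller writing the token by hand. A rough sketch using the offline `LLM.chat()` API; the image URL is a placeholder.

```python
from vllm import LLM

llm = LLM(model="google/gemma-3-4b-it", max_model_len=2048)

# The gemma3 branch added to _placeholder_str above supplies the
# "<start_of_image>" token for the image content part; the URL below is a
# placeholder.
messages = [{
    "role": "user",
    "content": [
        {"type": "image_url", "image_url": {"url": "https://example.com/dog.jpg"}},
        {"type": "text", "text": "Describe this image."},
    ],
}]

outputs = llm.chat(messages)
print(outputs[0].outputs[0].text)
```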
