From 999068b81763b5fd5e3ff0a17ad50cd9b33ce028 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 5 Nov 2025 09:21:36 +0000 Subject: [PATCH 01/16] Continuous Batching for VLMs Signed-off-by: Asmita Goswami --- .../models/gemma3/modeling_gemma3.py | 119 +++++++++++------ .../models/llava/modeling_llava.py | 123 ++++++++++++------ 2 files changed, 162 insertions(+), 80 deletions(-) diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 398259d8b..234dff860 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -610,6 +610,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): inputs_embeds = self.model.get_input_embeddings()(input_ids) B, N, C = inputs_embeds.shape @@ -625,6 +626,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, use_cache=True, ) image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) @@ -684,6 +686,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): prefill_seq_len = prefill_seq_len if prefill_seq_len else 32 @@ -707,50 +712,74 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "sliding_window": self.language_model.config.sliding_window, - "img_size": img_size, - "mm_tokens_per_image": mm_tokens_per_image, - } - ) - - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "sliding_window": self.language_model.config.sliding_window, - "img_size": img_size, - "mm_tokens_per_image": mm_tokens_per_image, - } - ) - - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "sliding_window": self.language_model.config.sliding_window, "img_size": img_size, "mm_tokens_per_image": mm_tokens_per_image, - }, - { - "batch_size": batch_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "sliding_window": self.language_model.config.sliding_window, "img_size": img_size, "mm_tokens_per_image": mm_tokens_per_image, - }, - ] + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + + else: + lang_prefill = { + 
"batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "sliding_window": self.language_model.config.sliding_window, + "img_size": img_size, + "mm_tokens_per_image": mm_tokens_per_image, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "sliding_window": self.language_model.config.sliding_window, + "img_size": img_size, + "mm_tokens_per_image": mm_tokens_per_image, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} @@ -759,19 +788,23 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic axes vision_dynamic_axes = {} lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} - lang_dynamic_axes["vision_embeds"] = {0: "batch_size", 1: "mm_tokens_per_image"} + lang_dynamic_axes["vision_embeds"] = {0: "vision_batch_size", 1: "mm_tokens_per_image"} + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} vision_dynamic_axes["pixel_values"] = {0: "batch_size", 2: "img_size", 3: "img_size"} - pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"} - pkv_dynamic_sliding_axes = {0: "batch_size", 2: "sliding_window"} + pkv_dynamic_axes = {0: "full_batch_size" if continuous_batching else "batch_size", 2: "ctx_len"} + pkv_dynamic_sliding_axes = {0: "full_batch_size" if continuous_batching else "batch_size", 2: "sliding_window"} layer_switch = ( self.language_model.config.sliding_window_pattern if hasattr(self.language_model.config, "sliding_window_pattern") @@ -837,7 +870,7 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): past_key_values.append(pkv) return past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 896) else: @@ -876,15 +909,21 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) + + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV lang_inputs["past_key_values"] = self.get_dummy_pkv_cache( config=self.language_model.config, - batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + batch_size=fbs if continuous_batching else bs, 
seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index dc6653db0..775d47768 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -18,6 +18,7 @@ from QEfficient.utils.logging_utils import logger BS = 1 +FBS = 4 NUM_CHANNEL = 3 SEQ_LEN = 592 CTX_LEN = 1024 @@ -61,6 +62,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): inputs_embeds = self.model.get_input_embeddings()(input_ids) vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) @@ -76,6 +78,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, return_dict=True, ) @@ -140,7 +143,7 @@ def forward( image_idx = torch.where(image_idx < next_image_idx, next_image_idx, image_idx) return logits, pixel_values, image_idx, outputs.past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs): num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -165,8 +168,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl for i in range(num_layers): lang_inputs["past_key_values"].append( ( - torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim), - torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim), + torch.zeros(FBS if continuous_batching else BS, num_key_value_heads, CTX_LEN, head_dim), + torch.zeros(FBS if continuous_batching else BS, num_key_value_heads, CTX_LEN, head_dim), ) ) lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, CTX_LEN - 1) @@ -174,6 +177,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(BS).view(BS, 1) inputs = {} if kv_offload: @@ -193,6 +198,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): max_num_images = compiler_options.pop("max_num_images", 1) @@ -218,49 +226,74 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "max_num_images": max_num_images, - "img_size": img_size, - "vision_size": vision_size, - } - ) - - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - 
"ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "max_num_images": max_num_images, - "img_size": img_size, - "vision_size": vision_size, - } - ) - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "max_num_images": max_num_images, "img_size": img_size, "vision_size": vision_size, - }, - { - "batch_size": batch_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size,, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "max_num_images": max_num_images, "img_size": img_size, "vision_size": vision_size, - }, - ] + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + else: + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_images": max_num_images, + "img_size": img_size, + "vision_size": vision_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "max_num_images": max_num_images, + "img_size": img_size, + "vision_size": vision_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} @@ -269,9 +302,11 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers @@ -281,11 +316,19 @@ def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv lang_dynamic_axes = { "input_ids": {0: "batch_size", 1: "seq_len"}, "position_ids": {0: "batch_size", 1: "seq_len"}, - "vision_embeds": {0: "batch_size", 1: "vision_size"}, + "vision_embeds": {0: "vision_batch_size", 1: "vision_size"}, } + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} for i in range(num_layers): - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_key.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: 
"ctx_len", + } + lang_dynamic_axes[f"past_value.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } if comp_ctx_lengths is not None: lang_dynamic_axes["comp_ctx_lengths"] = {0: "comp_ctx_lengths"} From 1220cf99f07c1203730161f4d8afc91242077736 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 10 Nov 2025 12:30:54 +0000 Subject: [PATCH 02/16] Added CB support for InternVL Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 85 ++++++++++++- QEfficient/generation/vlm_generation.py | 8 ++ .../models/internvl/modeling_internvl.py | 113 ++++++++++++------ .../transformers/models/modeling_auto.py | 5 + examples/internvl_CB_example.py | 98 +++++++++++++++ 5 files changed, 268 insertions(+), 41 deletions(-) create mode 100644 examples/internvl_CB_example.py diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index 76da7afc2..f18e84179 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -12,13 +12,14 @@ operations, separating them from the main text generation logic. """ -from typing import Any, Dict, Optional, Tuple +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple import numpy as np import requests import torch from PIL import Image -from transformers import AutoImageProcessor +from transformers import AutoImageProcessor, AutoTokenizer from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils.logging_utils import logger @@ -37,6 +38,9 @@ def __init__( qeff_model: Optional[QAICInferenceSession], vision_session: Optional[QAICInferenceSession], processor: Optional[AutoImageProcessor], + tokenizer: Optional[AutoTokenizer], + image_height: Optional[int] = None, + image_width: Optional[int] = None, config: Optional[Dict[str, Any]] = None, lang_session: Optional[QAICInferenceSession] = None, ): @@ -46,12 +50,16 @@ def __init__( Args: vision_session: QAICInferenceSession for vision model processor: AutoImageProcessor for image preprocessing + tokenizer: AutoTokenizer for text tokenization config: Configuration dictionary with vision model parameters lang_session: Optional language session for coordination (to avoid resource conflicts) """ self._qeff_model = qeff_model self._vision_session = vision_session self._processor = processor + self._tokenizer = tokenizer + self._image_height = image_height + self._image_width = image_width self._config = config or {} self._lang_session = lang_session # Store language session for coordination @@ -70,6 +78,71 @@ def is_available(self) -> bool: """ return self._vision_session is not None and self._processor is not None + def prepare_internVL_inputs(self, img_url: str, query: str) -> Dict[str, np.ndarray]: + """ + Prepare inputs for InternVL model + + Args: + image_url: URL or path to image + query: Text query to process with image + prompt = [query] + """ + if not self._tokenizer: + raise ValueError("Tokenizer is required for InternVL input preparation") + prompt = query + pixel_values = [] + num_patches_list = [] + questions = [] + img = requests.get(img_url, stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + + if self._image_height and self._image_width: + image = image.resize((self._image_height, self._image_width)) + else: + logger.warning("Height and Width not specified. 
Using default image size for num_patches = 13.") + image = image.resize((1000, 747)) + + # preprocess the resized image + pixel_value = self._processor.load_image(image, max_num=12) + num_patches_list.append(pixel_value.shape[0]) + pixel_values.append(pixel_value) + + question = "\n" + prompt + questions.append(question) + + pixel_values = torch.cat(pixel_values, dim=0) + + # Chat Template information for prompt preprocessing + messages: List[List[str]] = [] + roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") + prompt = self._processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) + + inputs = self._tokenizer(prompt, return_tensors="pt") + inputs["pixel_values"] = pixel_values.clone() + + # Convert to numpy arrays + vision_inputs = {} + for k, v in inputs.items(): + if k in { + "pixel_values", + "image_masks", + "image_input_idx", + "valid_idx", + "aspect_ratio_ids", + "aspect_ratio_mask", + }: + vision_inputs[k] = np.array(v) + + # Convert specific inputs to float16 + vision_inputs_fp16 = {"pixel_values", "image_masks"} + for k in vision_inputs_fp16: + if k in vision_inputs: + vision_inputs[k] = vision_inputs[k].astype("float16") + + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + + return vision_inputs, lang_inputs + def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) -> Dict[str, np.ndarray]: """ Download and preprocess image into model inputs @@ -323,7 +396,13 @@ def get_processed_inputs( try: ## Get vlm inputs ## - vision_inputs, lang_inputs = self.prepare_vlm_inputs(image_url, query, prefill_seq_len) + if ( + hasattr(self._qeff_model.model.config, "model_type") + and self._qeff_model.model.config.model_type == "internvl_chat" + ): + vision_inputs, lang_inputs = self.prepare_internVL_inputs(image_url, query) + else: + vision_inputs, lang_inputs = self.prepare_vlm_inputs(image_url, query, prefill_seq_len) # Handle padding for language model pad_token_id = 1 diff --git a/QEfficient/generation/vlm_generation.py b/QEfficient/generation/vlm_generation.py index 5eb91d142..dd5f579a8 100644 --- a/QEfficient/generation/vlm_generation.py +++ b/QEfficient/generation/vlm_generation.py @@ -88,6 +88,8 @@ def __init__( enable_debug_logs: bool = False, write_io_dir: Optional[str] = None, full_batch_size: Optional[int] = None, + image_height: Optional[int] = None, + image_width: Optional[int] = None, is_tlm: bool = False, include_sampler: bool = False, return_pdfs: bool = False, @@ -143,6 +145,9 @@ def __init__( ) self.qeff_model = qeff_model self.processor = processor + self.tokenizer = tokenizer + self.image_height = image_height + self.image_width = image_width self._vision_qpc_path = vision_qpc_path self.device_id = device_id # Store device_id for vision components self.enable_debug_logs = enable_debug_logs # Store for vision components @@ -173,6 +178,9 @@ def _init_vision_components(self): qeff_model=self.qeff_model, vision_session=self._vision_session, processor=self.processor, + tokenizer=self.tokenizer, + image_height=self.image_height, + image_width=self.image_width, config=vision_config, lang_session=self._session, # Pass language session for coordination ) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 96c59325f..fb0275acc 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -44,6 +44,7 @@ def forward( image_idx, 
past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): input_embeds = self.model.language_model.get_input_embeddings()(input_ids) B, N, C = input_embeds.shape @@ -69,6 +70,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, use_cache=True, ) image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) @@ -91,6 +93,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): num_patches = compiler_options.pop("num_patches", None) @@ -124,50 +129,73 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "num_patches": num_patches, - "img_size": img_size, - "vision_size": vision_size, - } - ) - - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "num_patches": num_patches, - "img_size": img_size, - "vision_size": vision_size, - } - ) - - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "num_patches": num_patches, "img_size": img_size, "vision_size": vision_size, - }, - { - "batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "num_patches": num_patches, "img_size": img_size, "vision_size": vision_size, - }, - ] + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + + else: + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "num_patches": num_patches, + "img_size": img_size, + "vision_size": vision_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "num_patches": num_patches, + "img_size": img_size, + "vision_size": vision_size, + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} @@ -176,18 +204,22 @@ def get_specializations( specializations["lang"] = lang return specializations, 
compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic axes vision_dynamic_axes = {} lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["vision_embeds"] = {1: "vision_size"} + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} vision_dynamic_axes["pixel_values"] = {0: "batched_num_patches", 2: "img_size", 3: "img_size"} - pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"} + pkv_dynamic_axes = {0: "full_batch_size" if continuous_batching else "batch_size", 2: "ctx_len"} for i in range(self.language_model.config.num_hidden_layers): for kv in ["key", "value"]: lang_dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes @@ -222,7 +254,7 @@ def get_output_names(self, kv_offload: bool = False): return lang_output_names return output_names - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE) else: @@ -271,10 +303,13 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl ) lang_inputs["image_idx"] = torch.zeros((1, 1), dtype=torch.int64) + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV kv_cache_shape = get_padding_shape_from_config( config=self.language_model.config, - batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + batch_size=fbs if continuous_batching else bs, seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) @@ -285,6 +320,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 5f1ec51e6..f3c5c6a7c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1215,6 +1215,8 @@ def generate( device_ids: List[int] = None, runtime_ai100: bool = True, generation_len: Optional[int] = None, + image_height: Optional[int] = None, + image_width: Optional[int] = None, ) -> Union[torch.Tensor, np.ndarray]: """ Generates output by executing the compiled QPC(s) on Cloud AI 100 Hardware cards. 
@@ -1273,6 +1275,8 @@ def generate( full_batch_size=fbs, comp_ctx_lengths_prefill=self.comp_ctx_lengths_prefill, comp_ctx_lengths_decode=self.comp_ctx_lengths_decode, + image_height=image_height, + image_width=image_width, ) # Call generate method @@ -2401,6 +2405,7 @@ def from_pretrained( kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, qaic_config=qaic_config, + continuous_batching=continuous_batching, **kwargs, ) return cls( diff --git a/examples/internvl_CB_example.py b/examples/internvl_CB_example.py new file mode 100644 index 000000000..486f9db6c --- /dev/null +++ b/examples/internvl_CB_example.py @@ -0,0 +1,98 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.utils.test_utils import InternProcessor + +model_id = "OpenGVLab/InternVL2_5-1B" +config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) +# For Testing Purpose Only +config.llm_config.num_hidden_layers = 2 +config.vision_config.num_hidden_layers = 2 + +model_hf = AutoModelForCausalLM.from_pretrained( + model_id, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, +) + +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False) +processor = InternProcessor(model_hf, tokenizer) + + +continuous_batching = True +if continuous_batching: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, + trust_remote_code=True, + ) + + qeff_model.compile( + num_patches=13, # Set num_patches according to image_height and image_width, default is 13 (747 x 1000) + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + batch_size=1, + full_batch_size=1, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) +else: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config, trust_remote_code=True + ) + + qeff_model.compile( + num_patches=13, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + batch_size=1, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + ) + +image_urls = [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", +] + +prompts = [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", +] + +exec_info = qeff_model.generate( + tokenizer=tokenizer, + prompts=prompts, + processor=processor, + images=image_urls, + device_ids=[0, 1, 2, 3], + generation_len=10, + image_height=747, + image_width=1000, +) + +print("Generated texts:", 
exec_info.generated_texts) +print("Generated IDs:", exec_info.generated_ids) +print(exec_info) From c39ae015cf5fe88842ef9d0e82fe5d9d73f6e718 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 10 Nov 2025 12:42:23 +0000 Subject: [PATCH 03/16] Added CB support for Mistral3 Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 3 + .../models/internvl/modeling_internvl.py | 2 +- .../models/mistral3/modeling_mistral3.py | 115 ++++++++++++------ examples/internvl_CB_example.py | 2 +- 4 files changed, 82 insertions(+), 40 deletions(-) diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index f18e84179..d196a23a2 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -168,6 +168,9 @@ def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) - else: image = Image.open(image_url) + if "mistral3" in self._qeff_model.model.config.model_type: + image = image.resize((1540, 1540)) + # Prepare conversation format conversation = [ { diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index fb0275acc..eb5a4b475 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -151,7 +151,7 @@ def get_specializations( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "num_patches": num_patches, "img_size": img_size, "vision_size": vision_size, diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index 694ed4cde..afe838f74 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -176,6 +176,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): inputs_embeds = self.model.get_input_embeddings()(input_ids) vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) @@ -190,6 +191,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, ) # Cast to int32 to avoid ONNXRT issue @@ -250,7 +252,7 @@ def forward( return logits, pixel_values, image_idx, outputs.past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs): inputs_shapes = {} inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) height = self.config.vision_config.image_size @@ -290,10 +292,14 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) + + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV kv_cache_shape = get_padding_shape_from_config( - config=self.language_model.config, - 
batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + config=self.model.config.text_config, + batch_size=fbs if continuous_batching else bs, seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) @@ -304,6 +310,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: @@ -324,6 +332,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): if img_size is None and hasattr(self.config.vision_config, "image_size"): @@ -352,46 +363,65 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "image_size": img_size, - "vision_size": vision_size, - } - ) - - # Remaining elements use comp_ctx_lengths[1:] in a loop - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "image_size": img_size, - "vision_size": vision_size, - } - ) - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "image_size": img_size, "vision_size": vision_size, - }, - { - "batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + # Remaining elements use comp_ctx_lengths[1:] in a loop + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "image_size": img_size, "vision_size": vision_size, - }, - ] + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + else: + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "image_size": img_size, + "vision_size": vision_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "image_size": img_size, + "vision_size": vision_size, + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size specializations = {} @@ -404,7 +434,7 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, 
comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers @@ -417,9 +447,18 @@ def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv "vision_embeds": {0: "vision_size"}, } + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} + for i in range(num_layers): - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_key.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } + lang_dynamic_axes[f"past_value.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } if comp_ctx_lengths is not None: lang_dynamic_axes["comp_ctx_lengths"] = {0: "comp_ctx_lengths"} diff --git a/examples/internvl_CB_example.py b/examples/internvl_CB_example.py index 486f9db6c..29cb9a5c4 100644 --- a/examples/internvl_CB_example.py +++ b/examples/internvl_CB_example.py @@ -45,7 +45,7 @@ num_cores=16, num_devices=4, batch_size=1, - full_batch_size=1, + full_batch_size=4, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True, From 39f5c1649025809595bf5aa7b6fb3cadf580e5aa Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 11 Nov 2025 08:15:11 +0000 Subject: [PATCH 04/16] Updated test_image_text_to_text for CB tests Signed-off-by: Asmita Goswami --- QEfficient/utils/run_utils.py | 48 +++++++++++++ .../models/test_image_text_to_text_models.py | 67 ++++++++++++++++++- 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index c54dadeac..0f82fb027 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -276,6 +276,54 @@ def __init__( self.config = config self.gen_len = max_gen_len + @torch.no_grad() + def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries): + """ + Function responsible for running HuggingFace ``PyTorch`` model for continuous batching + and return the output tokens for each prompt/image pair. 
+ + ``Mandatory`` Args: + :model (torch.nn.module): Original ``PyTorch`` model + :images (List[PIL.Image]): List of input images + :queries (List[str]): List of input queries + + Return: + :List[numpy.ndarray]: List of generated output tokens for each prompt + """ + generated_ids = [] + + for idx, (image, query) in enumerate(zip(images, queries)): + # Prepare conversation format for each image-query pair + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + {"type": "image"}, + ], + }, + ] + prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True) + + # Process inputs + inputs = self.processor(images=image, text=prompt, return_tensors="pt") + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + # Generate tokens + output = model.generate(**inputs, max_new_tokens=self.gen_len, do_sample=False) + offset_output = output[0, inputs["input_ids"].shape[1]:] + + # Decode and print output + py_output = self.processor.tokenizer.decode(offset_output).strip() + print(f"Original HF Model Outputs (Torch CPU) for prompt {idx}:") + print("Query:", repr(query)) + print("Completion:", repr(py_output)) + + generated_ids.append(offset_output.numpy()) + + return generated_ids + @torch.no_grad() def run_vlm_hf_model_on_pytorch(self, model, inputs): output = model.generate(**inputs, max_new_tokens=self.gen_len, do_sample=False) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index e6a145195..5d095fe87 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -38,6 +38,7 @@ # model_name, # kv_offload, # batch_size, + # full_batch_size, # prompt_len, # ctx_len, # img_size, @@ -49,6 +50,7 @@ "llava-hf/llava-1.5-7b-hf", True, 1, + 4, 784, 1024, 336, @@ -60,6 +62,7 @@ "llava-hf/llava-1.5-7b-hf", False, 1, + 4, 784, 1024, 336, @@ -72,6 +75,7 @@ # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # True, # 1, + # 4, # 128, # 3072, # 336, @@ -83,6 +87,7 @@ # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # False, # 1, + # 4, # 128, # 3072, # 336, @@ -94,6 +99,7 @@ "google/gemma-3-4b-it", True, 1, + 4, 128, 3072, 896, @@ -105,6 +111,7 @@ "google/gemma-3-4b-it", False, 1, + 4, 128, 3072, 896, @@ -116,6 +123,7 @@ "mistralai/Mistral-Small-3.1-24B-Instruct-2503", True, 1, + 4, 128, 4096, 1540, @@ -127,6 +135,7 @@ "mistralai/Mistral-Small-3.1-24B-Instruct-2503", False, 1, + 4, 128, 4096, 1540, @@ -138,6 +147,7 @@ "Qwen/Qwen2.5-VL-3B-Instruct", True, 1, + 4, 128, 4096, 1540, @@ -149,6 +159,7 @@ # "meta-llama/Llama-3.2-11B-Vision-Instruct", # True, # 1, + # 4, # 32, # 512, # 560, @@ -256,6 +267,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query: str, prompt_len: int, ctx_len: int, + full_batch_size: int, max_gen_len: int = 20, batch_size: int = 1, n_layer: int = 1, @@ -341,8 +353,56 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - return + # testing for CB models + if not kv_offload: # CB not yet enabled for Single QPC + return + images = [image] * full_batch_size + queries = [query] * full_batch_size + + streamer = TextStreamer(processor.tokenizer) + pytorch_hf_tokens = 
api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries) + + qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_config["model_name"], + kv_offload=kv_offload, + config=config, + continuous_batching=True, + ) + + qeff_model.export() + + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + qeff_model.compile( + img_size=model_config["img_size"], + num_cores=16, + num_devices=num_devices, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + batch_size=batch_size, + full_batch_size=full_batch_size, + mxfp6_matmul=True, + enable_qnn=enable_qnn, + qnn_config=qnn_config, + ) + + print("QPC Outputs (QAIC):") + exec_info = qeff_model.generate( + tokenizer=processor.tokenizer, + processor=processor, + images=[img_url] * full_batch_size, + prompts=queries, + generation_len=max_gen_len, + ) + + qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), f"Tokens don't match for prompt {i} between HF and QPC output" + + return def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, @@ -527,10 +587,10 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config + "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config ) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer + model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer ): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. 
@@ -547,6 +607,7 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query=query, n_layer=n_layer, batch_size=batch_size, + full_batch_size=full_batch_size, kv_offload=kv_offload, ) From 9a42a081da61bf10b8a087f6087eaa7fe4172d12 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 11 Nov 2025 08:18:34 +0000 Subject: [PATCH 05/16] Ruff format Signed-off-by: Asmita Goswami --- QEfficient/utils/run_utils.py | 2 +- .../models/test_image_text_to_text_models.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 0f82fb027..59e3f9bf4 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -312,7 +312,7 @@ def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries): # Generate tokens output = model.generate(**inputs, max_new_tokens=self.gen_len, do_sample=False) - offset_output = output[0, inputs["input_ids"].shape[1]:] + offset_output = output[0, inputs["input_ids"].shape[1] :] # Decode and print output py_output = self.processor.tokenizer.decode(offset_output).strip() diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 5d095fe87..11fcf6857 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -355,7 +355,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" # testing for CB models - if not kv_offload: # CB not yet enabled for Single QPC + if not kv_offload: # CB not yet enabled for Single QPC return images = [image] * full_batch_size queries = [query] * full_batch_size @@ -400,10 +400,13 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( qpc_tokens = exec_info.generated_ids[:, :max_gen_len] for i in range(full_batch_size): - assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), f"Tokens don't match for prompt {i} between HF and QPC output" + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output" + ) return + def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, img_url: str, @@ -587,7 +590,8 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config + "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", + test_models_config, ) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer From c1465c8ede242bbac1590d99d06329f0c5008d5a Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Sun, 16 Nov 2025 16:54:54 +0000 Subject: [PATCH 06/16] Added CB update for Molmo Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 60 +++++++++++++ .../models/gemma3/modeling_gemma3.py | 8 +- .../models/internvl/modeling_internvl.py | 8 +- .../models/llava/modeling_llava.py | 14 ++- .../models/mistral3/modeling_mistral3.py | 12 ++- .../models/molmo/modeling_molmo.py | 87 ++++++++++++++++--- .../models/test_image_text_to_text_models.py | 57 +++++++++++- 7 files changed, 222 insertions(+), 
24 deletions(-) diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index d196a23a2..b3ba55098 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -143,6 +143,61 @@ def prepare_internVL_inputs(self, img_url: str, query: str) -> Dict[str, np.ndar return vision_inputs, lang_inputs + def prepare_molmo_inputs(self, image_url: str, query: str) -> Dict[str, np.ndarray]: + """ + Download and preprocess image into model inputs + Args: + image_url: URL or path to image + query: Text query to process with image + Returns: + Dictionary of vision model inputs + Raises: + ValueError: If vision handler is not properly initialized + RuntimeError: If image processing fails + """ + if not self.is_available(): + raise ValueError("Vision handler not properly initialized. Need both vision_session and processor.") + + try: + # Download image + if image_url.startswith(("http://", "https://")): + image = Image.open(requests.get(image_url, stream=True).raw) + else: + image = Image.open(image_url) + image = image.resize((536, 354)) + inputs = self._processor.process(images=[image], text=query) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) + valid = inputs["image_input_idx"] > 0 + valid = valid.reshape(1, -1) + inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0) + inputs["pixel_values"] = inputs.pop("images") + + # Convert to numpy arrays + vision_inputs = {} + for k, v in inputs.items(): + if k in { + "pixel_values", + "image_masks", + "image_input_idx", + "valid_idx", + "aspect_ratio_ids", + "aspect_ratio_mask", + }: + vision_inputs[k] = np.array(v) + + # Convert specific inputs to float16 + vision_inputs_fp16 = {"pixel_values", "image_masks"} + for k in vision_inputs_fp16: + if k in vision_inputs: + vision_inputs[k] = vision_inputs[k].astype("float16") + + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + + return vision_inputs, lang_inputs + except Exception as e: + raise RuntimeError(f"Failed to process image {image_url}: {str(e)}") + def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) -> Dict[str, np.ndarray]: """ Download and preprocess image into model inputs @@ -404,6 +459,11 @@ def get_processed_inputs( and self._qeff_model.model.config.model_type == "internvl_chat" ): vision_inputs, lang_inputs = self.prepare_internVL_inputs(image_url, query) + elif ( + hasattr(self._qeff_model.model.config, "model_type") + and self._qeff_model.model.config.model_type == "molmo" + ): + vision_inputs, lang_inputs = self.prepare_molmo_inputs(image_url, query) else: vision_inputs, lang_inputs = self.prepare_vlm_inputs(image_url, query, prefill_seq_len) diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 234dff860..15e30acc4 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -792,7 +792,9 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes vision_dynamic_axes 
= {} lang_dynamic_axes = {} @@ -870,7 +872,9 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): past_key_values.append(pkv) return past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_dummy_inputs( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 896) else: diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index eb5a4b475..402f0450b 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -208,7 +208,9 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes vision_dynamic_axes = {} lang_dynamic_axes = {} @@ -254,7 +256,9 @@ def get_output_names(self, kv_offload: bool = False): return lang_output_names return output_names - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_dummy_inputs( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE) else: diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 775d47768..b8f493b93 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -143,7 +143,13 @@ def forward( image_idx = torch.where(image_idx < next_image_idx, next_image_idx, image_idx) return logits, pixel_values, image_idx, outputs.past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs): + def get_dummy_inputs( + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, + ): num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -246,7 +252,7 @@ def get_specializations( for i in range(0, len(comp_ctx_lengths_decode)): lang_decode = { - "batch_size": full_batch_size if continuous_batching else batch_size,, + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, "comp_ctx_lengths": comp_ctx_lengths_decode[i], @@ -306,7 +312,9 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = 
False + ): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index afe838f74..ab5c12bcc 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -252,7 +252,13 @@ def forward( return logits, pixel_values, image_idx, outputs.past_key_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False, **kwargs): + def get_dummy_inputs( + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, + ): inputs_shapes = {} inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) height = self.config.vision_config.image_size @@ -434,7 +440,9 @@ def get_specializations( lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index c088158c4..db4755843 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -43,14 +43,14 @@ def eager_attention_forward( if num_q_heads != num_kv_heads: assert num_q_heads % num_kv_heads == 0 repeat_factor = num_q_heads // num_kv_heads - _, _, S, D = k.shape + B, _, S, D = k.shape k = k.unsqueeze(2) k = k.expand(-1, -1, repeat_factor, -1, -1) - k = k.reshape(1, num_q_heads, S, D) + k = k.reshape(B, num_q_heads, S, D) v = v.unsqueeze(2) v = v.expand(-1, -1, repeat_factor, -1, -1) - v = v.reshape(1, num_q_heads, S, D) + v = v.reshape(B, num_q_heads, S, D) attn_weights = torch.matmul(q, k.transpose(2, 3)) * scale_factor @@ -596,6 +596,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): if input_ids is not None: input_ids = input_ids * (input_ids != -1).to(input_ids.dtype) @@ -613,6 +614,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, use_cache=True, ) next_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) @@ -694,6 +696,9 @@ def get_specializations( comp_ctx_lengths_decode: Optional[List[int]] = None, valid_size: int = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): prefill_seq_len = prefill_seq_len if prefill_seq_len else 1024 @@ -725,12 +730,20 @@ def get_specializations( for i in range(0, len(comp_ctx_lengths_prefill)): lang_prefill = { - "batch_size": batch_size, + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "valid_size": valid_size, + "vision_batch_size": batch_size, } + if continuous_batching: + lang_prefill["full_batch_size"] = 
kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size if kv_offload: values = { "img_size": img_size, @@ -746,12 +759,17 @@ def get_specializations( for i in range(0, len(comp_ctx_lengths_decode)): lang_decode = { - "batch_size": batch_size, + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, "comp_ctx_lengths": comp_ctx_lengths_decode[i], "valid_size": valid_size, + "vision_batch_size": batch_size, } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size if kv_offload: values = { "img_size": img_size, @@ -767,13 +785,33 @@ def get_specializations( else: lang_prefill = { - "batch_size": batch_size, + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, "valid_size": valid_size, + "vision_batch_size": batch_size, } - lang_decode = {"batch_size": batch_size, "seq_len": "1", "ctx_len": ctx_len, "valid_size": valid_size} + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "valid_size": valid_size, + "vision_batch_size": batch_size, + } + + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size if kv_offload: values = { @@ -800,13 +838,15 @@ def get_specializations( else: return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes vision_dynamic_axes = {} lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} - lang_dynamic_axes["vision_embeds"] = {0: "batch_size", 1: "valid_size"} + lang_dynamic_axes["vision_embeds"] = {0: "vision_batch_size", 1: "valid_size"} vision_dynamic_axes["pixel_values"] = {0: "batch_size", 1: "num_images", 2: "img_tile", 3: "img_size"} vision_dynamic_axes["image_input_idx"] = {0: "batch_size", 1: "num_images", 2: "num_patch"} @@ -816,8 +856,17 @@ def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv num_layers = self.model.config.n_layers for i in range(num_layers): - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_key.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } + lang_dynamic_axes[f"past_value.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } + + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} if comp_ctx_lengths is not None: lang_dynamic_axes["comp_ctx_lengths"] = {0: "comp_ctx_lengths"} @@ -851,7 +900,13 @@ def get_output_names(self, kv_offload: bool = False): return lang_output_names return output_names - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, 
kv_offload: bool = False, **kwargs): + def get_dummy_inputs( + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, + ): inputs_shapes = {} inputs_shapes_lang = {} inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) @@ -902,10 +957,14 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) + + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV kv_cache_shape = get_padding_shape_from_config( config=self.config, - batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + batch_size=fbs if continuous_batching else bs, seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) @@ -916,6 +975,8 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 11fcf6857..3ca4b65b0 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -208,6 +208,7 @@ # "allenai/Molmo-7B-D-0924", # True, # 1, + # 4, # 128, # 4096, # "https://picsum.photos/id/237/536/354", @@ -413,6 +414,7 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query: str, prompt_len: int, ctx_len: int, + full_batch_size: int, max_gen_len: int = 20, batch_size: int = 1, n_layer: int = 1, @@ -430,6 +432,7 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( n_layer = (n_layer, n_layer) processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) img = requests.get(img_url, stream=True) image = Image.open(BytesIO(img.content)).convert("RGB") image = image.resize((536, 354)) @@ -475,6 +478,54 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" + + if not kv_offload: # CB not yet enabled for Single QPC + return + images = [image] * full_batch_size + queries = [query] * full_batch_size + + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries) + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, + ) + + qeff_model.export() + + qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_devices=4, + batch_size=1, + full_batch_size=4, + mxfp6_matmul=False, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) + + exec_info = qeff_model.generate( + tokenizer=tokenizer, + processor=processor, + images=[img_url] * full_batch_size, + prompts=queries, + generation_len=max_gen_len, + ) + + qpc_tokens = 
exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) for Continuous Batching:") + print(exec_info.generated_texts) + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output" + ) + return @@ -655,15 +706,17 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config + "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_url, query, n_layer", + molmo_model_config, ) def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer + model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_url, query, n_layer ): check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, prompt_len=prompt_len, ctx_len=ctx_len, + full_batch_size=full_batch_size, max_gen_len=NEW_GENERATION_TOKENS, img_url=img_url, query=query, From a6f11823e6282481370c6db1056b839e75a1bef1 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 17 Nov 2025 06:56:48 +0000 Subject: [PATCH 07/16] Added mistral CB support Signed-off-by: Asmita Goswami --- .../models/mistral3/modeling_mistral3.py | 13 ++++++++----- .../models/test_image_text_to_text_models.py | 5 +++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index ab5c12bcc..60d33f388 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -178,16 +178,16 @@ def forward( comp_ctx_lengths: Optional[List[int]] = None, batch_index: Optional[torch.LongTensor] = None, ): - inputs_embeds = self.model.get_input_embeddings()(input_ids) - vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = self.model.language_model.get_input_embeddings()(input_ids) mask = input_ids == self.model.config.image_token_index indices1 = mask.to(torch.int64).cumsum(1) - 1 indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1) indices0 = torch.arange(mask.shape[0]).view(-1, 1) image_features_expanded = vision_embeds.unsqueeze(0)[indices0, indices1] - inputs_embeds_1 = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds) - outputs = self.model.model( - inputs_embeds=inputs_embeds_1, + image_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds) + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_embeds) + outputs = self.language_model( + inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, @@ -428,6 +428,9 @@ def get_specializations( lang_decode["full_batch_size"] = kv_cache_batch_size else: lang_decode["batch_size"] = kv_cache_batch_size + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 3ca4b65b0..89a915189 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -384,7 
+384,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( ctx_len=ctx_len, batch_size=batch_size, full_batch_size=full_batch_size, - mxfp6_matmul=True, + mxfp6_matmul=False, enable_qnn=enable_qnn, qnn_config=qnn_config, ) @@ -399,12 +399,13 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( ) qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) for Continuous Batching:") + print(exec_info.generated_texts) for i in range(full_batch_size): assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( f"Tokens don't match for prompt {i} between HF and QPC output" ) - return From 94552e05d9770782d11a0cda730d43c70a8ead85 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 20 Nov 2025 10:14:40 +0000 Subject: [PATCH 08/16] Added CB Test for InternVL Signed-off-by: Asmita Goswami --- QEfficient/utils/run_utils.py | 46 +++++++++++ .../models/test_image_text_to_text_models.py | 80 ++++++++++++++----- 2 files changed, 106 insertions(+), 20 deletions(-) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 59e3f9bf4..f43654c0c 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -496,6 +496,52 @@ def __init__(self, batch_size, processor, config, image, prompt, prompt_len, ctx self.config = config self.gen_len = max_gen_len + @torch.no_grad() + def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries): + """ + Function responsible for running HuggingFace ``PyTorch`` model for continuous batching + and return the output tokens for each prompt/image pair. + + ``Mandatory`` Args: + :model (torch.nn.module): Original ``PyTorch`` model + :images (List[PIL.Image]): List of input images + :queries (List[str]): List of input queries + + Return: + :List[numpy.ndarray]: List of generated output tokens for each prompt + """ + generated_ids = [] + + for idx, (image, query) in enumerate(zip(images, queries)): + num_patches_list = [] + + pixel_value = self.processor.load_image(image, max_num=12) + num_patches_list.append(pixel_value.shape[0]) + question = "\n" + query + + # Chat Template information for prompt preprocessing + messages: List[List[str]] = [] + roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") + prompt = self.processor(pixel_value, question, messages, roles, num_patches_list=num_patches_list) + + inputs = self.processor.tokenizer(prompt, return_tensors="pt") + batch_size, prompt_len = inputs["input_ids"].shape + inputs["pixel_values"] = pixel_value.clone() + + generation_config = dict(max_new_tokens=self.gen_len, do_sample=False) + generation_config["eos_token_id"] = self.processor.tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) + + # Decode and print output + outputs = model.generate(**inputs, **generation_config) + offset_output = outputs[0].detach().numpy() + + py_output = self.processor.tokenizer.decode(offset_output, skip_special_tokens=True).strip() + print("Original HF Model Outputs (Torch CPU):") + print("Completion:", repr(py_output)) + generated_ids.append(offset_output) + + return generated_ids + @torch.no_grad() def run_vlm_hf_model_on_pytorch(self, model, inputs, generation_config): outputs = model.generate(**inputs, **generation_config) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 89a915189..52068b99d 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -38,7 +38,6 @@ # 
model_name, # kv_offload, # batch_size, - # full_batch_size, # prompt_len, # ctx_len, # img_size, @@ -50,7 +49,6 @@ "llava-hf/llava-1.5-7b-hf", True, 1, - 4, 784, 1024, 336, @@ -62,7 +60,6 @@ "llava-hf/llava-1.5-7b-hf", False, 1, - 4, 784, 1024, 336, @@ -75,7 +72,6 @@ # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # True, # 1, - # 4, # 128, # 3072, # 336, @@ -87,7 +83,6 @@ # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # False, # 1, - # 4, # 128, # 3072, # 336, @@ -99,7 +94,6 @@ "google/gemma-3-4b-it", True, 1, - 4, 128, 3072, 896, @@ -111,7 +105,6 @@ "google/gemma-3-4b-it", False, 1, - 4, 128, 3072, 896, @@ -123,7 +116,6 @@ "mistralai/Mistral-Small-3.1-24B-Instruct-2503", True, 1, - 4, 128, 4096, 1540, @@ -135,7 +127,6 @@ "mistralai/Mistral-Small-3.1-24B-Instruct-2503", False, 1, - 4, 128, 4096, 1540, @@ -147,7 +138,6 @@ "Qwen/Qwen2.5-VL-3B-Instruct", True, 1, - 4, 128, 4096, 1540, @@ -159,7 +149,6 @@ # "meta-llama/Llama-3.2-11B-Vision-Instruct", # True, # 1, - # 4, # 32, # 512, # 560, @@ -208,7 +197,6 @@ # "allenai/Molmo-7B-D-0924", # True, # 1, - # 4, # 128, # 4096, # "https://picsum.photos/id/237/536/354", @@ -268,7 +256,6 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query: str, prompt_len: int, ctx_len: int, - full_batch_size: int, max_gen_len: int = 20, batch_size: int = 1, n_layer: int = 1, @@ -358,6 +345,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( # testing for CB models if not kv_offload: # CB not yet enabled for Single QPC return + full_batch_size = 4 images = [image] * full_batch_size queries = [query] * full_batch_size @@ -415,7 +403,6 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query: str, prompt_len: int, ctx_len: int, - full_batch_size: int, max_gen_len: int = 20, batch_size: int = 1, n_layer: int = 1, @@ -480,8 +467,10 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" + # testing for CB models if not kv_offload: # CB not yet enabled for Single QPC return + full_batch_size = 4 images = [image] * full_batch_size queries = [query] * full_batch_size @@ -636,17 +625,68 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" + + # testing for CB models + if not kv_offload: # CB not yet enabled for Single QPC + return + + full_batch_size = 4 + image = [image] * full_batch_size + queries = [query] * full_batch_size + + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, image, queries) + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, + ) + + qeff_model.export() + + qeff_model.compile( + num_patches=1, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_devices=4, + batch_size=1, + full_batch_size=full_batch_size, + mxfp6_matmul=False, + ) + + exec_info = qeff_model.generate( + tokenizer=tokenizer, + processor=processor, + images=img_url * full_batch_size, + prompts=queries, + generation_len=max_gen_len, + image_height=448, + image_width=448, + ) + + qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) 
for Continuous Batching:") + print(exec_info.generated_texts) + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output" + ) + return @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config, ) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer ): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. @@ -663,7 +703,6 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( query=query, n_layer=n_layer, batch_size=batch_size, - full_batch_size=full_batch_size, kv_offload=kv_offload, ) @@ -671,6 +710,7 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal +@pytest.mark.skip(reason="Issues with QNN") @pytest.mark.parametrize( "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config ) @@ -707,17 +747,16 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_url, query, n_layer", + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config, ) def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, full_batch_size, prompt_len, ctx_len, img_url, query, n_layer + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer ): check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, prompt_len=prompt_len, ctx_len=ctx_len, - full_batch_size=full_batch_size, max_gen_len=NEW_GENERATION_TOKENS, img_url=img_url, query=query, @@ -751,6 +790,7 @@ def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal +@pytest.mark.skip(reason="Issues with QNN") @pytest.mark.parametrize( "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config ) From e8af9174d1744c6eea9985f73899fe46ae0ad72f Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 20 Nov 2025 10:19:16 +0000 Subject: [PATCH 09/16] Ruff format Signed-off-by: Asmita Goswami --- QEfficient/utils/run_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index f43654c0c..76234f76e 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import os +from typing import List import numpy as np import onnx From eea2ffa526c30fb688181d74e669c1035017caef Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 25 Nov 2025 18:03:45 +0000 Subject: [PATCH 10/16] Resolving CI issues Signed-off-by: Asmita Goswami --- .../models/gemma3/modeling_gemma3.py | 2 - examples/internvl_CB_example.py | 98 
------------------- .../models/test_image_text_to_text_models.py | 3 +- 3 files changed, 1 insertion(+), 102 deletions(-) delete mode 100644 examples/internvl_CB_example.py diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 15e30acc4..c80efde55 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -788,8 +788,6 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: - lang[0].pop("vision_size") - lang[1].pop("vision_size") return lang, compiler_options def get_onnx_dynamic_axes( diff --git a/examples/internvl_CB_example.py b/examples/internvl_CB_example.py deleted file mode 100644 index 29cb9a5c4..000000000 --- a/examples/internvl_CB_example.py +++ /dev/null @@ -1,98 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ---------------------------------------------------------------------------- - -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - -from QEfficient import QEFFAutoModelForCausalLM -from QEfficient.utils.test_utils import InternProcessor - -model_id = "OpenGVLab/InternVL2_5-1B" -config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) -# For Testing Purpose Only -config.llm_config.num_hidden_layers = 2 -config.vision_config.num_hidden_layers = 2 - -model_hf = AutoModelForCausalLM.from_pretrained( - model_id, - low_cpu_mem_usage=False, - trust_remote_code=True, - config=config, -) - -tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False) -processor = InternProcessor(model_hf, tokenizer) - - -continuous_batching = True -if continuous_batching: - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, - attn_implementation="eager", - kv_offload=True, - config=config, - continuous_batching=True, - trust_remote_code=True, - ) - - qeff_model.compile( - num_patches=13, # Set num_patches according to image_height and image_width, default is 13 (747 x 1000) - prefill_seq_len=128, - ctx_len=4096, - num_cores=16, - num_devices=4, - batch_size=1, - full_batch_size=4, - mxfp6_matmul=True, - mxint8_kv_cache=True, - aic_enable_depth_first=True, - mos=1, - ) -else: - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, attn_implementation="eager", kv_offload=True, config=config, trust_remote_code=True - ) - - qeff_model.compile( - num_patches=13, - prefill_seq_len=128, - ctx_len=4096, - num_cores=16, - num_devices=4, - batch_size=1, - mxfp6_matmul=True, - mxint8_kv_cache=True, - aic_enable_depth_first=True, - ) - -image_urls = [ - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", -] - -prompts = [ - "Can you describe the image in detail?", - "What are the objects in the image?", - "What is the main subject of the image?", - "What colors are predominant in the image?", -] - -exec_info 
= qeff_model.generate( + tokenizer=tokenizer, + prompts=prompts, + processor=processor, + images=image_urls, + device_ids=[0, 1, 2, 3], + generation_len=10, + image_height=747, + image_width=1000, +) + +print("Generated texts:", exec_info.generated_texts) +print("Generated IDs:", exec_info.generated_ids) +print(exec_info) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 52068b99d..cc66ddb9c 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -631,10 +631,9 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( return full_batch_size = 4 - image = [image] * full_batch_size queries = [query] * full_batch_size - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, image, queries) + pytorch_hf_tokens = [pytorch_hf_tokens] * 4 qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_name, From ee5421526c9739872200b77f8859719d12c5fdf4 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 25 Nov 2025 18:05:08 +0000 Subject: [PATCH 11/16] Added InternVL example file for CB Signed-off-by: Asmita Goswami --- .../models/internvl/continuous_batching.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 examples/image_text_to_text/models/internvl/continuous_batching.py diff --git a/examples/image_text_to_text/models/internvl/continuous_batching.py b/examples/image_text_to_text/models/internvl/continuous_batching.py new file mode 100644 index 000000000..29cb9a5c4 --- /dev/null +++ b/examples/image_text_to_text/models/internvl/continuous_batching.py @@ -0,0 +1,98 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.utils.test_utils import InternProcessor + +model_id = "OpenGVLab/InternVL2_5-1B" +config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) +# For Testing Purpose Only +config.llm_config.num_hidden_layers = 2 +config.vision_config.num_hidden_layers = 2 + +model_hf = AutoModelForCausalLM.from_pretrained( + model_id, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, +) + +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False) +processor = InternProcessor(model_hf, tokenizer) + + +continuous_batching = True +if continuous_batching: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, + trust_remote_code=True, + ) + + qeff_model.compile( + num_patches=13, # Set num_patches according to image_height and image_width, default is 13 (747 x 1000) + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + batch_size=1, + full_batch_size=4, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) +else: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config, trust_remote_code=True + ) + + qeff_model.compile( + num_patches=13, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + batch_size=1, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + ) + +image_urls = [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", +] + +prompts = [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", +] + +exec_info = qeff_model.generate( + tokenizer=tokenizer, + prompts=prompts, + processor=processor, + images=image_urls, + device_ids=[0, 1, 2, 3], + generation_len=10, + image_height=747, + image_width=1000, +) + +print("Generated texts:", exec_info.generated_texts) +print("Generated IDs:", exec_info.generated_ids) +print(exec_info) From 453bd9e6e9768683c9e486dbff6bc87f5a54778b Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 2 Dec 2025 06:33:09 +0000 Subject: [PATCH 12/16] Addressed Comments Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index b3ba55098..72869781f 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -51,6 +51,8 @@ def __init__( vision_session: QAICInferenceSession for vision model processor: AutoImageProcessor for image preprocessing tokenizer: AutoTokenizer for text tokenization 
+ image_height: Desired image height for resizing + image_width: Desired image width for resizing config: Configuration dictionary with vision model parameters lang_session: Optional language session for coordination (to avoid resource conflicts) """ @@ -78,18 +80,16 @@ def is_available(self) -> bool: """ return self._vision_session is not None and self._processor is not None - def prepare_internVL_inputs(self, img_url: str, query: str) -> Dict[str, np.ndarray]: + def prepare_internVL_inputs(self, img_url: str, prompt: str) -> Dict[str, np.ndarray]: """ Prepare inputs for InternVL model Args: image_url: URL or path to image - query: Text query to process with image - prompt = [query] + prompt: Text query to process with image """ if not self._tokenizer: raise ValueError("Tokenizer is required for InternVL input preparation") - prompt = query pixel_values = [] num_patches_list = [] questions = [] @@ -205,6 +205,7 @@ def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) - Args: image_url: URL or path to image query: Text query to process with image + prefill_seq_len: Padded sequence length for language model Returns: Dictionary of vision model inputs From c2fe7ff6fdd17cc3c855acbf30c2e31fad4ff7a1 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 2 Dec 2025 06:58:23 +0000 Subject: [PATCH 13/16] Comments Addressed Signed-off-by: Asmita Goswami --- QEfficient/generation/vlm_generation.py | 2 ++ QEfficient/utils/run_utils.py | 13 +++++++++---- .../models/test_image_text_to_text_models.py | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/QEfficient/generation/vlm_generation.py b/QEfficient/generation/vlm_generation.py index dd5f579a8..b37fdc74a 100644 --- a/QEfficient/generation/vlm_generation.py +++ b/QEfficient/generation/vlm_generation.py @@ -109,6 +109,8 @@ def __init__( enable_debug_logs: Enable debug logging write_io_dir: Directory for I/O file writing full_batch_size: Enable continuous batching (new feature) + image_height: Desired image height for resizing + image_width: Desired image width for resizing is_tlm: Target language model flag include_sampler: Enable on-device sampling (new feature) return_pdfs: Return probability distributions diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 76234f76e..5ec30630d 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -515,19 +515,24 @@ def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries): for idx, (image, query) in enumerate(zip(images, queries)): num_patches_list = [] + pixel_values = [] + questions = [] pixel_value = self.processor.load_image(image, max_num=12) num_patches_list.append(pixel_value.shape[0]) question = "\n" + query + pixel_values.append(pixel_value) + pixel_values = torch.cat(pixel_values, dim=0) + questions.append(question) + # Chat Template information for prompt preprocessing messages: List[List[str]] = [] roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") - prompt = self.processor(pixel_value, question, messages, roles, num_patches_list=num_patches_list) + prompt = self.processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) inputs = self.processor.tokenizer(prompt, return_tensors="pt") - batch_size, prompt_len = inputs["input_ids"].shape - inputs["pixel_values"] = pixel_value.clone() + inputs["pixel_values"] = pixel_values.clone() generation_config = dict(max_new_tokens=self.gen_len, do_sample=False) generation_config["eos_token_id"] = 
self.processor.tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) @@ -537,7 +542,7 @@ def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries): offset_output = outputs[0].detach().numpy() py_output = self.processor.tokenizer.decode(offset_output, skip_special_tokens=True).strip() - print("Original HF Model Outputs (Torch CPU):") + print(f"Original HF Model Outputs (Torch CPU) for prompt {idx}:") print("Completion:", repr(py_output)) generated_ids.append(offset_output) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index cc66ddb9c..387aa0a3a 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -633,7 +633,7 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( full_batch_size = 4 queries = [query] * full_batch_size - pytorch_hf_tokens = [pytorch_hf_tokens] * 4 + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, [image] * full_batch_size, queries) qeff_model = QEFFAutoModelForCausalLM.from_pretrained( model_name, From 7202b67c8bd8ea5b42788b29753c75f930d987b4 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 4 Dec 2025 17:10:44 +0000 Subject: [PATCH 14/16] Added CB test file Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 7 +- .../models/gemma3/modeling_gemma3.py | 4 +- .../models/internvl/modeling_internvl.py | 4 +- .../models/llama4/modeling_llama4.py | 4 +- .../models/llava/modeling_llava.py | 4 +- .../models/llava_next/modeling_llava_next.py | 146 ++-- .../models/mistral3/modeling_mistral3.py | 4 +- .../models/molmo/modeling_molmo.py | 4 +- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 4 +- QEfficient/utils/constants.py | 10 + QEfficient/utils/run_utils.py | 31 + .../models/internvl/continuous_batching.py | 2 + .../test_continuous_batching.py | 711 ++++++++++++++++++ .../models/test_image_text_to_text_models.py | 162 +--- 14 files changed, 866 insertions(+), 231 deletions(-) create mode 100644 tests/transformers/models/image_text_to_text/test_continuous_batching.py diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index 72869781f..be8896917 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -22,6 +22,7 @@ from transformers import AutoImageProcessor, AutoTokenizer from QEfficient.generation.cloud_infer import QAICInferenceSession +from QEfficient.utils import constants from QEfficient.utils.logging_utils import logger @@ -100,7 +101,7 @@ def prepare_internVL_inputs(self, img_url: str, prompt: str) -> Dict[str, np.nda image = image.resize((self._image_height, self._image_width)) else: logger.warning("Height and Width not specified. 
Using default image size for num_patches = 13.") - image = image.resize((1000, 747)) + image = image.resize((constants.INTERN_IMAGE_HEIGHT, constants.INTERN_IMAGE_WIDTH)) # preprocess the resized image pixel_value = self._processor.load_image(image, max_num=12) @@ -164,7 +165,7 @@ def prepare_molmo_inputs(self, image_url: str, query: str) -> Dict[str, np.ndarr image = Image.open(requests.get(image_url, stream=True).raw) else: image = Image.open(image_url) - image = image.resize((536, 354)) + image = image.resize((constants.MOLMO_IMAGE_HEIGHT, constants.MOLMO_IMAGE_WIDTH)) inputs = self._processor.process(images=[image], text=query) inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) @@ -225,7 +226,7 @@ def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) - image = Image.open(image_url) if "mistral3" in self._qeff_model.model.config.model_type: - image = image.resize((1540, 1540)) + image = image.resize((constants.MISTRAL3_IMAGE_HEIGHT, constants.MISTRAL3_IMAGE_WIDTH)) # Prepare conversation format conversation = [ diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index c80efde55..c91d2fe32 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -777,9 +777,7 @@ def get_specializations( lang_decode["full_batch_size"] = kv_cache_batch_size else: lang_decode["batch_size"] = kv_cache_batch_size - lang = [] - lang.append(lang_prefill) - lang.append(lang_decode) + lang = [lang_prefill, lang_decode] specializations = {} diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 402f0450b..85c331aa8 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -193,9 +193,7 @@ def get_specializations( else: lang_decode["batch_size"] = kv_cache_batch_size - lang = [] - lang.append(lang_prefill) - lang.append(lang_decode) + lang = [lang_prefill, lang_decode] specializations = {} diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 0bcdf8ae0..7a2f687fe 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -1065,9 +1065,7 @@ def get_specializations( else: lang_decode["batch_size"] = kv_cache_batch_size - lang = [] - lang.append(lang_prefill) - lang.append(lang_decode) + lang = [lang_prefill, lang_decode] specializations = {} diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index b8f493b93..d5f5ee920 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -297,9 +297,7 @@ def get_specializations( else: lang_decode["batch_size"] = kv_cache_batch_size - lang = [] - lang.append(lang_prefill) - lang.append(lang_decode) + lang = [lang_prefill, lang_decode] specializations = {} diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 2e4848b6b..9cac61264 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ 
b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -20,6 +20,9 @@ from QEfficient.utils._utils import IOInfo from QEfficient.utils.logging_utils import logger +BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE +FBS = constants.ONNX_EXPORT_EXAMPLE_FBS + class QEffLlavaNextEncoderWrapper(nn.Module): def __init__(self, model): @@ -133,6 +136,7 @@ def forward( image_idx, past_key_values, comp_ctx_lengths: Optional[List[int]] = None, + batch_index: Optional[torch.LongTensor] = None, ): inputs_embeds = self.model.get_input_embeddings()(input_ids) image_features = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) @@ -149,6 +153,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, + batch_index=batch_index, ) image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) logit_index = position_ids.to(torch.int32).argmax(1, keepdim=True) @@ -165,7 +170,13 @@ def get_qeff_vision_encoder(self): def get_qeff_language_decoder(self): return QEffLlavaNextDecoderWrapper(self) - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs): + def get_dummy_inputs( + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, + ): num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -214,13 +225,13 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl lang_inputs["past_key_values"].append( ( torch.zeros( - constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + FBS if continuous_batching else BS, num_key_value_heads, constants.GRANITEVISION_CTX_LEN, head_dim, ), torch.zeros( - constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + FBS if continuous_batching else BS, num_key_value_heads, constants.GRANITEVISION_CTX_LEN, head_dim, @@ -232,6 +243,9 @@ def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offl if comp_ctx_lengths is not None: lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.long) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(BS).view(BS, 1) + inputs = {} if kv_offload: inputs["vision"] = vision_inputs @@ -250,6 +264,9 @@ def get_specializations( comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): max_num_images = compiler_options.pop("max_num_images", 1) @@ -306,62 +323,85 @@ def get_specializations( lang = [] for i in range(0, len(comp_ctx_lengths_prefill)): - lang.append( - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_prefill[i], - "image_size_height": image_size_height, - "image_size_width": image_size_width, - "num_patches": num_patches, - "max_num_images": max_num_images, - "img_size": img_size, - "vision_size": vision_size, - } - ) - - # Remaining elements use comp_ctx_lengths[1:] in a loop - for i in range(0, len(comp_ctx_lengths_decode)): - lang.append( - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "comp_ctx_lengths": comp_ctx_lengths_decode[i], - "image_size_height": image_size_height, - "image_size_width": image_size_width, - 
"num_patches": num_patches, - "max_num_images": max_num_images, - "img_size": img_size, - "vision_size": vision_size, - } - ) - else: - lang = [ - { - "batch_size": batch_size, + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "image_size_height": image_size_height, "image_size_width": image_size_width, "num_patches": num_patches, "max_num_images": max_num_images, "img_size": img_size, "vision_size": vision_size, - }, - { - "batch_size": batch_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + lang.append(lang_prefill) + + # Remaining elements use comp_ctx_lengths[1:] in a loop + for i in range(0, len(comp_ctx_lengths_decode)): + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, + "comp_ctx_lengths": comp_ctx_lengths_decode[i], "image_size_height": image_size_height, "image_size_width": image_size_width, "num_patches": num_patches, "max_num_images": max_num_images, "img_size": img_size, "vision_size": vision_size, - }, - ] + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + lang.append(lang_decode) + else: + lang_prefill = { + "batch_size": batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "image_size_height": image_size_height, + "image_size_width": image_size_width, + "num_patches": num_patches, + "max_num_images": max_num_images, + "img_size": img_size, + "vision_size": vision_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "image_size_height": image_size_height, + "image_size_width": image_size_width, + "num_patches": num_patches, + "max_num_images": max_num_images, + "img_size": img_size, + "vision_size": vision_size, + "vision_batch_size": batch_size, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + + lang = [lang_prefill, lang_decode] specializations = {} if kv_offload: @@ -369,9 +409,13 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_onnx_dynamic_axes( + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + ): # Define dynamic axes num_layers = self.config.text_config.num_hidden_layers vision_dynamic_axes = { @@ -381,11 +425,19 @@ def get_onnx_dynamic_axes(self, comp_ctx_lengths: Optional[List[int]] = None, kv lang_dynamic_axes = { "input_ids": {0: "batch_size", 1: "seq_len"}, "position_ids": {0: "batch_size", 1: "seq_len"}, - "vision_embeds": {0: "batch_size", 1: "vision_size"}, + "vision_embeds": {0: 
"vision_batch_size", 1: "vision_size"}, } + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} for i in range(num_layers): - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_key.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } + lang_dynamic_axes[f"past_value.{i}"] = { + 0: "full_batch_size" if continuous_batching else "batch_size", + 2: "ctx_len", + } if comp_ctx_lengths is not None: lang_dynamic_axes["comp_ctx_lengths"] = {0: "comp_ctx_lengths"} diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index 60d33f388..89e19c65b 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -428,9 +428,7 @@ def get_specializations( lang_decode["full_batch_size"] = kv_cache_batch_size else: lang_decode["batch_size"] = kv_cache_batch_size - lang = [] - lang.append(lang_prefill) - lang.append(lang_decode) + lang = [lang_prefill, lang_decode] specializations = {} diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index db4755843..7bfa58fc0 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -825,9 +825,7 @@ def get_specializations( lang_prefill[key] = value lang_decode[key] = value - lang = [] - lang.append(lang_prefill) - lang.append(lang_decode) + lang = [lang_prefill, lang_decode] specializations = {} diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 33a434db1..63e046600 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -1169,9 +1169,7 @@ def smart_resize( else: lang_decode["batch_size"] = kv_cache_batch_size - lang = [] - lang.append(lang_prefill) - lang.append(lang_decode) + lang = [lang_prefill, lang_decode] specializations = {} diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 3752db40c..e0b003422 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -100,6 +100,8 @@ def get_models_dir(): INTERN_CTX_LEN = 4096 INTERN_PREFILL_SEQ_LEN = INTERN_CTX_LEN - 256 # 4096-256 INTERN_NUM_CHANNELS = 3 +INTERN_IMAGE_HEIGHT = 1000 +INTERN_IMAGE_WIDTH = 747 INTERN_IMG_CONTEXT_TOKEN = 151667 # Specific to InternVL3_5 series, same token won't work for InternVL2_5 series @@ -135,6 +137,14 @@ def get_models_dir(): # Modules to cache while clearing the pytorch weights CACHE_MODULES = ["get_output_names", "get_dummy_inputs", "get_onnx_dynamic_axes", "get_specializations"] +# Mistral3 Constants +MISTRAL3_IMAGE_HEIGHT = 1540 +MISTRAL3_IMAGE_WIDTH = 1540 + +# Molmo Constants +MOLMO_IMAGE_HEIGHT = 536 +MOLMO_IMAGE_WIDTH = 354 + class Constants: # Export Constants. 
diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 5ec30630d..61553e7ea 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -590,3 +590,34 @@ def run_vlm_hf_model_on_pytorch(self, model, inputs, generation_config): print("Original HF Model Outputs (Torch CPU):") print("Completion:", repr(py_output)) return generated_ids + + @torch.no_grad() + def run_vlm_hf_model_on_pytorch_CB(self, model, images, queries, generation_config): + """ + Function responsible for running HuggingFace ``PyTorch`` model for continuous batching + and return the output tokens for each prompt/image pair. + + ``Mandatory`` Args: + :model (torch.nn.module): Original ``PyTorch`` model + :images (List[PIL.Image]): List of input images + :queries (List[str]): List of input queries + :generation_config (dict): Generation configuration parameters + + Return: + :List[numpy.ndarray]: List of generated output tokens for each prompt + """ + generated_ids = [] + for idx, (image, query) in enumerate(zip(images, queries)): + inputs = self.processor.process(images=[image], text=query) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + outputs = model.generate_from_batch( + inputs, generation_config, tokenizer=self.processor.tokenizer, do_sample=False + ) + + offset_output = outputs[0, inputs["input_ids"].size(1) :] + + py_output = self.processor.tokenizer.decode(offset_output, skip_special_tokens=True).strip() + print(f"Original HF Model Outputs (Torch CPU) for prompt {idx}:") + print("Completion:", repr(py_output)) + generated_ids.append(offset_output) + return generated_ids diff --git a/examples/image_text_to_text/models/internvl/continuous_batching.py b/examples/image_text_to_text/models/internvl/continuous_batching.py index 29cb9a5c4..ca3e0ede3 100644 --- a/examples/image_text_to_text/models/internvl/continuous_batching.py +++ b/examples/image_text_to_text/models/internvl/continuous_batching.py @@ -16,6 +16,8 @@ config.llm_config.num_hidden_layers = 2 config.vision_config.num_hidden_layers = 2 +# The original Intern-VL model, despite being multimodal, is loaded using `AutoModelForCausalLM` in Huggingface. +# To maintain compatibility, we load this model using `QEFFAutoModelForCausalLM`. model_hf = AutoModelForCausalLM.from_pretrained( model_id, low_cpu_mem_usage=False, diff --git a/tests/transformers/models/image_text_to_text/test_continuous_batching.py b/tests/transformers/models/image_text_to_text/test_continuous_batching.py new file mode 100644 index 000000000..6a88601e7 --- /dev/null +++ b/tests/transformers/models/image_text_to_text/test_continuous_batching.py @@ -0,0 +1,711 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +from io import BytesIO +from typing import List + +import pytest +import requests +from PIL import Image +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoModelForImageTextToText, + AutoProcessor, + AutoTokenizer, + GenerationConfig, +) + +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM, QEFFAutoModelForImageTextToText +from QEfficient.utils import hf_download +from QEfficient.utils._utils import get_num_layers_vlm +from QEfficient.utils.device_utils import get_available_device_id +from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerMolmo, ApiRunnerVlm +from QEfficient.utils.test_utils import InternProcessor + +NEW_GENERATION_TOKENS = 10 + +# TODO: Add CB support for kv_offload=False case +test_models_config = [ + # CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED + # ( + # model_name, + # kv_offload, + # batch_size, + # prompt_len, + # ctx_len, + # img_size, + # img_url_list", + # text_prompt_list, + # number of layers of the model, + # full_batch_size + # ), + ( + "llava-hf/llava-1.5-7b-hf", + True, + 1, + 784, + 1024, + 336, + [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + ], + [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", + ], + 1, + 4, + ), + # Disabled in CI due to performance issues + # ( + # "meta-llama/Llama-4-Scout-17B-16E-Instruct", + # True, + # 1, + # 128, + # 3072, + # 336, + # ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg",], + # ["Can you describe the image in detail?", + # "What are the objects in the image?", + # "What is the main subject of the image?", + # "What colors are predominant in the image?"], + # 4, + # 4, + # ), + ( + "google/gemma-3-4b-it", + True, + 1, + 128, + 3072, + 896, + [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + ], + [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are 
predominant in the image?", + ], + 1, + 4, + ), + ( + "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + True, + 1, + 128, + 4096, + 1540, + [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + ], + [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", + ], + 1, + 4, + ), + ( + "Qwen/Qwen2.5-VL-3B-Instruct", + True, + 1, + 128, + 4096, + 1540, + [ + "https://picsum.photos/id/237/536/354", + "https://picsum.photos/id/237/536/354", + "https://picsum.photos/id/237/536/354", + "https://picsum.photos/id/237/536/354", + ], + [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", + ], + 2, + 4, + ), + # ( + # "meta-llama/Llama-3.2-11B-Vision-Instruct", + # True, + # 1, + # 32, + # 512, + # 560, + # ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg",], + # ["Can you describe the image in detail?", + # "What are the objects in the image?", + # "What is the main subject of the image?", + # "What colors are predominant in the image?"], + # 7, + # 4, + # ), +] + +intern_model_config = [ + ( + "OpenGVLab/InternVL2_5-1B", + True, + 1, + 384, + 512, + [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + ], + [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", + ], + 2, + 4, + ), + ( + "OpenGVLab/InternVL3_5-1B", + True, + 1, + 384, + 512, + [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + ], + [ + "Can you describe the image in detail?", + "What 
are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", + ], + 2, + 4, + ), +] + +molmo_model_config = [ + # Disabled in CI due to HF issues + # ( + # "allenai/Molmo-7B-D-0924", + # True, + # 1, + # 128, + # 4096, + # ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg",], + # ["Can you describe the image in detail?", + # "What are the objects in the image?", + # "What is the main subject of the image?", + # "What colors are predominant in the image?"], + # 2, + # 4, + # ), +] + + +def load_image_text_to_text_model(model_config): + model_path = hf_download( + repo_id=model_config._name_or_path, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + try: + model_hf = AutoModelForImageTextToText.from_pretrained( + model_path, + low_cpu_mem_usage=False, + config=model_config, + ) + except ValueError: + model_hf = AutoModelForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=model_config, + ) + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + +def set_num_layers(config, n_layer=1): + ## -1 indicates use all the layers of the model. + if n_layer == -1: + return config + elif hasattr(config, "model_type") and "mllama" in config.model_type: + config.text_config.num_hidden_layers = n_layer + config.text_config.cross_attention_layers = [ + x for x in config.text_config.cross_attention_layers if x < n_layer + ] + elif hasattr(config, "text_config"): + config.text_config.num_hidden_layers = n_layer + config.vision_config.num_hidden_layers = n_layer + elif hasattr(config, "llm_config"): + config.llm_config.num_hidden_layers = n_layer + config.vision_config.num_hidden_layers = n_layer + else: + config.num_hidden_layers = n_layer + return config + + +def check_image_text_to_text_pytorch_vs_ai100_continuous_batching( + model_name: str, + img_size: int, + image_urls: List[str], + queries: List[str], + prompt_len: int, + ctx_len: int, + max_gen_len: int = 20, + batch_size: int = 1, + n_layer: int = 1, + num_devices: int = 1, + full_batch_size: int = 4, + kv_offload: bool = True, +): + model_config = {"model_name": model_name} + model_config["img_size"] = img_size + config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) + config = set_num_layers(config, n_layer=n_layer) + model_hf, _ = load_image_text_to_text_model(config) + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + + n_layer = get_num_layers_vlm(config) + + images = [] + for img_url in image_urls: + image = Image.open(requests.get(img_url, stream=True).raw) + if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": + image = image.resize((1540, 1540)) + images.append(image) + + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": queries[0]}, + {"type": "image"}, + ], + }, + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + api_runner = 
ApiRunnerVlm( + batch_size, + processor, + config, + images[0], + conversation, + prompt, + prompt_len, + ctx_len, + max_gen_len, + n_layer, + ) + + # For same prompt + image_list = [images[0]] * full_batch_size + prompt_list = [queries[0]] * full_batch_size + + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, image_list, prompt_list) + + qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_config["model_name"], + kv_offload=kv_offload, + config=config, + continuous_batching=True, + ) + + qeff_model.export() + + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + qeff_model.compile( + img_size=model_config["img_size"], + num_cores=16, + num_devices=num_devices, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + batch_size=batch_size, + full_batch_size=full_batch_size, + mxfp6_matmul=False, + ) + + print("QPC Outputs (QAIC):") + exec_info = qeff_model.generate( + tokenizer=processor.tokenizer, + processor=processor, + images=[image_urls[0]] * full_batch_size, + prompts=prompt_list, + generation_len=max_gen_len, + ) + + qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) for Continuous Batching with same prompt:") + print(exec_info.generated_texts) + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output for same prompts" + ) + + # For different prompts + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries) + + print("QPC Outputs (QAIC):") + exec_info = qeff_model.generate( + tokenizer=processor.tokenizer, + processor=processor, + images=image_urls, + prompts=queries, + generation_len=max_gen_len, + ) + + qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) for Continuous Batching with different prompt:") + print(exec_info.generated_texts) + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output for different prompts" + ) + return + + +def check_molmo_image_text_to_text_pytorch_vs_ai100_continuous_batching( + model_name: str, + image_urls: List[str], + queries: List[str], + prompt_len: int, + ctx_len: int, + max_gen_len: int = 20, + batch_size: int = 1, + n_layer: int = 1, + num_devices: int = 1, + full_batch_size: int = 4, + kv_offload: bool = True, +): + model_config = {"model_name": model_name} + + config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) + config._attn_implementation = "eager" + config = set_num_layers(config, n_layer=n_layer) + model_hf, _ = load_image_text_to_text_model(config) + n_layer = (n_layer, n_layer) + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + images = [] + for img_url in image_urls: + img = requests.get(img_url, stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + image = image.resize((536, 354)) + images.append(image) + + api_runner = ApiRunnerMolmo( + batch_size, + processor, + config, + images[0], + queries[0], + prompt_len, + ctx_len, + max_gen_len, + n_layer, + ) + + generation_config = GenerationConfig(max_new_tokens=NEW_GENERATION_TOKENS, stop_strings="<|endoftext|>") + + # For same prompt + image_list = [images[0]] * full_batch_size + prompt_list = [queries[0]] * full_batch_size + 
pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, image_list, prompt_list, generation_config) + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + attn_implementation="eager", + kv_offload=kv_offload, + config=config, + continuous_batching=True, + ) + + qeff_model.export() + + qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_devices=4, + batch_size=1, + full_batch_size=full_batch_size, + mxfp6_matmul=False, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) + + exec_info = qeff_model.generate( + tokenizer=tokenizer, + processor=processor, + images=[image_urls[0]] * full_batch_size, + prompts=prompt_list, + generation_len=max_gen_len, + ) + + qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) for Continuous Batching with same prompt:") + print(exec_info.generated_texts) + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output for same prompts" + ) + + # For different prompts + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries, generation_config) + exec_info = qeff_model.generate( + tokenizer=tokenizer, + processor=processor, + images=image_urls, + prompts=queries, + generation_len=max_gen_len, + ) + + qpc_tokens = exec_info.generated_ids[:, :max_gen_len] + print("QPC Outputs (QAIC) for Continuous Batching with different prompt:") + print(exec_info.generated_texts) + + for i in range(full_batch_size): + assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( + f"Tokens don't match for prompt {i} between HF and QPC output for different prompts" + ) + return + + +def check_intern_image_text_to_text_pytorch_vs_ai100_continuous_batching( + model_name: str, + image_urls: str, + queries: str, + prompt_len: int, + ctx_len: int, + max_gen_len: int = 20, + batch_size: int = 1, + n_layer: int = 1, + kv_offload: bool = True, + num_devices: int = 1, + full_batch_size: int = 4, +): + model_config = {"model_name": model_name} + + config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) + config._attn_implementation = "eager" + config = set_num_layers(config, n_layer=n_layer) + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, + ) + n_layer = get_num_layers_vlm(config) + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) + processor = InternProcessor(model_hf, tokenizer) + + generation_config = dict(max_new_tokens=max_gen_len, do_sample=False) + generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) + + images = [] + for img_url in image_urls: + img = requests.get(img_url, stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + image = image.resize((448, 448)) + images.append(image) + + api_runner = ApiRunnerInternVL( + batch_size, + processor, + config, + images[0], + queries[0], + prompt_len, + ctx_len, + max_gen_len, + n_layer, + ) + + # For same prompt + image_list = [images[0]] * full_batch_size + prompt_list = [queries[0]] * full_batch_size + + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, image_list, prompt_list) + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + attn_implementation="eager", + kv_offload=True, + config=config, + 
continuous_batching=True,
+    )
+
+    qeff_model.export()
+
+    qeff_model.compile(
+        num_patches=1,
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_devices=4,
+        batch_size=1,
+        full_batch_size=full_batch_size,
+        mxfp6_matmul=False,
+    )
+
+    exec_info = qeff_model.generate(
+        tokenizer=tokenizer,
+        processor=processor,
+        images=[image_urls[0]] * full_batch_size,
+        prompts=prompt_list,
+        generation_len=max_gen_len,
+        image_height=448,
+        image_width=448,
+    )
+
+    qpc_tokens = exec_info.generated_ids[:, :max_gen_len]
+    print("QPC Outputs (QAIC) for Continuous Batching for same prompts:")
+    print(exec_info.generated_texts)
+
+    for i in range(full_batch_size):
+        assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), (
+            f"Tokens don't match for prompt {i} between HF and QPC output for same prompts"
+        )
+
+    # For different prompts
+    pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries)
+
+    exec_info = qeff_model.generate(
+        tokenizer=tokenizer,
+        processor=processor,
+        images=image_urls,
+        prompts=queries,
+        generation_len=max_gen_len,
+        image_height=448,
+        image_width=448,
+    )
+
+    qpc_tokens = exec_info.generated_ids[:, :max_gen_len]
+    print("QPC Outputs (QAIC) for Continuous Batching for different prompts:")
+    print(exec_info.generated_texts)
+
+    for i in range(full_batch_size):
+        assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), (
+            f"Tokens don't match for prompt {i} between HF and QPC output for different prompts"
+        )
+    return
+
+
+@pytest.mark.on_qaic
+@pytest.mark.multimodal
+@pytest.mark.parametrize(
+    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_urls, queries, n_layer, full_batch_size",
+    test_models_config,
+)
+def test_image_text_to_text_pytorch_vs_ai100_continuous_batching(
+    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_urls, queries, n_layer, full_batch_size
+):
+    """
+    Test function to validate the original PyTorch HF model output against the Cloud AI 100 (QPC) output with continuous batching.
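+
+    The model is exported and compiled with continuous batching enabled, and the generated QPC tokens
+    are compared against the HF PyTorch reference for both identical and distinct image/prompt pairs
+    across the full batch.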
+ ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + check_image_text_to_text_pytorch_vs_ai100_continuous_batching( + model_name=model_name, + prompt_len=prompt_len, + ctx_len=ctx_len, + max_gen_len=NEW_GENERATION_TOKENS, + img_size=img_size, + image_urls=img_urls, + queries=queries, + n_layer=n_layer, + batch_size=batch_size, + kv_offload=kv_offload, + full_batch_size=full_batch_size, + ) + + +@pytest.mark.on_qaic +@pytest.mark.multimodal +@pytest.mark.parametrize( + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_urls, queries, n_layer, full_batch_size", + molmo_model_config, +) +def test_image_text_to_text_molmo_pytorch_vs_ai100_continuous_batching( + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_urls, queries, n_layer, full_batch_size +): + check_molmo_image_text_to_text_pytorch_vs_ai100_continuous_batching( + model_name=model_name, + prompt_len=prompt_len, + ctx_len=ctx_len, + max_gen_len=NEW_GENERATION_TOKENS, + image_urls=img_urls, + queries=queries, + n_layer=n_layer, + batch_size=batch_size, + kv_offload=kv_offload, + full_batch_size=full_batch_size, + ) + + +@pytest.mark.on_qaic +@pytest.mark.multimodal +@pytest.mark.parametrize( + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, queries, n_layer, full_batch_size", + intern_model_config, +) +def test_image_text_to_text_intern_pytorch_vs_ai100_continuous_batching( + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, queries, n_layer, full_batch_size +): + check_intern_image_text_to_text_pytorch_vs_ai100_continuous_batching( + model_name=model_name, + prompt_len=prompt_len, + ctx_len=ctx_len, + max_gen_len=NEW_GENERATION_TOKENS, + image_urls=img_url, + queries=queries, + n_layer=n_layer, + batch_size=batch_size, + kv_offload=kv_offload, + full_batch_size=full_batch_size, + ) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 387aa0a3a..e6a145195 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -341,59 +341,6 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - - # testing for CB models - if not kv_offload: # CB not yet enabled for Single QPC - return - full_batch_size = 4 - images = [image] * full_batch_size - queries = [query] * full_batch_size - - streamer = TextStreamer(processor.tokenizer) - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries) - - qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - continuous_batching=True, - ) - - qeff_model.export() - - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - - qeff_model.compile( - img_size=model_config["img_size"], - num_cores=16, - num_devices=num_devices, - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - batch_size=batch_size, - full_batch_size=full_batch_size, - mxfp6_matmul=False, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) - - print("QPC Outputs (QAIC):") - exec_info = qeff_model.generate( - tokenizer=processor.tokenizer, - processor=processor, - 
images=[img_url] * full_batch_size, - prompts=queries, - generation_len=max_gen_len, - ) - - qpc_tokens = exec_info.generated_ids[:, :max_gen_len] - print("QPC Outputs (QAIC) for Continuous Batching:") - print(exec_info.generated_texts) - - for i in range(full_batch_size): - assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( - f"Tokens don't match for prompt {i} between HF and QPC output" - ) return @@ -420,7 +367,6 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( n_layer = (n_layer, n_layer) processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) img = requests.get(img_url, stream=True) image = Image.open(BytesIO(img.content)).convert("RGB") image = image.resize((536, 354)) @@ -466,56 +412,6 @@ def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - - # testing for CB models - if not kv_offload: # CB not yet enabled for Single QPC - return - full_batch_size = 4 - images = [image] * full_batch_size - queries = [query] * full_batch_size - - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, images, queries) - - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_name, - trust_remote_code=True, - attn_implementation="eager", - kv_offload=True, - config=config, - continuous_batching=True, - ) - - qeff_model.export() - - qeff_model.compile( - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - num_devices=4, - batch_size=1, - full_batch_size=4, - mxfp6_matmul=False, - mxint8_kv_cache=True, - aic_enable_depth_first=True, - mos=1, - ) - - exec_info = qeff_model.generate( - tokenizer=tokenizer, - processor=processor, - images=[img_url] * full_batch_size, - prompts=queries, - generation_len=max_gen_len, - ) - - qpc_tokens = exec_info.generated_ids[:, :max_gen_len] - print("QPC Outputs (QAIC) for Continuous Batching:") - print(exec_info.generated_texts) - - for i in range(full_batch_size): - assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( - f"Tokens don't match for prompt {i} between HF and QPC output" - ) - return @@ -625,64 +521,13 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - - # testing for CB models - if not kv_offload: # CB not yet enabled for Single QPC - return - - full_batch_size = 4 - queries = [query] * full_batch_size - - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch_CB(model_hf, [image] * full_batch_size, queries) - - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_name, - trust_remote_code=True, - attn_implementation="eager", - kv_offload=True, - config=config, - continuous_batching=True, - ) - - qeff_model.export() - - qeff_model.compile( - num_patches=1, - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - num_devices=4, - batch_size=1, - full_batch_size=full_batch_size, - mxfp6_matmul=False, - ) - - exec_info = qeff_model.generate( - tokenizer=tokenizer, - processor=processor, - images=img_url * full_batch_size, - prompts=queries, - 
generation_len=max_gen_len, - image_height=448, - image_width=448, - ) - - qpc_tokens = exec_info.generated_ids[:, :max_gen_len] - print("QPC Outputs (QAIC) for Continuous Batching:") - print(exec_info.generated_texts) - - for i in range(full_batch_size): - assert (pytorch_hf_tokens[i] == qpc_tokens[i]).all(), ( - f"Tokens don't match for prompt {i} between HF and QPC output" - ) - return @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", - test_models_config, + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config ) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer @@ -709,7 +554,6 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal -@pytest.mark.skip(reason="Issues with QNN") @pytest.mark.parametrize( "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config ) @@ -746,8 +590,7 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", - molmo_model_config, + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config ) def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100( model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer @@ -789,7 +632,6 @@ def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal -@pytest.mark.skip(reason="Issues with QNN") @pytest.mark.parametrize( "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config ) From 7fb8a1dbd14d76b04fab418d48f45d0b3b31591d Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 4 Dec 2025 18:40:10 +0000 Subject: [PATCH 15/16] Added llava_next CB support Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 10 ++- .../models/llava_next/modeling_llava_next.py | 4 +- .../granite_vision/continuous_batching.py | 67 +++++++++++++++++++ .../test_continuous_batching.py | 11 ++- .../test_image_text_to_text_models.py | 0 5 files changed, 87 insertions(+), 5 deletions(-) create mode 100644 examples/image_text_to_text/models/granite_vision/continuous_batching.py rename tests/transformers/models/{ => image_text_to_text}/test_image_text_to_text_models.py (100%) diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index be8896917..5b4b87b40 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -225,8 +225,14 @@ def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) - else: image = Image.open(image_url) - if "mistral3" in self._qeff_model.model.config.model_type: - image = image.resize((constants.MISTRAL3_IMAGE_HEIGHT, constants.MISTRAL3_IMAGE_WIDTH)) + if self._image_height and self._image_width: + image = image.resize((self._image_width, self._image_height)) + else: + logger.warning("Height and Width not specified. 
Using default image size.") + if "mistral3" in self._qeff_model.model.config.model_type: + image = image.resize((constants.MISTRAL3_IMAGE_HEIGHT, constants.MISTRAL3_IMAGE_WIDTH)) + if "llava_next" in self._qeff_model.model.config.model_type: + image = image.resize((constants.GRANITEVISION_IMG_SIZE_HEIGHT, constants.GRANITEVISION_IMG_SIZE_WIDTH)) # Prepare conversation format conversation = [ diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 9cac61264..878d04a45 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -366,7 +366,7 @@ def get_specializations( lang.append(lang_decode) else: lang_prefill = { - "batch_size": batch_size, + "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, "image_size_height": image_size_height, @@ -385,7 +385,7 @@ def get_specializations( lang_prefill["full_batch_exec_size"] = full_batch_size lang_decode = { - "batch_size": batch_size, + "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, "image_size_height": image_size_height, diff --git a/examples/image_text_to_text/models/granite_vision/continuous_batching.py b/examples/image_text_to_text/models/granite_vision/continuous_batching.py new file mode 100644 index 000000000..22c4270bc --- /dev/null +++ b/examples/image_text_to_text/models/granite_vision/continuous_batching.py @@ -0,0 +1,67 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import transformers +from transformers import AutoConfig, AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + +## For AWQ model update pytorch version to 2.8.* +model_id = "ibm-granite/granite-vision-3.2-2b" +config = AutoConfig.from_pretrained(model_id) +config.text_config.num_hidden_layers = 2 + +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, +) +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +batch_size = 1 +## Vision + Text ## +qeff_model.compile( + batch_size=batch_size, + full_batch_size=4, + prefill_seq_len=5500, + ctx_len=6000, + num_cores=16, + num_devices=4, + img_size=384, + mxfp6_matmul=False, +) + +image_urls = [ + "http://images.cocodataset.org/val2017/000000039769.jpg", + "http://images.cocodataset.org/val2017/000000039769.jpg", + "http://images.cocodataset.org/val2017/000000039769.jpg", + "http://images.cocodataset.org/val2017/000000039769.jpg", +] + +prompts = [ + "Describe the image", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", +] + +streamer = TextStreamer(tokenizer) +output = qeff_model.generate( + tokenizer=tokenizer, + prompts=prompts, + processor=processor, + images=image_urls, + generation_len=10, + image_height=1610, + image_width=1109, +) +print(output.generated_ids) +print(tokenizer.batch_decode(output.generated_ids)) +print(output.generated_texts) diff --git 
a/tests/transformers/models/image_text_to_text/test_continuous_batching.py b/tests/transformers/models/image_text_to_text/test_continuous_batching.py index 6a88601e7..341d09094 100644 --- a/tests/transformers/models/image_text_to_text/test_continuous_batching.py +++ b/tests/transformers/models/image_text_to_text/test_continuous_batching.py @@ -304,11 +304,16 @@ def check_image_text_to_text_pytorch_vs_ai100_continuous_batching( n_layer = get_num_layers_vlm(config) + image_height=None + image_width=None + images = [] for img_url in image_urls: image = Image.open(requests.get(img_url, stream=True).raw) if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": - image = image.resize((1540, 1540)) + image_height = 1540 + image_width = 1540 + image = image.resize((image_height, image_width)) images.append(image) conversation = [ @@ -370,6 +375,8 @@ def check_image_text_to_text_pytorch_vs_ai100_continuous_batching( images=[image_urls[0]] * full_batch_size, prompts=prompt_list, generation_len=max_gen_len, + image_height=image_height, + image_width=image_width, ) qpc_tokens = exec_info.generated_ids[:, :max_gen_len] @@ -391,6 +398,8 @@ def check_image_text_to_text_pytorch_vs_ai100_continuous_batching( images=image_urls, prompts=queries, generation_len=max_gen_len, + image_height=image_height, + image_width=image_width, ) qpc_tokens = exec_info.generated_ids[:, :max_gen_len] diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py similarity index 100% rename from tests/transformers/models/test_image_text_to_text_models.py rename to tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py From 6c56af2b3b9a5ff23a12971cfae4efef9004ceb1 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 4 Dec 2025 18:41:35 +0000 Subject: [PATCH 16/16] Added llava_next CB support Signed-off-by: Asmita Goswami --- QEfficient/generation/embedding_handler.py | 4 +++- .../models/image_text_to_text/test_continuous_batching.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/QEfficient/generation/embedding_handler.py b/QEfficient/generation/embedding_handler.py index 5b4b87b40..e07b5dd04 100644 --- a/QEfficient/generation/embedding_handler.py +++ b/QEfficient/generation/embedding_handler.py @@ -232,7 +232,9 @@ def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) - if "mistral3" in self._qeff_model.model.config.model_type: image = image.resize((constants.MISTRAL3_IMAGE_HEIGHT, constants.MISTRAL3_IMAGE_WIDTH)) if "llava_next" in self._qeff_model.model.config.model_type: - image = image.resize((constants.GRANITEVISION_IMG_SIZE_HEIGHT, constants.GRANITEVISION_IMG_SIZE_WIDTH)) + image = image.resize( + (constants.GRANITEVISION_IMG_SIZE_HEIGHT, constants.GRANITEVISION_IMG_SIZE_WIDTH) + ) # Prepare conversation format conversation = [ diff --git a/tests/transformers/models/image_text_to_text/test_continuous_batching.py b/tests/transformers/models/image_text_to_text/test_continuous_batching.py index 341d09094..2f33b7ee8 100644 --- a/tests/transformers/models/image_text_to_text/test_continuous_batching.py +++ b/tests/transformers/models/image_text_to_text/test_continuous_batching.py @@ -304,8 +304,8 @@ def check_image_text_to_text_pytorch_vs_ai100_continuous_batching( n_layer = get_num_layers_vlm(config) - image_height=None - image_width=None + image_height = None + image_width = None images = [] for img_url in image_urls: