diff --git a/tests/conftest.py b/tests/conftest.py index ba0f341fe..051701036 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,46 +5,11 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil -import pytest -from transformers import AutoConfig - from QEfficient.utils.constants import QEFF_MODELS_DIR from QEfficient.utils.logging_utils import logger -from QEfficient.utils.test_utils import ModelConfig - - -def get_custom_model_config_dict(configs): - """ - Converts a list of custom model configuration dictionaries into a dictionary - mapping model names to their corresponding AutoConfig objects. - - Args: - configs (List[Dict]): A list of dictionaries, each containing model configuration parameters. - - Returns: - Dict[str, AutoConfig]: A dictionary where keys are model names and values are AutoConfig objects. - """ - config_dict = {} - for config in configs: - model_name = config["model_name"] - config_dict[model_name] = AutoConfig.from_pretrained( - model_name, - trust_remote_code=config["model_name"] in ModelConfig.EXTERNAL_MODELS, - **config.get("additional_params", {}), - ) - return config_dict - - -# Pytest fixture to load custom model configs from a JSON file -@pytest.fixture(scope="session") -def custom_causal_model_config_dict(): - with open("tests/transformers/models/custom_tiny_model_configs.json", "r") as f: - custom_model_configs_data = json.load(f) - return get_custom_model_config_dict(custom_model_configs_data) def qeff_models_clean_up(): diff --git a/tests/transformers/models/custom_tiny_model_configs.json b/tests/transformers/models/custom_tiny_model_configs.json deleted file mode 100644 index 03a9541fd..000000000 --- a/tests/transformers/models/custom_tiny_model_configs.json +++ /dev/null @@ -1,348 +0,0 @@ -[ - { - "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "gpt2", - "model_type": "gpt2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50257, - "num_key_value_heads": 1 - } - }, - { - "model_name": "allenai/OLMo-2-0425-1B", - "model_type": "olmo2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 100352, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Salesforce/codegen-350M-mono", - "model_type": "codegen", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 4, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 51200, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - - { - "model_name": "microsoft/Phi-3-mini-4k-instruct", - "model_type": "phi3", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32064, - "num_key_value_heads": 1 - } - }, - { - "model_name": "tiiuae/falcon-7b", - "model_type": "falcon", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, 
- "vocab_size": 65024, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", - "model_type": "qwen3_moe", - "additional_params": { - "hidden_size": 256, - "intermediate_size": 256, - "max_position_embeddings": 128, - "max_window_layers": 48, - "moe_intermediate_size": 768, - "num_attention_heads": 2, - "num_experts": 4, - "num_experts_per_tok": 2, - "num_hidden_layers": 1, - "num_key_value_heads": 1, - "vocab_size": 151936 - } - }, - { - "model_name": "Qwen/Qwen2-0.5B", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936, - "num_key_value_heads": 1 - } - }, - { - "model_name": "bigcode/starcoder2-3b", - "model_type": "starcoder2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Felladrin/Minueza-32M-Base", - "model_type": "mistral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32002, - "num_key_value_heads": 1 - } - }, - { - "model_name": "wtang06/mpt-125m-c4", - "model_type": "mpt", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50368 - } - }, - { - "model_name": "hakurei/gpt-j-random-tinier", - "model_type": "gptj", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50400, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - { - "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "model_type": "mixtral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "meta-llama/Llama-3.2-1B", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 32.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } - }, - { - "model_name": "unsloth/gemma-2b", - "model_type": "gemma", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "unsloth/gemma-2-2b", - "model_type": "gemma2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32003 - } - }, - { 
- "model_name": "TheBloke/Llama-2-7B-GPTQ", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000 - } - }, - { - "model_name": "ibm-granite/granite-20b-code-base", - "model_type": "gpt_bigcode", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1, - "activation_function": "gelu", - "architectures": [ - "GPTBigCodeForCausalLM" - ] - } - }, - { - "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256 - } - }, - { - "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936 - } - }, - { - "model_name": "ibm-granite/granite-3.1-2b-instruct", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "ibm-granite/granite-guardian-3.1-2b", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "hpcai-tech/grok-1", - "model_type": null, - "additional_params":{ - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 131072, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", - "model_type": null, - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 256, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_layers": 1, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 8.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } - } -] diff --git a/tests/transformers/models/qnn_config.json b/tests/transformers/models/qnn_config.json deleted file mode 100644 index b1f249e2b..000000000 --- a/tests/transformers/models/qnn_config.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "SKIP_QNN_CONVERTER_STEP":false, - "context_binary_generator_args_extension":"--log_level debug", - "converter_args_extension":"--onnx_defer_loading", - "qnn_compilation_backend":{ - "compiler_enable_depth_first":true, - "compiler_printDDRStats":false, - "compiler_printPerfMetrics":false - } -} \ No newline at end of file diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index da30c76b0..75f9fac08 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -5,6 +5,7 @@ # # 
----------------------------------------------------------------------------- +import json import os from typing import List, Optional @@ -23,9 +24,11 @@ from QEfficient.utils.constants import WAV2VEC2_MAX_SEQ_LEN, QnnConstants from QEfficient.utils.device_utils import get_available_device_id -test_models = [ - "facebook/wav2vec2-base-960h", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + test_models = config_data["audio_embedding_models"] def load_ctc_model(model_config): diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 321a466ab..b9d573775 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import copy +import json import os from typing import Optional @@ -24,48 +25,40 @@ from QEfficient.utils.run_utils import ApiRunner from QEfficient.utils.test_utils import ModelConfig -test_models_causal = [ - "openai/gpt-oss-20b", - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "gpt2", - "Salesforce/codegen-350M-mono", - "microsoft/Phi-3-mini-4k-instruct", - "tiiuae/falcon-7b", - "Qwen/Qwen2-0.5B", - "Qwen/Qwen3-0.6B", - "bigcode/starcoder2-3b", - "Qwen/Qwen3-30B-A3B-Instruct-2507", - "Felladrin/Minueza-32M-Base", - "wtang06/mpt-125m-c4", - "hakurei/gpt-j-random-tinier", - "mistralai/Mixtral-8x7B-Instruct-v0.1", - "meta-llama/Llama-3.2-1B", - "unsloth/gemma-2b", - "unsloth/gemma-2-2b", - "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", # AWQ model - "TheBloke/Llama-2-7B-GPTQ", # GPTQ model - "ibm-granite/granite-20b-code-base", - # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic", # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations - "neuralmagic/Llama-3.2-3B-Instruct-FP8", # float quantized compressed-tensor per tensor both weight and activations - "neuralmagic/Qwen2-0.5B-Instruct-FP8", # fp8 quant method, static, with lm head ignored - "ibm-granite/granite-3.1-2b-instruct", - "ibm-granite/granite-guardian-3.1-2b", - "hpcai-tech/grok-1", - "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", - "allenai/OLMo-2-0425-1B", -] - -test_models_qnn = [ - "mistralai/Mixtral-8x7B-Instruct-v0.1", - "meta-llama/Llama-3.2-1B", - "unsloth/gemma-2b", - "ibm-granite/granite-guardian-3.1-2b", -] - -test_models_spd = [ - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "Qwen/Qwen2-0.5B", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + causal_lm_models = config_data["causal_lm_models"] + spd_models = config_data["spd_causal_lm_models"] + qnn_models = config_data["qnn_causal_lm_models"] + + +# Create a list of model names for parameterization +test_models_causal = [model["model_name"] for model in causal_lm_models] +test_models_spd = [model["model_name"] for model in spd_models] +test_models_qnn = [model["model_name"] for model in qnn_models] + +# Create a dictionary mapping model names to their configs +model_config_dict = {model["model_name"]: model for model in causal_lm_models} + + +def get_hf_config_from_custom_config(model_name): + """ + Function to get HF config from custom config file + -------- + :model_name: str + + :return config + """ + custom_config = model_config_dict[model_name] + + hf_config = AutoConfig.from_pretrained( + model_name, + 
trust_remote_code=model_name in ModelConfig.EXTERNAL_MODELS, + **custom_config.get("additional_params", {}), + ) + return hf_config def get_custom_n_layers(model_name): @@ -102,7 +95,6 @@ def load_causal_lm_model(model_name, n_layer=1, config=None): ) if config is None: # If custom config is not provided, load the model config from Hugging Face if n_layer is not None: - # If n_layer is specified, load the model with that many layers model_hf = AutoModelForCausalLM.from_pretrained( model_path, use_cache=True, @@ -146,7 +138,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, config: Optional[AutoConfig] = None, - pytorch_hf_tokens: Optional[list] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. @@ -174,6 +165,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.PROMPT_LEN, Constants.CTX_LEN, ) + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) @@ -183,7 +175,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ) pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - if model_name not in ModelConfig.SWIFTKV_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( "Tokens don't match for HF PyTorch model output and KV PyTorch model output" ) @@ -193,8 +185,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output." - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -234,14 +224,10 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.CTX_LEN, full_batch_size, ) - if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = [pytorch_hf_tokens for _ in range(full_batch_size)] - qeff_model = QEFFAutoModelForCausalLM( model_hf, continuous_batching=True, is_tlm=is_tlm, pretrained_model_name_or_path=model_name ) @@ -263,8 +249,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qnn_config=qnn_config, ) exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - - if model_name in ModelConfig.SWIFTKV_MODELS: + if model_name in ModelConfig.SWIFTKV_MODELS or model_name in ModelConfig.EXTERNAL_MODELS: assert all( [ all(ort_token[:24] == cloud_token[:24]) @@ -317,25 +302,19 @@ def test_causal_lm_export_with_deprecated_api(model_name): @pytest.mark.on_qaic @pytest.mark.regular @pytest.mark.parametrize("model_name", test_models_causal) -def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, custom_causal_model_config_dict): +def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the dummy PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) - - # Using fixed reference tokens for external models for specific test cases. - # These tokens are hardcoded, therefore will not match if the model config changes. - pytorch_hf_tokens = None - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = ModelConfig.EXTERNAL_MODELS[model_name]["pytorch_hf_tokens_custom_case"] + hf_config = get_hf_config_from_custom_config(model_name) if model_name in ModelConfig.QUANTIZED_MODELS: n_layer = get_custom_n_layers(model_name) - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, pytorch_hf_tokens=pytorch_hf_tokens) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer) else: - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=config, pytorch_hf_tokens=pytorch_hf_tokens) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=hf_config) @pytest.mark.nightly @@ -349,34 +328,26 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ n_layer = get_custom_n_layers(model_name) - # Using fixed reference tokens for external models for specific test cases. - # These tokens are hardcoded, therefore will not match if the model config changes. - pytorch_hf_tokens = None - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = ModelConfig.EXTERNAL_MODELS[model_name]["pytorch_hf_tokens_normal_case"] - - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, n_layer=n_layer, pytorch_hf_tokens=pytorch_hf_tokens - ) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) @pytest.mark.on_qaic @pytest.mark.regular @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_qnn) -def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, custom_causal_model_config_dict): +def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): """ QNN Setup Test function to validate the dummy PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) + hf_config = get_hf_config_from_custom_config(model_name) qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, enable_qnn=True, qnn_config=qnn_config_json_path, config=config + model_name, enable_qnn=True, qnn_config=qnn_config_json_path, config=hf_config ) @@ -404,18 +375,18 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_spd) -def test_custom_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, custom_causal_model_config_dict): +def test_custom_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the dummy PyTorch model for speculative decoding, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) + hf_config = get_hf_config_from_custom_config(model_name) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS, - config=config, + config=hf_config, ) diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 2d110faeb..e9a636d71 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os from typing import Optional @@ -19,10 +20,11 @@ from QEfficient.utils._utils import create_json from QEfficient.utils.constants import Constants, QnnConstants -embed_test_models = [ - {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"}, - {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"}, -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + embed_test_models = config_data["embedding_models"] def check_embed_pytorch_vs_ort_vs_ai100( diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index e6a145195..2d5500d08 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -5,6 +5,7 @@ # # ---------------------------------------------------------------------------- +import json import os from io import BytesIO from typing import List, Optional @@ -27,183 +28,19 @@ from QEfficient.utils import hf_download from QEfficient.utils._utils import create_json, get_num_layers_vlm from QEfficient.utils.constants import QnnConstants -from QEfficient.utils.device_utils import get_available_device_id from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerMolmo, ApiRunnerVlm from QEfficient.utils.test_utils import InternProcessor NEW_GENERATION_TOKENS = 10 -test_models_config = [ - # CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED - # ( - # model_name, - # kv_offload, - # batch_size, - # prompt_len, - # ctx_len, - # img_size, - # img_url", - # text_prompt, - # number of layers of the model, - # ), - ( - "llava-hf/llava-1.5-7b-hf", - True, - 1, - 784, - 1024, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - 1, - ), - ( - "llava-hf/llava-1.5-7b-hf", - False, - 1, - 784, - 1024, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - 1, - ), - # Disabled in CI due to performance issues - # ( - # "meta-llama/Llama-4-Scout-17B-16E-Instruct", - # True, - # 1, - # 128, - # 3072, - # 336, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - # "What does the label 15 represent? 
(1) lava (2) core (3) tunnel (4) ash cloud", - # 4, - # ), - # ( - # "meta-llama/Llama-4-Scout-17B-16E-Instruct", - # False, - # 1, - # 128, - # 3072, - # 336, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - # 4, - # ), - ( - "google/gemma-3-4b-it", - True, - 1, - 128, - 3072, - 896, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "google/gemma-3-4b-it", - False, - 1, - 128, - 3072, - 896, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - True, - 1, - 128, - 4096, - 1540, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - False, - 1, - 128, - 4096, - 1540, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "Qwen/Qwen2.5-VL-3B-Instruct", - True, - 1, - 128, - 4096, - 1540, - "https://picsum.photos/id/237/536/354", - "Can you describe the image in detail.", - 1, - ), - # ( - # "meta-llama/Llama-3.2-11B-Vision-Instruct", - # True, - # 1, - # 32, - # 512, - # 560, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", - # "Explain this image", - # 7, - # ), -] - -intern_model_config = [ - ( - "OpenGVLab/InternVL2_5-1B", - True, - 1, - 384, - 512, - "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - "Please describe the image in detail.", - 2, - ), - ( - "OpenGVLab/InternVL3_5-1B", - True, - 1, - 384, - 512, - "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - "Please describe the image in detail.", - 2, - ), - # ( - # "OpenGVLab/InternVL2_5-1B", - # False, - # 1, - # 384, - # 512, - # "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - # "Please describe the image in detail.", - # 2, - # ), # commented becuase QNN Convertor is not supported for this model yet. 
-] - -molmo_model_config = [ - # Disabled in CI due to HF issues - # ( - # "allenai/Molmo-7B-D-0924", - # True, - # 1, - # 128, - # 4096, - # "https://picsum.photos/id/237/536/354", - # "Can you describe the image in detail.", - # 2, - # ), -] + +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + multimodal_models = config_data["multimodal_models"] + +test_mm_models = [model_config["model_name"] for model_config in multimodal_models] +model_config_dict = {model["model_name"]: model for model in multimodal_models} def load_image_text_to_text_model(model_config): @@ -229,6 +66,28 @@ def load_image_text_to_text_model(model_config): return model_hf, params +def load_image_text_to_text_model_from_config(model_name, config): + torch.manual_seed(42) + model_path = hf_download( + repo_id=model_name, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + try: + model_hf = AutoModelForImageTextToText.from_config( + config, + ) + except ValueError: + model_hf = AutoModelForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, + ) + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + def set_num_layers(config, n_layer=1): ## -1 indicates use all the layers of the model. if n_layer == -1: @@ -251,7 +110,6 @@ def set_num_layers(config, n_layer=1): def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, - img_size: int, img_url: str, query: str, prompt_len: int, @@ -263,260 +121,210 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( num_devices: int = 1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + config: Optional[AutoConfig] = None, + img_size: Optional[int] = None, ): - model_config = {"model_name": model_name} - model_config["img_size"] = img_size - config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True) - config = set_num_layers(config, n_layer=n_layer) - model_hf, _ = load_image_text_to_text_model(config) - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) - - n_layer = get_num_layers_vlm(config) - image = Image.open(requests.get(img_url, stream=True).raw) - if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": - image = image.resize((1540, 1540)) - - conversation = [ - { - "role": "user", - "content": [ - {"type": "text", "text": query}, - {"type": "image"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - api_runner = ApiRunnerVlm( - batch_size, - processor, - config, - image, - conversation, - prompt, - prompt_len, - ctx_len, - max_gen_len, - n_layer, - ) + """ + Unified function to test PyTorch model, PyTorch KV model, ONNX model, and Cloud AI 100 model. + Handles standard VLM models, InternVL models, and Molmo models. 
+ + Args: + model_name: Hugging Face model identifier + img_url: URL to image for testing + query: Text query for the model + prompt_len: Prompt sequence length + ctx_len: Context length + max_gen_len: Maximum generation length + batch_size: Batch size for processing + n_layer: Number of layers to use + kv_offload: Whether to use KV offloading + num_devices: Number of devices to use + enable_qnn: Enable QNN compilation + qnn_config: Path to QNN config file + config: Pre-configured model config (optional) + img_size: Image size for standard models (optional) + """ - inputs = processor(images=image, text=prompt, return_tensors="pt") - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - streamer = TextStreamer(processor.tokenizer) - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs) - qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - ) + is_intern_model = model_name == "OpenGVLab/InternVL2_5-1B" or model_name == "OpenGVLab/InternVL3_5-1B" + is_molmo_model = model_name == "allenai/Molmo-7B-D-0924" - # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model) - # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), ( - # "Tokens don't match for pytorch HF output and pytorch KV output" - # ) + # ========== Config and Model Loading ========== + if config is None: + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, padding=not is_molmo_model) + config._attn_implementation = "eager" if (is_intern_model or is_molmo_model) else None + config = set_num_layers(config, n_layer=n_layer) - qeff_model.export() - # onnx_model_path = qeff_model.export() - # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) - # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - qeff_model.compile( - img_size=model_config["img_size"], - num_devices=num_devices, - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - mxfp6=False, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt") - if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": - inputs = qeff_model.model.prepare_inputs_for_generation( - inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + if is_intern_model: + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, ) - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - print("QPC Outputs (QAIC):") - output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - qpc_tokens = output.generated_ids[:, :-1] - assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - return - - -def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name: str, - img_url: str, - query: str, - prompt_len: int, - ctx_len: int, - max_gen_len: int = 20, - batch_size: int = 1, - n_layer: int = 1, - kv_offload: bool = False, - num_devices: int = 1, - enable_qnn: Optional[bool] = False, - qnn_config: Optional[str] = None, -): - model_config = {"model_name": model_name} - - config = 
AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) - config._attn_implementation = "eager" - config = set_num_layers(config, n_layer=n_layer) - model_hf, _ = load_image_text_to_text_model(config) - n_layer = (n_layer, n_layer) - - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) - img = requests.get(img_url, stream=True) - image = Image.open(BytesIO(img.content)).convert("RGB") - image = image.resize((536, 354)) - - api_runner = ApiRunnerMolmo( - batch_size, - processor, - config, - image, - query, - prompt_len, - ctx_len, - max_gen_len, - n_layer, - ) - - inputs = processor.process(images=[image], text=query) - inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} - - generation_config = GenerationConfig(max_new_tokens=NEW_GENERATION_TOKENS, stop_strings="<|endoftext|>") - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config) - - batch_size, prompt_len = inputs["input_ids"].shape - inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) - valid = inputs["image_input_idx"] > 0 - valid = valid.reshape(1, -1) - inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0) - inputs["pixel_values"] = inputs.pop("images") - - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - ) - - streamer = TextStreamer(processor.tokenizer) - qeff_model.export() - - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - qeff_model.compile(num_devices=num_devices, prefill_seq_len=prompt_len, ctx_len=ctx_len, mxfp6=False) - print("QPC Outputs (QAIC):") - output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - qpc_tokens = output.generated_ids[:, :-1] - assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - return + n_layer = get_num_layers_vlm(config) + elif is_molmo_model: + model_hf, _ = load_image_text_to_text_model(config) + n_layer = (n_layer, n_layer) + else: + model_hf, _ = load_image_text_to_text_model(config) + n_layer = get_num_layers_vlm(config) -def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name: str, - img_url: str, - query: str, - prompt_len: int, - ctx_len: int, - max_gen_len: int = 20, - batch_size: int = 1, - n_layer: int = 1, - kv_offload: bool = False, - num_devices: int = 1, - enable_qnn: Optional[bool] = False, - qnn_config: Optional[str] = None, -): - model_config = {"model_name": model_name} - - config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) - config._attn_implementation = "eager" - config = set_num_layers(config, n_layer=n_layer) - model_hf = AutoModelForCausalLM.from_pretrained( - model_name, - low_cpu_mem_usage=False, - trust_remote_code=True, - config=config, - ) - n_layer = get_num_layers_vlm(config) - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) - processor = InternProcessor(model_hf, tokenizer) - - prompt = [query] - img_url = [img_url] - pixel_values = [] - num_patches_list = [] - questions = [] - for i in range(len(prompt)): - img = requests.get(img_url[i], stream=True) - image = Image.open(BytesIO(img.content)).convert("RGB") - - image = image.resize((448, 448)) - - # preprocess the resized image - pixel_value = processor.load_image(image, max_num=12) - 
num_patches_list.append(pixel_value.shape[0]) - pixel_values.append(pixel_value) - - question = "\n" + prompt[i] - questions.append(question) - - pixel_values = torch.cat(pixel_values, dim=0) - - # Chat Template information for prompt preprocessing - messages: List[List[str]] = [] - roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") - prompt = processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) - - inputs = tokenizer(prompt, return_tensors="pt") - batch_size, prompt_len = inputs["input_ids"].shape - inputs["pixel_values"] = pixel_values.clone() - - generation_config = dict(max_new_tokens=max_gen_len, do_sample=False) - generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) - api_runner = ApiRunnerInternVL( - batch_size, - processor, - config, - image, - query, - prompt_len, - ctx_len, - max_gen_len, - n_layer, - ) - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config) + # ========== Processor and Image Loading ========== + if is_intern_model: + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) + processor = InternProcessor(model_hf, tokenizer) + else: + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + + if is_intern_model: + prompt = [query] + img_url_list = [img_url] + pixel_values = [] + num_patches_list = [] + questions = [] + for i in range(len(prompt)): + img = requests.get(img_url_list[i], stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + image = image.resize((448, 448)) + pixel_value = processor.load_image(image, max_num=12) + num_patches_list.append(pixel_value.shape[0]) + pixel_values.append(pixel_value) + question = "\n" + prompt[i] + questions.append(question) + pixel_values = torch.cat(pixel_values, dim=0) + else: + if is_molmo_model: + img = requests.get(img_url, stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + image = image.resize((536, 354)) + else: + image = Image.open(requests.get(img_url, stream=True).raw) + if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": + image = image.resize((1540, 1540)) + + # ========== Prepare Inputs and Get PyTorch HF Tokens ========== + if is_intern_model: + messages: List[List[str]] = [] + roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") + prompt = processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) + inputs = tokenizer(prompt, return_tensors="pt") + batch_size, prompt_len = inputs["input_ids"].shape + inputs["pixel_values"] = pixel_values.clone() + generation_config = dict(max_new_tokens=max_gen_len, do_sample=False) + generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) + api_runner = ApiRunnerInternVL( + batch_size, + processor, + config, + image, + query, + prompt_len, + ctx_len, + max_gen_len, + n_layer, + ) + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config) + elif is_molmo_model: + inputs = processor.process(images=[image], text=query) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + generation_config = GenerationConfig(max_new_tokens=NEW_GENERATION_TOKENS, stop_strings="<|endoftext|>") + api_runner = ApiRunnerMolmo( + batch_size, + processor, + config, + image, + query, + prompt_len, + ctx_len, + max_gen_len, + n_layer, + ) + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config) + 
batch_size, prompt_len = inputs["input_ids"].shape + inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) + valid = inputs["image_input_idx"] > 0 + valid = valid.reshape(1, -1) + inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0) + inputs["pixel_values"] = inputs.pop("images") + else: + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + {"type": "image"}, + ], + }, + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + api_runner = ApiRunnerVlm( + batch_size, + processor, + config, + image, + conversation, + prompt, + prompt_len, + ctx_len, + max_gen_len, + n_layer, + ) + inputs = processor(images=image, text=prompt, return_tensors="pt") + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs) - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - ) # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model) - # assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( - # "Tokens don't match for pytorch HF output and QEFF KV Model output" + # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), ( + # "Tokens don't match for pytorch HF output and pytorch KV output" # ) streamer = TextStreamer(processor.tokenizer) + + # ========== Export and Compile Model ========== + if is_intern_model or is_molmo_model: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + kv_offload=kv_offload, + config=config, + ) + else: + qeff_model = QEFFAutoModelForImageTextToText(model_hf, kv_offload=kv_offload) + qeff_model.export() # onnx_model_path = qeff_model.export() # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - qeff_model.compile( - num_patches=1, - num_devices=num_devices, - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - mxfp6=False, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) + + compile_kwargs = { + "num_devices": num_devices, + "prefill_seq_len": prompt_len, + "ctx_len": ctx_len, + "mxfp6": False, + "enable_qnn": enable_qnn, + "qnn_config": qnn_config, + } + + if is_intern_model: + compile_kwargs["num_patches"] = 1 + elif not is_molmo_model and img_size is not None: + compile_kwargs["img_size"] = img_size + + qeff_model.compile(**compile_kwargs) + + # ========== Generate and Verify Output ========== + + if not is_intern_model and not is_molmo_model: + inputs = processor(images=image, text=prompt, return_tensors="pt") + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": + inputs = qeff_model.model.prepare_inputs_for_generation( + inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + ) + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + print("QPC Outputs (QAIC):") output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] @@ -526,40 +334,51 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, 
prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config
-)
-def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
-    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer
-):
+@pytest.mark.parametrize("model_name", test_mm_models)
+@pytest.mark.parametrize("kv_offload", [True, False])
+def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload):
     """
     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching.
     ``Mandatory`` Args:
         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
     """
+    if model_name in [
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        "allenai/Molmo-7B-D-0924",
+        "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    ]:
+        pytest.skip("Disabled in CI due to known performance/HF issues with this model.")
+    if (
+        model_name in ["OpenGVLab/InternVL2_5-1B", "OpenGVLab/InternVL3_5-1B", "Qwen/Qwen2.5-VL-3B-Instruct"]
+        and not kv_offload
+    ):
+        pytest.skip("These models require kv_offload=True for testing.")
+    # Get img_size for standard models, None for InternVL and Molmo
+    img_size = model_config_dict[model_name].get("img_size")
+
     check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
         model_name=model_name,
-        prompt_len=prompt_len,
-        ctx_len=ctx_len,
+        prompt_len=model_config_dict[model_name]["prompt_len"],
+        ctx_len=model_config_dict[model_name]["ctx_len"],
         max_gen_len=NEW_GENERATION_TOKENS,
         img_size=img_size,
-        img_url=img_url,
-        query=query,
-        n_layer=n_layer,
-        batch_size=batch_size,
+        img_url=model_config_dict[model_name]["img_url"],
+        query=model_config_dict[model_name]["text_prompt"],
+        n_layer=model_config_dict[model_name]["num_layers"],
+        batch_size=model_config_dict[model_name]["batch_size"],
         kv_offload=kv_offload,
     )
 
 
+### QNN Tests ###
+
+
 @pytest.mark.on_qaic
 @pytest.mark.qnn
 @pytest.mark.multimodal
-@pytest.mark.parametrize(
-    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config
-)
-def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn(
-    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer
-):
+@pytest.mark.parametrize("model_name", test_mm_models)
+@pytest.mark.parametrize("kv_offload", [True, False])
+def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, kv_offload):
     """
     Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching.
``Mandatory`` Args: @@ -573,83 +392,14 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, - max_gen_len=NEW_GENERATION_TOKENS, - img_size=img_size, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, - kv_offload=kv_offload, - enable_qnn=True, - qnn_config=qnn_config_json_path, - ) - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config -) -def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): - check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, - max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, - kv_offload=kv_offload, - ) - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config -) -def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): - check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, - max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, - kv_offload=kv_offload, - ) - - -@pytest.mark.on_qaic -@pytest.mark.qnn -@pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config -) -def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100_qnn( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): - qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") - create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) - - check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_size=model_config_dict[model_name]["img_size"], + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, enable_qnn=True, qnn_config=qnn_config_json_path, diff --git a/tests/transformers/models/test_model_configs.json b/tests/transformers/models/test_model_configs.json new file mode 100644 index 000000000..d75eee0c1 --- /dev/null +++ b/tests/transformers/models/test_model_configs.json @@ -0,0 +1,575 @@ +{ + "causal_lm_models": [ + { + "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "gpt2", + "model_type": "gpt2", + "additional_params": { + 
"max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50257, + "num_key_value_heads": 1 + } + }, + { + "model_name": "allenai/OLMo-2-0425-1B", + "model_type": "olmo2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 100352, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Salesforce/codegen-350M-mono", + "model_type": "codegen", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 4, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 51200, + "num_key_value_heads": 1, + "rotary_dim": 16 + } + }, + + { + "model_name": "microsoft/Phi-3-mini-4k-instruct", + "model_type": "phi3", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32064, + "num_key_value_heads": 1 + } + }, + { + "model_name": "tiiuae/falcon-7b", + "model_type": "falcon", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 65024, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "model_type": "qwen3_moe", + "additional_params": { + "hidden_size": 256, + "intermediate_size": 256, + "max_position_embeddings": 128, + "max_window_layers": 48, + "moe_intermediate_size": 768, + "num_attention_heads": 2, + "num_experts": 4, + "num_experts_per_tok": 2, + "num_hidden_layers": 1, + "num_key_value_heads": 1, + "vocab_size": 151936 + } + }, + { + "model_name": "Qwen/Qwen2-0.5B", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936, + "num_key_value_heads": 1 + } + }, + { + "model_name": "bigcode/starcoder2-3b", + "model_type": "starcoder2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49152, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Felladrin/Minueza-32M-Base", + "model_type": "mistral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32002, + "num_key_value_heads": 1 + } + }, + { + "model_name": "wtang06/mpt-125m-c4", + "model_type": "mpt", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50368 + } + }, + { + "model_name": "hakurei/gpt-j-random-tinier", + "model_type": "gptj", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50400, + "num_key_value_heads": 1, + "rotary_dim": 16 + } + }, + { + "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_type": "mixtral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + 
"num_key_value_heads": 1 + } + }, + { + "model_name": "meta-llama/Llama-3.2-1B", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + }, + { + "model_name": "unsloth/gemma-2b", + "model_type": "gemma", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "unsloth/gemma-2-2b", + "model_type": "gemma2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32003 + } + }, + { + "model_name": "TheBloke/Llama-2-7B-GPTQ", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000 + } + }, + { + "model_name": "ibm-granite/granite-20b-code-base", + "model_type": "gpt_bigcode", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49152, + "num_key_value_heads": 1, + "activation_function": "gelu", + "architectures": [ + "GPTBigCodeForCausalLM" + ] + } + }, + { + "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256 + } + }, + { + "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936 + } + }, + { + "model_name": "ibm-granite/granite-3.1-2b-instruct", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + }, + { + "model_name": "ibm-granite/granite-guardian-3.1-2b", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + }, + { + "model_name": "hpcai-tech/grok-1", + "model_type": null, + "additional_params":{ + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 131072, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", + "model_type": null, + 
"additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "hidden_size": 256, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_layers": 1, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + } + ], + "multimodal_models": [ + { + "model_name": "llava-hf/llava-1.5-7b-hf", + "model_type": "llava", + "batch_size": 1, + "prompt_len": 784, + "ctx_len": 1024, + "img_size": 336, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + "num_layers": 1, + "additional_params": {} + }, + { + "model_name": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "model_type": "llama4", + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 3072, + "img_size": 336, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + "num_layers": 4, + "additional_params": {} + }, + { + "model_name": "google/gemma-3-4b-it", + "model_type": "gemma3", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 3072, + "img_size": 896, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "additional_params": {} + }, + { + "model_name": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + "model_type": "mistral3", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "additional_params": {} + }, + { + "model_name": "Qwen/Qwen2.5-VL-3B-Instruct", + "model_type": "qwen2_5_vl", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "additional_params": {} + }, + { + "model_name": "allenai/Molmo-7B-D-0924", + "model_type": "molmo", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": null, + "img_url": "https://picsum.photos/id/237/536/354", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 2, + "additional_params": {} + }, + { + "model_name": "OpenGVLab/InternVL2_5-1B", + "model_type": "internvl_chat", + "batch_size": 1, + "prompt_len": 384, + "ctx_len": 512, + "img_size": null, + "img_url": "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", + "text_prompt": "Please describe the image in detail.", + "num_layers": 2, + "additional_params": {} + }, + { + "model_name": "OpenGVLab/InternVL3_5-1B", + "model_type": "internvl_chat", + "batch_size": 1, + "prompt_len": 384, + "ctx_len": 512, + "img_size": null, + "img_url": "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", + "text_prompt": "Please describe the image in detail.", + "num_layers": 2, + "additional_params": {} + }, + { + "model_name": 
"meta-llama/Llama-3.2-11B-Vision-Instruct", + "model_type": "mllama", + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 512, + "img_size": 560, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "text_prompt": "Explain this image", + "num_layers": 7, + "additional_params": {} + } + + ], + "speech_seq2seq_models": [ + "openai/whisper-tiny" + ], + "embedding_models": [ + {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"}, + {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"} + ], + "spd_causal_lm_models": [ + { + "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Qwen/Qwen2-0.5B", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936, + "num_key_value_heads": 1 + } + } + ], + "qnn_causal_lm_models": [ + { + "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_type": "mixtral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "meta-llama/Llama-3.2-1B", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + }, + { + "model_name": "unsloth/gemma-2b", + "model_type": "gemma", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "ibm-granite/granite-guardian-3.1-2b", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + } + ], + "prefix_caching_models": [ + { + "model_name": "gpt2", + "model_type": "gpt2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50257, + "num_key_value_heads": 1 + } + } + ], + "audio_embedding_models": [ + "facebook/wav2vec2-base-960h" + ] +} diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py index 88862fce7..a9662cc73 100644 --- a/tests/transformers/models/test_prefix_caching.py +++ b/tests/transformers/models/test_prefix_caching.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os import numpy as np @@ -16,7 +17,13 @@ from QEfficient.utils._utils import create_json from QEfficient.utils.constants import QnnConstants 
-test_models = ["gpt2"] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + prefix_caching_models = config_data["prefix_caching_models"] + +test_models = [model["model_name"] for model in prefix_caching_models] # The test should first generate output with some prefix+suffix1 or batch_id and then confirm that we are still able to execute of prefix+suffix2 on same batch id and getting correct output. diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 4ae8928b7..52a96d7fe 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os from importlib import reload from typing import List, Optional @@ -25,9 +26,11 @@ from QEfficient.utils.constants import Constants, QnnConstants from QEfficient.utils.device_utils import get_available_device_id -test_models = [ - "openai/whisper-tiny", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + test_models = config_data["speech_seq2seq_models"] def load_seq2seq_model(model_config):