From 7b76c58751f863f4667992e1c5994575d6b7c6c2 Mon Sep 17 00:00:00 2001 From: noooop Date: Tue, 22 Apr 2025 16:55:13 +0800 Subject: [PATCH 01/13] Embedding entrypoints test coverage for a wider range of model architectures --- tests/entrypoints/openai/test_embedding.py | 298 ++++++++++++------ .../openai/test_embedding_dimensions.py | 75 ----- 2 files changed, 205 insertions(+), 168 deletions(-) delete mode 100644 tests/entrypoints/openai/test_embedding_dimensions.py diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 2cdeb684f75d..332ba32044fe 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -1,59 +1,124 @@ # SPDX-License-Identifier: Apache-2.0 import base64 +from collections.abc import Sequence +from http import HTTPStatus +from typing import Optional import numpy as np import openai import pytest -import pytest_asyncio import requests from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...models.embedding.utils import check_embeddings_close +from ...conftest import HfRunner +from ...models.embedding.utils import (EmbedModelInfo, check_embeddings_close, + matryoshka_fy) from ...utils import RemoteOpenAIServer -MODEL_NAME = "intfloat/multilingual-e5-small" +MODELS = [ + EmbedModelInfo("BAAI/bge-m3", + is_matryoshka=False, + architecture="XLMRobertaModel"), + EmbedModelInfo("jinaai/jina-embeddings-v3", + is_matryoshka=True, + architecture="XLMRobertaModel"), # Bert with Rotary + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + architecture="BertModel") +] + DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 +input_texts = [ + "The chef prepared a delicious meal.", +] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_info", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +async def test_models(hf_runner, vllm_runner, dtype: str, + model_info: EmbedModelInfo): + subtest_config(vllm_runner, dtype, model_info) -@pytest.fixture(scope="module") -def server(): args = [ "--task", "embed", # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", + dtype, "--enforce-eager", "--max-model-len", "512", "--chat-template", DUMMY_CHAT_TEMPLATE, + "--trust_remote_code" ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server + if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": + # Manually enable Matryoshka Embeddings + args.extend(["--hf_overrides", '{"is_matryoshka":true}']) + with (RemoteOpenAIServer(model_info.name, args) as remote_server, + hf_runner(model_info.name, dtype=dtype, is_sentence_transformer=True) + as hf_model): + client = remote_server.get_async_client() -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client + await subtest_basic(model_info, remote_server, client) + await subtest_single_embedding(model_info, client, hf_model) + await subtest_batch_embedding(model_info, client, hf_model) + await subtest_conversation_embedding(model_info, remote_server, client) + await subtest_batch_base64_embedding(model_info, client, hf_model) + await subtest_embedding_truncation(model_info, client) + await subtest_matryoshka(model_info, client, hf_model) -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): - input_texts = [ - "The chef prepared a delicious meal.", - ] +def subtest_config(vllm_runner, dtype: str, model_info: EmbedModelInfo): + vllm_extra_kwargs = { + "hf_overrides": { + "is_matryoshka": model_info.is_matryoshka + } + } + + with vllm_runner(model_info.name, + task="embed", + dtype=dtype, + max_model_len=None, + **vllm_extra_kwargs) as vllm_model: + + assert (vllm_model.model.llm_engine.model_config.is_matryoshka == + model_info.is_matryoshka) + + if model_info.architecture: + assert (model_info.architecture + in vllm_model.model.llm_engine.model_config.architectures) + + +async def subtest_basic(model_info: EmbedModelInfo, server: RemoteOpenAIServer, + client: openai.AsyncOpenAI): + response = requests.get(server.url_for("health")) + assert response.status_code == HTTPStatus.OK + + models = await client.models.list() + models = models.data + + assert len(models) == 1 + served_model = models[0] + assert served_model.id == model_info.name + assert served_model.root == model_info.name + +async def subtest_single_embedding(model_info: EmbedModelInfo, + client: openai.AsyncOpenAI, + hf_model: HfRunner): # test single embedding + prompts = input_texts embedding_response = await client.embeddings.create( - model=model_name, - input=input_texts, + model=model_info.name, + input=prompts, encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( @@ -61,15 +126,18 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.id is not None assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 384 + assert len(embeddings.data[0].embedding) > 0 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 11 - assert embeddings.usage.total_tokens == 11 + assert embeddings.usage.prompt_tokens > 0 + assert embeddings.usage.total_tokens > 0 + + vllm_outputs = [d.embedding for d in embeddings.data] + _correctness_test(model_info, hf_model, prompts, vllm_outputs) # test using token IDs input_tokens = [1, 1, 1, 1, 1] embedding_response = await client.embeddings.create( - model=model_name, + model=model_info.name, input=input_tokens, encoding_format="float", ) @@ -78,23 +146,23 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.id is not None assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 384 + assert len(embeddings.data[0].embedding) > 0 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 5 assert embeddings.usage.total_tokens == 5 -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): +async def subtest_batch_embedding(model_info: EmbedModelInfo, + client: openai.AsyncOpenAI, + hf_model: HfRunner): # test list[str] - input_texts = [ + prompts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." ] embedding_response = await client.embeddings.create( - model=model_name, - input=input_texts, + model=model_info.name, + input=prompts, encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( @@ -102,16 +170,19 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.id is not None assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) == 384 + assert len(embeddings.data[0].embedding) > 0 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 33 - assert embeddings.usage.total_tokens == 33 + assert embeddings.usage.prompt_tokens > 0 + assert embeddings.usage.total_tokens > 0 + + vllm_outputs = [d.embedding for d in embeddings.data] + _correctness_test(model_info, hf_model, prompts, vllm_outputs) # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] embedding_response = await client.embeddings.create( - model=model_name, + model=model_info.name, input=input_tokens, encoding_format="float", ) @@ -120,17 +191,15 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.id is not None assert len(embeddings.data) == 4 - assert len(embeddings.data[0].embedding) == 384 + assert len(embeddings.data[0].embedding) > 0 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_conversation_embedding(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI, - model_name: str): +async def subtest_conversation_embedding(model_info: EmbedModelInfo, + server: RemoteOpenAIServer, + client: openai.AsyncOpenAI): messages = [{ "role": "user", "content": "The cat sat on the mat.", @@ -145,7 +214,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, chat_response = requests.post( server.url_for("v1/embeddings"), json={ - "model": model_name, + "model": model_info.name, "messages": messages, "encoding_format": "float", }, @@ -153,7 +222,8 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, chat_response.raise_for_status() chat_embeddings = EmbeddingResponse.model_validate(chat_response.json()) - tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=model_info.name, + tokenizer_mode="fast") prompt = tokenizer.apply_chat_template( messages, chat_template=DUMMY_CHAT_TEMPLATE, @@ -162,7 +232,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, tokenize=False, ) completion_response = await client.embeddings.create( - model=model_name, + model=model_info.name, input=prompt, encoding_format="float", # To be consistent with chat @@ -179,60 +249,42 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, exclude={"id", "created"})) -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_base64_embedding(client: openai.AsyncOpenAI, - model_name: str): - input_texts = [ +async def subtest_batch_base64_embedding(model_info: EmbedModelInfo, + client: openai.AsyncOpenAI, + hf_model: HfRunner): + prompts = [ "Hello my name is", "The best thing about vLLM is that it supports many different models" ] - responses_float = await client.embeddings.create(input=input_texts, - model=model_name, + # test float responses + responses_float = await client.embeddings.create(input=prompts, + model=model_info.name, encoding_format="float") float_data = [d.embedding for d in responses_float.data] + _correctness_test(model_info, hf_model, prompts, float_data) - responses_base64 = await client.embeddings.create(input=input_texts, - model=model_name, + # test base64 responses + responses_base64 = await client.embeddings.create(input=prompts, + model=model_info.name, encoding_format="base64") base64_data = [] for data in responses_base64.data: base64_data.append( np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) + _correctness_test(model_info, hf_model, prompts, base64_data) - check_embeddings_close( - embeddings_0_lst=float_data, - embeddings_1_lst=base64_data, - name_0="float", - name_1="base64", - ) - # Default response is float32 decoded from base64 by OpenAI Client - responses_default = await client.embeddings.create(input=input_texts, - model=model_name) - default_data = [d.embedding for d in responses_default.data] - - check_embeddings_close( - embeddings_0_lst=float_data, - embeddings_1_lst=default_data, - name_0="float", - name_1="default", - ) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_single_embedding_truncation(client: openai.AsyncOpenAI, - model_name: str): +async def subtest_embedding_truncation(model_info: EmbedModelInfo, + client: openai.AsyncOpenAI): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] # test single embedding embedding_response = await client.embeddings.create( - model=model_name, + model=model_info.name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}) embeddings = EmbeddingResponse.model_validate( @@ -240,17 +292,17 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, assert embeddings.id is not None assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 384 + assert len(embeddings.data[0].embedding) > 0 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 10 - assert embeddings.usage.total_tokens == 10 + assert embeddings.usage.prompt_tokens > 0 + assert embeddings.usage.total_tokens > 0 input_tokens = [ 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 ] embedding_response = await client.embeddings.create( - model=model_name, + model=model_info.name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}) embeddings = EmbeddingResponse.model_validate( @@ -258,25 +310,85 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, assert embeddings.id is not None assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 384 + assert len(embeddings.data[0].embedding) > 0 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 10 assert embeddings.usage.total_tokens == 10 - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, - model_name: str): - input_texts = [ - "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", - ] - + # test_single_embedding_truncation_invalid with pytest.raises(openai.BadRequestError): response = await client.embeddings.create( - model=model_name, + model=model_info.name, input=input_texts, - extra_body={"truncate_prompt_tokens": 8193}) + extra_body={"truncate_prompt_tokens": 100000}) assert "error" in response.object assert "truncate_prompt_tokens value is greater than max_model_len. "\ "Please, select a smaller truncation size." in response.message + + +async def subtest_matryoshka(model_info: EmbedModelInfo, + client: openai.AsyncOpenAI, hf_model: HfRunner): + + async def make_request_and_correctness_test(dimensions): + prompts = input_texts * 3 + + embedding_response = await client.embeddings.create( + model=model_info.name, + input=prompts, + dimensions=dimensions, + encoding_format="float", + ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) > 0 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens > 0 + assert embeddings.usage.total_tokens > 0 + + if dimensions is not None: + assert len(embeddings.data[0].embedding) == dimensions + + vllm_outputs = [d.embedding for d in embeddings.data] + _correctness_test(model_info, hf_model, prompts, vllm_outputs, + dimensions) + + if model_info.is_matryoshka: + for dimensions in [None, 16]: + await make_request_and_correctness_test(dimensions) + + with pytest.raises(openai.BadRequestError): + for dimensions in [-1]: + await make_request_and_correctness_test(dimensions) + + else: + for dimensions in [None]: + await make_request_and_correctness_test(dimensions) + + with pytest.raises(openai.BadRequestError): + for dimensions in [-1, 16]: + await make_request_and_correctness_test(dimensions) + + +def _correctness_test(model_info: EmbedModelInfo, + hf_model: HfRunner, + inputs, + vllm_outputs: Sequence[list[float]], + dimensions: Optional[int] = None): + hf_kwargs = {} + if model_info.name == "jinaai/jina-embeddings-v3": + hf_kwargs["task"] = "text-matching" + + hf_outputs = hf_model.encode(inputs, **hf_kwargs) + if dimensions: + hf_outputs = matryoshka_fy(hf_outputs, dimensions) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py deleted file mode 100644 index 43d109f74f5d..000000000000 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. -""" - -import openai -import pytest - -from vllm.entrypoints.openai.protocol import EmbeddingResponse - -from ...models.embedding.utils import EmbedModelInfo -from ...utils import RemoteOpenAIServer - -MODELS = [ - EmbedModelInfo(name="BAAI/bge-m3", is_matryoshka=False), - EmbedModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True), -] - -input_texts = [ - "The chef prepared a delicious meal.", -] * 3 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -async def test_validating_dimensions(model: EmbedModelInfo): - args = [ - "--task", - "embed", - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--enforce-eager", - "--max-model-len", - "512", - "--trust_remote_code" - ] - with RemoteOpenAIServer(model.name, args) as remote_server: - client = remote_server.get_async_client() - - async def make_request(dimensions): - embedding_response = await client.embeddings.create( - model=model.name, - input=input_texts, - dimensions=dimensions, - encoding_format="float", - ) - embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) - - assert embeddings.id is not None - assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) > 0 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens > 0 - assert embeddings.usage.total_tokens > 0 - - if dimensions is not None: - assert len(embeddings.data[0].embedding) == dimensions - - if model.is_matryoshka: - for dimensions in [None, 16]: - await make_request(dimensions) - - with pytest.raises(openai.BadRequestError): - for dimensions in [-1]: - await make_request(dimensions) - - else: - for dimensions in [None]: - await make_request(dimensions) - - with pytest.raises(openai.BadRequestError): - for dimensions in [-1, 16]: - await make_request(dimensions) From 0cb9df5bf3f5f53d6fc77090d698d72df87cbef3 Mon Sep 17 00:00:00 2001 From: noooop Date: Tue, 22 Apr 2025 19:11:20 +0800 Subject: [PATCH 02/13] pytest style test --- tests/entrypoints/openai/test_embedding.py | 135 ++++++++++++--------- 1 file changed, 81 insertions(+), 54 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 332ba32044fe..5c02d94376a1 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -37,46 +37,17 @@ ] -@pytest.mark.asyncio -@pytest.mark.parametrize("model_info", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -async def test_models(hf_runner, vllm_runner, dtype: str, - model_info: EmbedModelInfo): - subtest_config(vllm_runner, dtype, model_info) - - args = [ - "--task", - "embed", - # use half precision for speed and memory savings in CI environment - "--dtype", - dtype, - "--enforce-eager", - "--max-model-len", - "512", - "--chat-template", - DUMMY_CHAT_TEMPLATE, - "--trust_remote_code" - ] +@pytest.fixture(scope="module", params=MODELS) +def model_info(request): + return request.param - if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": - # Manually enable Matryoshka Embeddings - args.extend(["--hf_overrides", '{"is_matryoshka":true}']) - with (RemoteOpenAIServer(model_info.name, args) as remote_server, - hf_runner(model_info.name, dtype=dtype, is_sentence_transformer=True) - as hf_model): - client = remote_server.get_async_client() +@pytest.fixture(scope="module", params=["bfloat16"]) +def dtype(request): + return request.param - await subtest_basic(model_info, remote_server, client) - await subtest_single_embedding(model_info, client, hf_model) - await subtest_batch_embedding(model_info, client, hf_model) - await subtest_conversation_embedding(model_info, remote_server, client) - await subtest_batch_base64_embedding(model_info, client, hf_model) - await subtest_embedding_truncation(model_info, client) - await subtest_matryoshka(model_info, client, hf_model) - -def subtest_config(vllm_runner, dtype: str, model_info: EmbedModelInfo): +def test_config(vllm_runner, dtype, model_info): vllm_extra_kwargs = { "hf_overrides": { "is_matryoshka": model_info.is_matryoshka @@ -97,11 +68,44 @@ def subtest_config(vllm_runner, dtype: str, model_info: EmbedModelInfo): in vllm_model.model.llm_engine.model_config.architectures) -async def subtest_basic(model_info: EmbedModelInfo, server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): +@pytest.fixture(scope="module") +def server(model_info, dtype: str): + args = [ + "--task", + "embed", + # use half precision for speed and memory savings in CI environment + "--dtype", + dtype, + "--enforce-eager", + "--max-model-len", + "512", + "--chat-template", + DUMMY_CHAT_TEMPLATE, + "--trust_remote_code" + ] + + if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": + # Manually enable Matryoshka Embeddings + args.extend(["--hf_overrides", '{"is_matryoshka":true}']) + + with RemoteOpenAIServer(model_info.name, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def hf_model(hf_runner, model_info, dtype: str): + with hf_runner(model_info.name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + yield hf_model + + +@pytest.mark.asyncio +async def test_basic(model_info: EmbedModelInfo, server: RemoteOpenAIServer): response = requests.get(server.url_for("health")) assert response.status_code == HTTPStatus.OK + client = server.get_async_client() + models = await client.models.list() models = models.data @@ -111,9 +115,12 @@ async def subtest_basic(model_info: EmbedModelInfo, server: RemoteOpenAIServer, assert served_model.root == model_info.name -async def subtest_single_embedding(model_info: EmbedModelInfo, - client: openai.AsyncOpenAI, - hf_model: HfRunner): +@pytest.mark.asyncio +async def test_single_embedding(model_info: EmbedModelInfo, + server: RemoteOpenAIServer, + hf_model: HfRunner): + client = server.get_async_client() + # test single embedding prompts = input_texts embedding_response = await client.embeddings.create( @@ -152,9 +159,11 @@ async def subtest_single_embedding(model_info: EmbedModelInfo, assert embeddings.usage.total_tokens == 5 -async def subtest_batch_embedding(model_info: EmbedModelInfo, - client: openai.AsyncOpenAI, - hf_model: HfRunner): +@pytest.mark.asyncio +async def test_batch_embedding(model_info: EmbedModelInfo, + server: RemoteOpenAIServer, hf_model: HfRunner): + client = server.get_async_client() + # test list[str] prompts = [ "The cat sat on the mat.", "A feline was resting on a rug.", @@ -197,9 +206,11 @@ async def subtest_batch_embedding(model_info: EmbedModelInfo, assert embeddings.usage.total_tokens == 17 -async def subtest_conversation_embedding(model_info: EmbedModelInfo, - server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): +@pytest.mark.asyncio +async def test_conversation_embedding(model_info: EmbedModelInfo, + server: RemoteOpenAIServer): + client = server.get_async_client() + messages = [{ "role": "user", "content": "The cat sat on the mat.", @@ -249,9 +260,12 @@ async def subtest_conversation_embedding(model_info: EmbedModelInfo, exclude={"id", "created"})) -async def subtest_batch_base64_embedding(model_info: EmbedModelInfo, - client: openai.AsyncOpenAI, - hf_model: HfRunner): +@pytest.mark.asyncio +async def test_batch_base64_embedding(model_info: EmbedModelInfo, + server: RemoteOpenAIServer, + hf_model: HfRunner): + client = server.get_async_client() + prompts = [ "Hello my name is", "The best thing about vLLM is that it supports many different models" @@ -275,9 +289,20 @@ async def subtest_batch_base64_embedding(model_info: EmbedModelInfo, dtype="float32").tolist()) _correctness_test(model_info, hf_model, prompts, base64_data) + # Default response is float32 decoded from base64 by OpenAI Client + responses_default = await client.embeddings.create(input=prompts, + model=model_info.name) + default_data = [d.embedding for d in responses_default.data] + _correctness_test(model_info, hf_model, prompts, default_data) + + +@pytest.mark.asyncio +async def test_embedding_truncation( + model_info: EmbedModelInfo, + server: RemoteOpenAIServer, +): + client = server.get_async_client() -async def subtest_embedding_truncation(model_info: EmbedModelInfo, - client: openai.AsyncOpenAI): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] @@ -326,8 +351,10 @@ async def subtest_embedding_truncation(model_info: EmbedModelInfo, "Please, select a smaller truncation size." in response.message -async def subtest_matryoshka(model_info: EmbedModelInfo, - client: openai.AsyncOpenAI, hf_model: HfRunner): +@pytest.mark.asyncio +async def test_matryoshka(model_info: EmbedModelInfo, + server: RemoteOpenAIServer, hf_model: HfRunner): + client = server.get_async_client() async def make_request_and_correctness_test(dimensions): prompts = input_texts * 3 From 60ea5215f88978f06db2a87efbcb63a36e24b280 Mon Sep 17 00:00:00 2001 From: noooop Date: Wed, 23 Apr 2025 11:00:36 +0800 Subject: [PATCH 03/13] Add end-to-end accuracy tests compared with hf --- tests/entrypoints/openai/test_embedding.py | 52 +++++----------------- 1 file changed, 10 insertions(+), 42 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 5c02d94376a1..02d2784f0ecc 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -19,15 +19,9 @@ from ...utils import RemoteOpenAIServer MODELS = [ - EmbedModelInfo("BAAI/bge-m3", - is_matryoshka=False, - architecture="XLMRobertaModel"), - EmbedModelInfo("jinaai/jina-embeddings-v3", - is_matryoshka=True, - architecture="XLMRobertaModel"), # Bert with Rotary + EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", - is_matryoshka=True, - architecture="BertModel") + is_matryoshka=True) ] DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @@ -47,27 +41,6 @@ def dtype(request): return request.param -def test_config(vllm_runner, dtype, model_info): - vllm_extra_kwargs = { - "hf_overrides": { - "is_matryoshka": model_info.is_matryoshka - } - } - - with vllm_runner(model_info.name, - task="embed", - dtype=dtype, - max_model_len=None, - **vllm_extra_kwargs) as vllm_model: - - assert (vllm_model.model.llm_engine.model_config.is_matryoshka == - model_info.is_matryoshka) - - if model_info.architecture: - assert (model_info.architecture - in vllm_model.model.llm_engine.model_config.architectures) - - @pytest.fixture(scope="module") def server(model_info, dtype: str): args = [ @@ -139,7 +112,7 @@ async def test_single_embedding(model_info: EmbedModelInfo, assert embeddings.usage.total_tokens > 0 vllm_outputs = [d.embedding for d in embeddings.data] - _correctness_test(model_info, hf_model, prompts, vllm_outputs) + _correctness_test(hf_model, prompts, vllm_outputs) # test using token IDs input_tokens = [1, 1, 1, 1, 1] @@ -185,7 +158,7 @@ async def test_batch_embedding(model_info: EmbedModelInfo, assert embeddings.usage.total_tokens > 0 vllm_outputs = [d.embedding for d in embeddings.data] - _correctness_test(model_info, hf_model, prompts, vllm_outputs) + _correctness_test(hf_model, prompts, vllm_outputs) # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], @@ -276,7 +249,7 @@ async def test_batch_base64_embedding(model_info: EmbedModelInfo, model=model_info.name, encoding_format="float") float_data = [d.embedding for d in responses_float.data] - _correctness_test(model_info, hf_model, prompts, float_data) + _correctness_test(hf_model, prompts, float_data) # test base64 responses responses_base64 = await client.embeddings.create(input=prompts, @@ -287,13 +260,13 @@ async def test_batch_base64_embedding(model_info: EmbedModelInfo, base64_data.append( np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - _correctness_test(model_info, hf_model, prompts, base64_data) + _correctness_test(hf_model, prompts, base64_data) # Default response is float32 decoded from base64 by OpenAI Client responses_default = await client.embeddings.create(input=prompts, model=model_info.name) default_data = [d.embedding for d in responses_default.data] - _correctness_test(model_info, hf_model, prompts, default_data) + _correctness_test(hf_model, prompts, default_data) @pytest.mark.asyncio @@ -379,8 +352,7 @@ async def make_request_and_correctness_test(dimensions): assert len(embeddings.data[0].embedding) == dimensions vllm_outputs = [d.embedding for d in embeddings.data] - _correctness_test(model_info, hf_model, prompts, vllm_outputs, - dimensions) + _correctness_test(hf_model, prompts, vllm_outputs, dimensions) if model_info.is_matryoshka: for dimensions in [None, 16]: @@ -399,16 +371,12 @@ async def make_request_and_correctness_test(dimensions): await make_request_and_correctness_test(dimensions) -def _correctness_test(model_info: EmbedModelInfo, - hf_model: HfRunner, +def _correctness_test(hf_model: HfRunner, inputs, vllm_outputs: Sequence[list[float]], dimensions: Optional[int] = None): - hf_kwargs = {} - if model_info.name == "jinaai/jina-embeddings-v3": - hf_kwargs["task"] = "text-matching" - hf_outputs = hf_model.encode(inputs, **hf_kwargs) + hf_outputs = hf_model.encode(inputs) if dimensions: hf_outputs = matryoshka_fy(hf_outputs, dimensions) From 2024ca187aae4b9aa9df5e0440aee60acdeb0f37 Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 11:15:02 +0800 Subject: [PATCH 04/13] Using `matryoshka_dimensions` control the allowed output dimensions. --- docs/source/models/pooling_models.md | 10 +++---- .../openai_embedding_matryoshka_fy.py | 4 +-- tests/entrypoints/openai/test_embedding.py | 30 +++++++++++++------ tests/models/embedding/utils.py | 3 +- vllm/config.py | 4 +++ vllm/pooling_params.py | 11 ++++++- 6 files changed, 44 insertions(+), 18 deletions(-) diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 5f1c2b5b4a3b..6cf425da2a38 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -159,14 +159,14 @@ For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model ### Manually enable Matryoshka Embeddings -There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, we simply check the existence of the fields `is_matryoshka` or `matryoshka_dimensions` inside `config.json`. +There is currently no official interface for specifying support for Matryoshka Embeddings. In vllm, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. -For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` (offline) or `--hf_overrides '{"is_matryoshka": true}'` (online). +For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [allowed output dimensions]}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": [allowed output dimensions]}'`(online). Here is an example to serve a model with Matryoshka Embeddings enabled. ```text -vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"is_matryoshka":true}' +vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}' ``` ### Offline Inference @@ -204,14 +204,14 @@ curl http://127.0.0.1:8000/v1/embeddings \ "input": "Follow the white rabbit.", "model": "jinaai/jina-embeddings-v3", "encoding_format": "float", - "dimensions": 1 + "dimensions": 32 }' ``` Expected output: ```json -{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-1.0]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}} +{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":["32 floating-point numbers"]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}} ``` A openai client example can be found here: diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py index 27ab8cb64037..4544dcfb5ab0 100644 --- a/examples/online_serving/openai_embedding_matryoshka_fy.py +++ b/examples/online_serving/openai_embedding_matryoshka_fy.py @@ -25,11 +25,11 @@ def main(): responses = client.embeddings.create( input=["Follow the white rabbit."], model=model, - dimensions=1, + dimensions=32, ) for data in responses.data: - print(data.embedding) # List of float of len 1 + print(data.embedding) # List of float of len 32 if __name__ == "__main__": diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 02d2784f0ecc..61d7a47b5f7a 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -21,7 +21,8 @@ MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", - is_matryoshka=True) + is_matryoshka=True, + matryoshka_dimensions=[256]), ] DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @@ -53,13 +54,15 @@ def server(model_info, dtype: str): "--max-model-len", "512", "--chat-template", - DUMMY_CHAT_TEMPLATE, - "--trust_remote_code" + DUMMY_CHAT_TEMPLATE ] if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": # Manually enable Matryoshka Embeddings - args.extend(["--hf_overrides", '{"is_matryoshka":true}']) + args.extend([ + "--trust_remote_code", "--hf_overrides", + '{"matryoshka_dimensions":[256]}' + ]) with RemoteOpenAIServer(model_info.name, args) as remote_server: yield remote_server @@ -355,19 +358,28 @@ async def make_request_and_correctness_test(dimensions): _correctness_test(hf_model, prompts, vllm_outputs, dimensions) if model_info.is_matryoshka: - for dimensions in [None, 16]: + valid_dimensions = [None] + if model_info.matryoshka_dimensions: + valid_dimensions += model_info.matryoshka_dimensions[:2] + + for dimensions in valid_dimensions: await make_request_and_correctness_test(dimensions) - with pytest.raises(openai.BadRequestError): - for dimensions in [-1]: + invalid_dimensions = [-1] + if model_info.matryoshka_dimensions: + assert 5 not in model_info.matryoshka_dimensions + invalid_dimensions.append(5) + + for dimensions in invalid_dimensions: + with pytest.raises(openai.BadRequestError): await make_request_and_correctness_test(dimensions) else: for dimensions in [None]: await make_request_and_correctness_test(dimensions) - with pytest.raises(openai.BadRequestError): - for dimensions in [-1, 16]: + for dimensions in [-1, 16]: + with pytest.raises(openai.BadRequestError): await make_request_and_correctness_test(dimensions) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index a58116e2bf0d..15ebd2549a45 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Sequence -from typing import NamedTuple +from typing import NamedTuple, Optional import torch import torch.nn.functional as F @@ -43,5 +43,6 @@ def matryoshka_fy(tensor, dimensions): class EmbedModelInfo(NamedTuple): name: str is_matryoshka: bool + matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" enable_test: bool = True diff --git a/vllm/config.py b/vllm/config.py index 20ca20ad2b6d..7d7f6e6195a9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1244,6 +1244,10 @@ def is_matryoshka(self) -> bool: return (hasattr(self.hf_config, "matryoshka_dimensions") or getattr(self.hf_config, "is_matryoshka", False)) + @property + def matryoshka_dimensions(self): + return getattr(self.hf_config, "matryoshka_dimensions", None) + BlockSize = Literal[8, 16, 32, 64, 128] CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2"] diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index f71daf0c1955..9a3b254f9b68 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -35,7 +35,16 @@ def verify(self, model_config: "ModelConfig") -> None: f'Model "{model_config.served_model_name}" does not ' f'support matryoshka representation, ' f'changing output dimensions will lead to poor results.') - if self.dimensions < 1: + + mds = model_config.matryoshka_dimensions + if mds is not None: + if self.dimensions not in mds: + raise ValueError( + f'Model "{model_config.served_model_name}" ' + f'only supports {str(mds)} matryoshka dimensions, ' + f'use other output dimensions will ' + f'lead to poor results.') + elif self.dimensions < 1: raise ValueError("Dimensions must be greater than 0") def __repr__(self) -> str: From cf2600077d385ed919f3aee118fbf9de2e8781e3 Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 11:21:39 +0800 Subject: [PATCH 05/13] mypy --- tests/entrypoints/openai/test_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 61d7a47b5f7a..4f6c9091daf0 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -359,14 +359,14 @@ async def make_request_and_correctness_test(dimensions): if model_info.is_matryoshka: valid_dimensions = [None] - if model_info.matryoshka_dimensions: + if model_info.matryoshka_dimensions is not None: valid_dimensions += model_info.matryoshka_dimensions[:2] for dimensions in valid_dimensions: await make_request_and_correctness_test(dimensions) invalid_dimensions = [-1] - if model_info.matryoshka_dimensions: + if model_info.matryoshka_dimensions is not None: assert 5 not in model_info.matryoshka_dimensions invalid_dimensions.append(5) From 9e7e639938ed83cdf49f5fa475de51f464736379 Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 11:29:26 +0800 Subject: [PATCH 06/13] mypy --- tests/entrypoints/openai/test_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 4f6c9091daf0..d54222961e04 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -358,14 +358,14 @@ async def make_request_and_correctness_test(dimensions): _correctness_test(hf_model, prompts, vllm_outputs, dimensions) if model_info.is_matryoshka: - valid_dimensions = [None] + valid_dimensions: list[Optional[int]] = [None] if model_info.matryoshka_dimensions is not None: valid_dimensions += model_info.matryoshka_dimensions[:2] for dimensions in valid_dimensions: await make_request_and_correctness_test(dimensions) - invalid_dimensions = [-1] + invalid_dimensions: list[Optional[int]] = [-1] if model_info.matryoshka_dimensions is not None: assert 5 not in model_info.matryoshka_dimensions invalid_dimensions.append(5) From dcbaa26bc65581f94ee8939895a7e34910764475 Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 14:21:18 +0800 Subject: [PATCH 07/13] typo --- docs/source/models/pooling_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 6cf425da2a38..5ee83b62f3bb 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -159,7 +159,7 @@ For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model ### Manually enable Matryoshka Embeddings -There is currently no official interface for specifying support for Matryoshka Embeddings. In vllm, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. +There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [allowed output dimensions]}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": [allowed output dimensions]}'`(online). From 130647bca3e7141f54b4c146fe4be8a61f44edc2 Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 14:39:16 +0800 Subject: [PATCH 08/13] fix --- docs/source/models/pooling_models.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 5ee83b62f3bb..7daa0ec1de4d 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -161,7 +161,7 @@ For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. -For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [allowed output dimensions]}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": [allowed output dimensions]}'`(online). +For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": []}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": []}'`(online). Here is an example to serve a model with Matryoshka Embeddings enabled. @@ -211,7 +211,7 @@ curl http://127.0.0.1:8000/v1/embeddings \ Expected output: ```json -{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":["32 floating-point numbers"]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}} +{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` A openai client example can be found here: From e03b0cfcd415ac84dae9af494af056c851208723 Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 15:16:09 +0800 Subject: [PATCH 09/13] split it into two files --- tests/entrypoints/openai/test_embedding.py | 102 +-------------- .../openai/test_embedding_dimensions.py | 123 ++++++++++++++++++ tests/models/embedding/utils.py | 18 +++ 3 files changed, 148 insertions(+), 95 deletions(-) create mode 100644 tests/entrypoints/openai/test_embedding_dimensions.py diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index d54222961e04..720aa4808c00 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -1,9 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import base64 -from collections.abc import Sequence from http import HTTPStatus -from typing import Optional import numpy as np import openai @@ -14,15 +12,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from ...conftest import HfRunner -from ...models.embedding.utils import (EmbedModelInfo, check_embeddings_close, - matryoshka_fy) +from ...models.embedding.utils import EmbedModelInfo, correctness_test from ...utils import RemoteOpenAIServer MODELS = [ - EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", - is_matryoshka=True, - matryoshka_dimensions=[256]), + EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False) ] DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @@ -56,14 +50,6 @@ def server(model_info, dtype: str): "--chat-template", DUMMY_CHAT_TEMPLATE ] - - if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": - # Manually enable Matryoshka Embeddings - args.extend([ - "--trust_remote_code", "--hf_overrides", - '{"matryoshka_dimensions":[256]}' - ]) - with RemoteOpenAIServer(model_info.name, args) as remote_server: yield remote_server @@ -115,7 +101,7 @@ async def test_single_embedding(model_info: EmbedModelInfo, assert embeddings.usage.total_tokens > 0 vllm_outputs = [d.embedding for d in embeddings.data] - _correctness_test(hf_model, prompts, vllm_outputs) + correctness_test(hf_model, prompts, vllm_outputs) # test using token IDs input_tokens = [1, 1, 1, 1, 1] @@ -161,7 +147,7 @@ async def test_batch_embedding(model_info: EmbedModelInfo, assert embeddings.usage.total_tokens > 0 vllm_outputs = [d.embedding for d in embeddings.data] - _correctness_test(hf_model, prompts, vllm_outputs) + correctness_test(hf_model, prompts, vllm_outputs) # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], @@ -252,7 +238,7 @@ async def test_batch_base64_embedding(model_info: EmbedModelInfo, model=model_info.name, encoding_format="float") float_data = [d.embedding for d in responses_float.data] - _correctness_test(hf_model, prompts, float_data) + correctness_test(hf_model, prompts, float_data) # test base64 responses responses_base64 = await client.embeddings.create(input=prompts, @@ -263,13 +249,13 @@ async def test_batch_base64_embedding(model_info: EmbedModelInfo, base64_data.append( np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - _correctness_test(hf_model, prompts, base64_data) + correctness_test(hf_model, prompts, base64_data) # Default response is float32 decoded from base64 by OpenAI Client responses_default = await client.embeddings.create(input=prompts, model=model_info.name) default_data = [d.embedding for d in responses_default.data] - _correctness_test(hf_model, prompts, default_data) + correctness_test(hf_model, prompts, default_data) @pytest.mark.asyncio @@ -325,77 +311,3 @@ async def test_embedding_truncation( assert "error" in response.object assert "truncate_prompt_tokens value is greater than max_model_len. "\ "Please, select a smaller truncation size." in response.message - - -@pytest.mark.asyncio -async def test_matryoshka(model_info: EmbedModelInfo, - server: RemoteOpenAIServer, hf_model: HfRunner): - client = server.get_async_client() - - async def make_request_and_correctness_test(dimensions): - prompts = input_texts * 3 - - embedding_response = await client.embeddings.create( - model=model_info.name, - input=prompts, - dimensions=dimensions, - encoding_format="float", - ) - embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) - - assert embeddings.id is not None - assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) > 0 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens > 0 - assert embeddings.usage.total_tokens > 0 - - if dimensions is not None: - assert len(embeddings.data[0].embedding) == dimensions - - vllm_outputs = [d.embedding for d in embeddings.data] - _correctness_test(hf_model, prompts, vllm_outputs, dimensions) - - if model_info.is_matryoshka: - valid_dimensions: list[Optional[int]] = [None] - if model_info.matryoshka_dimensions is not None: - valid_dimensions += model_info.matryoshka_dimensions[:2] - - for dimensions in valid_dimensions: - await make_request_and_correctness_test(dimensions) - - invalid_dimensions: list[Optional[int]] = [-1] - if model_info.matryoshka_dimensions is not None: - assert 5 not in model_info.matryoshka_dimensions - invalid_dimensions.append(5) - - for dimensions in invalid_dimensions: - with pytest.raises(openai.BadRequestError): - await make_request_and_correctness_test(dimensions) - - else: - for dimensions in [None]: - await make_request_and_correctness_test(dimensions) - - for dimensions in [-1, 16]: - with pytest.raises(openai.BadRequestError): - await make_request_and_correctness_test(dimensions) - - -def _correctness_test(hf_model: HfRunner, - inputs, - vllm_outputs: Sequence[list[float]], - dimensions: Optional[int] = None): - - hf_outputs = hf_model.encode(inputs) - if dimensions: - hf_outputs = matryoshka_fy(hf_outputs, dimensions) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py new file mode 100644 index 000000000000..9f5a8c6839bc --- /dev/null +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. +""" + +from typing import Optional + +import openai +import pytest + +from vllm.entrypoints.openai.protocol import EmbeddingResponse + +from ...conftest import HfRunner +from ...models.embedding.utils import EmbedModelInfo, correctness_test +from ...utils import RemoteOpenAIServer + +MODELS = [ + EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + matryoshka_dimensions=[256]), +] + +input_texts = [ + "The chef prepared a delicious meal.", +] + + +@pytest.fixture(scope="module", params=MODELS) +def model_info(request): + return request.param + + +@pytest.fixture(scope="module", params=["bfloat16"]) +def dtype(request): + return request.param + + +@pytest.fixture(scope="module") +def server(model_info, dtype: str): + args = [ + "--task", + "embed", + # use half precision for speed and memory savings in CI environment + "--dtype", + dtype, + "--enforce-eager", + "--max-model-len", + "512" + ] + + if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": + # Manually enable Matryoshka Embeddings + args.extend([ + "--trust_remote_code", "--hf_overrides", + '{"matryoshka_dimensions":[256]}' + ]) + + with RemoteOpenAIServer(model_info.name, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def hf_model(hf_runner, model_info, dtype: str): + with hf_runner(model_info.name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + yield hf_model + + +@pytest.mark.asyncio +async def test_matryoshka(model_info: EmbedModelInfo, + server: RemoteOpenAIServer, hf_model: HfRunner): + client = server.get_async_client() + + async def make_request_and_correctness_test(dimensions): + prompts = input_texts * 3 + + embedding_response = await client.embeddings.create( + model=model_info.name, + input=prompts, + dimensions=dimensions, + encoding_format="float", + ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) > 0 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens > 0 + assert embeddings.usage.total_tokens > 0 + + if dimensions is not None: + assert len(embeddings.data[0].embedding) == dimensions + + vllm_outputs = [d.embedding for d in embeddings.data] + correctness_test(hf_model, prompts, vllm_outputs, dimensions) + + if model_info.is_matryoshka: + valid_dimensions: list[Optional[int]] = [None] + if model_info.matryoshka_dimensions is not None: + valid_dimensions += model_info.matryoshka_dimensions[:2] + + for dimensions in valid_dimensions: + await make_request_and_correctness_test(dimensions) + + invalid_dimensions: list[Optional[int]] = [-1] + if model_info.matryoshka_dimensions is not None: + assert 5 not in model_info.matryoshka_dimensions + invalid_dimensions.append(5) + + for dimensions in invalid_dimensions: + with pytest.raises(openai.BadRequestError): + await make_request_and_correctness_test(dimensions) + + else: + for dimensions in [None]: + await make_request_and_correctness_test(dimensions) + + for dimensions in [-1, 16]: + with pytest.raises(openai.BadRequestError): + await make_request_and_correctness_test(dimensions) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index 15ebd2549a45..cb216ce5eed7 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -46,3 +46,21 @@ class EmbedModelInfo(NamedTuple): matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" enable_test: bool = True + + +def correctness_test(hf_model, + inputs, + vllm_outputs: Sequence[list[float]], + dimensions: Optional[int] = None): + + hf_outputs = hf_model.encode(inputs) + if dimensions: + hf_outputs = matryoshka_fy(hf_outputs, dimensions) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) \ No newline at end of file From fadbfac8f8d85ca356b4559372d94d766635022f Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 15:19:02 +0800 Subject: [PATCH 10/13] fix --- tests/models/embedding/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index cb216ce5eed7..068f05652096 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -63,4 +63,4 @@ def correctness_test(hf_model, name_0="hf", name_1="vllm", tol=1e-2, - ) \ No newline at end of file + ) From 3558e9dedf7de7492fb12527913c60f455bc9ff5 Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 15:35:24 +0800 Subject: [PATCH 11/13] revert --- tests/entrypoints/openai/test_embedding.py | 193 +++++++++------------ 1 file changed, 84 insertions(+), 109 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 720aa4808c00..50b20e78c4c4 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -1,93 +1,68 @@ # SPDX-License-Identifier: Apache-2.0 import base64 -from http import HTTPStatus import numpy as np import openai import pytest +import pytest_asyncio import requests from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...conftest import HfRunner -from ...models.embedding.utils import EmbedModelInfo, correctness_test +from ...models.embedding.utils import correctness_test from ...utils import RemoteOpenAIServer -MODELS = [ - EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False) -] - +MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 - -input_texts = [ - "The chef prepared a delicious meal.", -] - - -@pytest.fixture(scope="module", params=MODELS) -def model_info(request): - return request.param - - -@pytest.fixture(scope="module", params=["bfloat16"]) -def dtype(request): - return request.param +DTYPE = "bfloat16" @pytest.fixture(scope="module") -def server(model_info, dtype: str): +def server(): args = [ "--task", "embed", # use half precision for speed and memory savings in CI environment "--dtype", - dtype, + DTYPE, "--enforce-eager", "--max-model-len", "512", "--chat-template", - DUMMY_CHAT_TEMPLATE + DUMMY_CHAT_TEMPLATE, ] - with RemoteOpenAIServer(model_info.name, args) as remote_server: + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + @pytest.fixture(scope="module") -def hf_model(hf_runner, model_info, dtype: str): - with hf_runner(model_info.name, dtype=dtype, +def hf_model(hf_runner): + with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as hf_model: yield hf_model @pytest.mark.asyncio -async def test_basic(model_info: EmbedModelInfo, server: RemoteOpenAIServer): - response = requests.get(server.url_for("health")) - assert response.status_code == HTTPStatus.OK - - client = server.get_async_client() - - models = await client.models.list() - models = models.data - - assert len(models) == 1 - served_model = models[0] - assert served_model.id == model_info.name - assert served_model.root == model_info.name - - -@pytest.mark.asyncio -async def test_single_embedding(model_info: EmbedModelInfo, - server: RemoteOpenAIServer, - hf_model: HfRunner): - client = server.get_async_client() +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, + model_name: str): + input_texts = [ + "The chef prepared a delicious meal.", + ] # test single embedding - prompts = input_texts embedding_response = await client.embeddings.create( - model=model_info.name, - input=prompts, + model=model_name, + input=input_texts, encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( @@ -95,18 +70,18 @@ async def test_single_embedding(model_info: EmbedModelInfo, assert embeddings.id is not None assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) > 0 + assert len(embeddings.data[0].embedding) == 384 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens > 0 - assert embeddings.usage.total_tokens > 0 + assert embeddings.usage.prompt_tokens == 11 + assert embeddings.usage.total_tokens == 11 vllm_outputs = [d.embedding for d in embeddings.data] - correctness_test(hf_model, prompts, vllm_outputs) + correctness_test(hf_model, input_texts, vllm_outputs) # test using token IDs input_tokens = [1, 1, 1, 1, 1] embedding_response = await client.embeddings.create( - model=model_info.name, + model=model_name, input=input_tokens, encoding_format="float", ) @@ -115,25 +90,24 @@ async def test_single_embedding(model_info: EmbedModelInfo, assert embeddings.id is not None assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) > 0 + assert len(embeddings.data[0].embedding) == 384 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 5 assert embeddings.usage.total_tokens == 5 @pytest.mark.asyncio -async def test_batch_embedding(model_info: EmbedModelInfo, - server: RemoteOpenAIServer, hf_model: HfRunner): - client = server.get_async_client() - +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, + model_name: str): # test list[str] - prompts = [ + input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." ] embedding_response = await client.embeddings.create( - model=model_info.name, - input=prompts, + model=model_name, + input=input_texts, encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( @@ -141,19 +115,19 @@ async def test_batch_embedding(model_info: EmbedModelInfo, assert embeddings.id is not None assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) > 0 + assert len(embeddings.data[0].embedding) == 384 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens > 0 - assert embeddings.usage.total_tokens > 0 + assert embeddings.usage.prompt_tokens == 33 + assert embeddings.usage.total_tokens == 33 vllm_outputs = [d.embedding for d in embeddings.data] - correctness_test(hf_model, prompts, vllm_outputs) + correctness_test(hf_model, input_texts, vllm_outputs) # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] embedding_response = await client.embeddings.create( - model=model_info.name, + model=model_name, input=input_tokens, encoding_format="float", ) @@ -162,17 +136,17 @@ async def test_batch_embedding(model_info: EmbedModelInfo, assert embeddings.id is not None assert len(embeddings.data) == 4 - assert len(embeddings.data[0].embedding) > 0 + assert len(embeddings.data[0].embedding) == 384 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.total_tokens == 17 @pytest.mark.asyncio -async def test_conversation_embedding(model_info: EmbedModelInfo, - server: RemoteOpenAIServer): - client = server.get_async_client() - +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_conversation_embedding(server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + model_name: str): messages = [{ "role": "user", "content": "The cat sat on the mat.", @@ -187,7 +161,7 @@ async def test_conversation_embedding(model_info: EmbedModelInfo, chat_response = requests.post( server.url_for("v1/embeddings"), json={ - "model": model_info.name, + "model": model_name, "messages": messages, "encoding_format": "float", }, @@ -195,8 +169,7 @@ async def test_conversation_embedding(model_info: EmbedModelInfo, chat_response.raise_for_status() chat_embeddings = EmbeddingResponse.model_validate(chat_response.json()) - tokenizer = get_tokenizer(tokenizer_name=model_info.name, - tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") prompt = tokenizer.apply_chat_template( messages, chat_template=DUMMY_CHAT_TEMPLATE, @@ -205,7 +178,7 @@ async def test_conversation_embedding(model_info: EmbedModelInfo, tokenize=False, ) completion_response = await client.embeddings.create( - model=model_info.name, + model=model_name, input=prompt, encoding_format="float", # To be consistent with chat @@ -223,55 +196,49 @@ async def test_conversation_embedding(model_info: EmbedModelInfo, @pytest.mark.asyncio -async def test_batch_base64_embedding(model_info: EmbedModelInfo, - server: RemoteOpenAIServer, - hf_model: HfRunner): - client = server.get_async_client() - - prompts = [ +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI, + model_name: str): + input_texts = [ "Hello my name is", "The best thing about vLLM is that it supports many different models" ] - # test float responses - responses_float = await client.embeddings.create(input=prompts, - model=model_info.name, + responses_float = await client.embeddings.create(input=input_texts, + model=model_name, encoding_format="float") float_data = [d.embedding for d in responses_float.data] - correctness_test(hf_model, prompts, float_data) + correctness_test(hf_model, input_texts, float_data) - # test base64 responses - responses_base64 = await client.embeddings.create(input=prompts, - model=model_info.name, + responses_base64 = await client.embeddings.create(input=input_texts, + model=model_name, encoding_format="base64") base64_data = [] for data in responses_base64.data: base64_data.append( np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - correctness_test(hf_model, prompts, base64_data) + + correctness_test(hf_model, input_texts, base64_data) # Default response is float32 decoded from base64 by OpenAI Client - responses_default = await client.embeddings.create(input=prompts, - model=model_info.name) + responses_default = await client.embeddings.create(input=input_texts, + model=model_name) default_data = [d.embedding for d in responses_default.data] - correctness_test(hf_model, prompts, default_data) + correctness_test(hf_model, input_texts, default_data) @pytest.mark.asyncio -async def test_embedding_truncation( - model_info: EmbedModelInfo, - server: RemoteOpenAIServer, -): - client = server.get_async_client() - +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] # test single embedding embedding_response = await client.embeddings.create( - model=model_info.name, + model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}) embeddings = EmbeddingResponse.model_validate( @@ -279,17 +246,17 @@ async def test_embedding_truncation( assert embeddings.id is not None assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) > 0 + assert len(embeddings.data[0].embedding) == 384 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens > 0 - assert embeddings.usage.total_tokens > 0 + assert embeddings.usage.prompt_tokens == 10 + assert embeddings.usage.total_tokens == 10 input_tokens = [ 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 ] embedding_response = await client.embeddings.create( - model=model_info.name, + model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}) embeddings = EmbeddingResponse.model_validate( @@ -297,17 +264,25 @@ async def test_embedding_truncation( assert embeddings.id is not None assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) > 0 + assert len(embeddings.data[0].embedding) == 384 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 10 assert embeddings.usage.total_tokens == 10 - # test_single_embedding_truncation_invalid + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, + model_name: str): + input_texts = [ + "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", + ] + with pytest.raises(openai.BadRequestError): response = await client.embeddings.create( - model=model_info.name, + model=model_name, input=input_texts, - extra_body={"truncate_prompt_tokens": 100000}) + extra_body={"truncate_prompt_tokens": 8193}) assert "error" in response.object assert "truncate_prompt_tokens value is greater than max_model_len. "\ "Please, select a smaller truncation size." in response.message From 6fd892e60bbe009ebb248edaf0a07d498383ea89 Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 15:39:33 +0800 Subject: [PATCH 12/13] ruff --- tests/models/embedding/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index 068f05652096..6d4df2c265c4 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -49,9 +49,9 @@ class EmbedModelInfo(NamedTuple): def correctness_test(hf_model, - inputs, - vllm_outputs: Sequence[list[float]], - dimensions: Optional[int] = None): + inputs, + vllm_outputs: Sequence[list[float]], + dimensions: Optional[int] = None): hf_outputs = hf_model.encode(inputs) if dimensions: From e8a41a74ec72f767009b8fcbca11d38bcf31f5eb Mon Sep 17 00:00:00 2001 From: noooop Date: Thu, 24 Apr 2025 16:11:12 +0800 Subject: [PATCH 13/13] fix test_jina --- tests/models/embedding/language/test_jina.py | 32 +++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tests/models/embedding/language/test_jina.py b/tests/models/embedding/language/test_jina.py index 881d0a75b158..1e234368f3b3 100644 --- a/tests/models/embedding/language/test_jina.py +++ b/tests/models/embedding/language/test_jina.py @@ -153,14 +153,24 @@ def test_matryoshka( with vllm_runner(model, task="embed", dtype=dtype, max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.encode( - example_prompts, - pooling_params=PoolingParams(dimensions=dimensions)) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) + matryoshka_dimensions = ( + vllm_model.model.llm_engine.model_config.matryoshka_dimensions) + assert matryoshka_dimensions is not None + + if dimensions not in matryoshka_dimensions: + with pytest.raises(ValueError): + vllm_model.encode( + example_prompts, + pooling_params=PoolingParams(dimensions=dimensions)) + else: + vllm_outputs = vllm_model.encode( + example_prompts, + pooling_params=PoolingParams(dimensions=dimensions)) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + )