Merged (29 commits)
2 changes: 1 addition & 1 deletion requirements/test.in
@@ -33,7 +33,7 @@ num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
-mteb>=1.38.11, <2 # required for mteb test
+mteb[bm25s]>=1.38.11, <2 # required for mteb test
 transformers==4.52.4
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
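Note on the new extra: MTEB's reranking evaluation relies on BM25 for first-stage candidate retrieval, which is what pulls `bm25s` (and its `pystemmer` dependency) into the lockfile below. A minimal sketch of the upstream `bm25s` API for orientation; the corpus and query here are illustrative, not taken from the test suite:

```python
import bm25s

# Index a toy corpus, then retrieve the top-k candidates for a query.
corpus = ["a cat is a feline", "a dog is a canine", "vLLM serves LLMs"]
retriever = bm25s.BM25()
retriever.index(bm25s.tokenize(corpus))

results, scores = retriever.retrieve(bm25s.tokenize("what is a cat?"), k=2)
# results holds indices into the corpus; scores holds BM25 relevance values
```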
6 changes: 6 additions & 0 deletions requirements/test.txt
@@ -51,6 +51,8 @@ black==24.10.0
     # via datamodel-code-generator
 blobfile==3.0.0
     # via -r requirements/test.in
+bm25s==0.2.13
+    # via mteb
 boto3==1.35.57
     # via tensorizer
 botocore==1.35.57
@@ -344,6 +346,7 @@ numpy==1.26.4
     #   -r requirements/test.in
     #   accelerate
     #   bitsandbytes
+    #   bm25s
     #   contourpy
     #   cupy-cuda12x
     #   datasets
@@ -534,6 +537,8 @@ pyparsing==3.2.0
     # via matplotlib
 pyrate-limiter==3.7.0
     # via schemathesis
+pystemmer==3.0.0
+    # via mteb
 pytablewriter==1.2.0
     # via lm-eval
 pytest==8.3.3
@@ -668,6 +673,7 @@ scikit-learn==1.5.2
     #   sentence-transformers
 scipy==1.13.1
     # via
+    #   bm25s
     #   librosa
     #   mteb
     #   scikit-learn
12 changes: 9 additions & 3 deletions tests/conftest.py
@@ -727,8 +727,12 @@ def encode(self, prompts: list[str], *args,
                **kwargs) -> list[list[torch.Tensor]]:
         return self.model.encode(prompts, *args, **kwargs)
 
-    def predict(self, prompts: list[list[str]]) -> torch.Tensor:
-        return self.model.predict(prompts, convert_to_tensor=True)
+    def predict(self, prompts: list[list[str]], *args,
+                **kwargs) -> torch.Tensor:
+        return self.model.predict(prompts,
+                                  *args,
+                                  convert_to_tensor=True,
+                                  **kwargs)
 
     def __enter__(self):
         return self
@@ -1037,8 +1041,10 @@ def score(
         self,
         text_1: Union[str, list[str]],
         text_2: Union[str, list[str]],
+        *args,
+        **kwargs,
     ) -> list[float]:
-        req_outputs = self.model.score(text_1, text_2)
+        req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
         return [req_output.outputs.score for req_output in req_outputs]
 
     def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
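Both wrapper changes follow the same pattern: forward extra positional and keyword arguments to the underlying model so the MTEB harness can pass runtime options through, while the wrapper keeps pinning its own defaults. A hedged stand-alone sketch of that pattern (this is not the project's actual test runner class; `batch_size` and `show_progress_bar` are standard `sentence-transformers` `CrossEncoder.predict` options):

```python
from sentence_transformers import CrossEncoder

class CrossEncoderWrapper:
    """Minimal stand-in for the test wrapper above."""

    def __init__(self, model_name: str):
        self.model = CrossEncoder(model_name)

    def predict(self, prompts, *args, **kwargs):
        # Caller-supplied kwargs flow through unchanged; the wrapper
        # still forces convert_to_tensor=True itself.
        return self.model.predict(prompts, *args,
                                  convert_to_tensor=True, **kwargs)

wrapper = CrossEncoderWrapper("cross-encoder/ms-marco-MiniLM-L-6-v2")
pairs = [["what is panda?", "The giant panda is a bear."],
         ["what is panda?", "Paris is the capital of France."]]
scores = wrapper.predict(pairs, batch_size=8, show_progress_bar=False)
```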
@@ -7,34 +7,30 @@
 from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
                                                       MTEB_EMBED_TOL,
                                                       OpenAIClientMtebEncoder,
-                                                      run_mteb_embed_task,
-                                                      run_mteb_embed_task_st)
+                                                      run_mteb_embed_task)
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
-MODEL_NAME = "BAAI/bge-m3"
-DTYPE = "float16"
-MAIN_SCORE = 0.7873427091972599
+MODEL_NAME = "intfloat/e5-small"
+MAIN_SCORE = 0.7422994752439667
 
 
 @pytest.fixture(scope="module")
 def server():
     args = [
-        "--task", "embed", "--dtype", DTYPE, "--enforce-eager",
-        "--max-model-len", "512"
+        "--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 
-def test_mteb(server):
+def test_mteb_embed(server):
     client = server.get_client()
     encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
     vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
-    st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
-        MODEL_NAME, MTEB_EMBED_TASKS)
+    st_main_score = MAIN_SCORE
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)
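For context, `OpenAIClientMtebEncoder` in the embed test above drives the standard OpenAI-compatible embeddings endpoint that the server fixture exposes. A hedged sketch of the underlying call (host, port, and input string are illustrative; vLLM serves `/v1/embeddings` when launched with `--task embed`):

```python
from openai import OpenAI

# Point the stock OpenAI client at the local vLLM server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.embeddings.create(model="intfloat/e5-small",
                                input=["what is the capital of france?"])
vector = resp.data[0].embedding  # one float vector per input string
```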
59 changes: 59 additions & 0 deletions tests/entrypoints/openai/correctness/test_mteb_score.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
+import pytest
+
+# yapf conflicts with isort for this block
+# yapf: disable
+from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
+                                                      MTEB_RERANK_TASKS,
+                                                      MTEB_RERANK_TOL,
+                                                      RerankClientMtebEncoder,
+                                                      ScoreClientMtebEncoder,
+                                                      run_mteb_rerank)
+# yapf: enable
+from tests.utils import RemoteOpenAIServer
+
+os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
+
+MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+MAIN_SCORE = 0.33437
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+def test_mteb_score(server):
+    url = server.url_for("score")
+    encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
+    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
+                                      MTEB_RERANK_LANGS)
+    st_main_score = MAIN_SCORE
+
+    print("VLLM main score: ", vllm_main_score)
+    print("SentenceTransformer main score: ", st_main_score)
+    print("Difference: ", st_main_score - vllm_main_score)
+
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+
+
+def test_mteb_rerank(server):
+    url = server.url_for("rerank")
+    encoder = RerankClientMtebEncoder(MODEL_NAME, url)
+    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
+                                      MTEB_RERANK_LANGS)
+    st_main_score = MAIN_SCORE
+
+    print("VLLM main score: ", vllm_main_score)
+    print("SentenceTransformer main score: ", st_main_score)
+    print("Difference: ", st_main_score - vllm_main_score)
+
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
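The two new tests run the same MTEB reranking workload through vLLM's two scoring-style endpoints. A hedged sketch of the raw requests behind `ScoreClientMtebEncoder` and `RerankClientMtebEncoder` (host and port are illustrative; field names follow vLLM's score/rerank API):

```python
import requests

BASE = "http://localhost:8000"
MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# /score: relevance of text_1 against each entry of text_2
score = requests.post(f"{BASE}/score", json={
    "model": MODEL,
    "text_1": "what is panda?",
    "text_2": ["The giant panda is a bear.", "Paris is in France."],
}).json()

# /rerank: the same pairs phrased as a query plus candidate documents
rerank = requests.post(f"{BASE}/rerank", json={
    "model": MODEL,
    "query": "what is panda?",
    "documents": ["The giant panda is a bear.", "Paris is in France."],
}).json()
```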