@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import base64
+import json
 
 import numpy as np
 import openai
@@ -15,11 +16,17 @@
 from tests.models.utils import check_embeddings_close
 from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import (
-    EMBED_DTYPE_TO_TORCH_DTYPE,
     EmbeddingResponse,
     PoolingResponse,
 )
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.serial_utils import (
+    EMBED_DTYPE_TO_TORCH_DTYPE,
+    ENDIANNESS,
+    MetadataItem,
+    binary2tensor,
+    decode_pooling_output,
+)
 
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
@@ -250,8 +257,8 @@ async def test_batch_base64_embedding(
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_base64_embed_dtype(
-    hf_model, server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
+async def test_base64_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
 ):
     input_texts = [
         "The best thing about vLLM is that it supports many different models",
@@ -262,59 +269,100 @@ async def test_base64_embed_dtype(
     )
     float_data = [d.embedding for d in responses_float.data]
 
-    for embed_dtype, torch_dtype in EMBED_DTYPE_TO_TORCH_DTYPE.items():
-        responses_base64 = requests.post(
-            server.url_for("/v1/embeddings"),
-            json={
-                "model": model_name,
-                "input": input_texts,
-                "encoding_format": "base64",
-                "embed_dtype": embed_dtype,
-            },
-        )
-
-        base64_data = []
-        for data in responses_base64.json()["data"]:
-            base64_data.append(
-                torch.frombuffer(base64.b64decode(data["embedding"]), dtype=torch_dtype)
-                .to(torch.float32)
-                .tolist()
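+    # Round-trip sanity check: request every supported (embed_dtype, endianness)
+    # pairing, decode the base64 payload back into a tensor via binary2tensor,
+    # and compare against the plain-float response within tolerance.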
+    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
+        for endianness in ENDIANNESS:
+            responses_base64 = requests.post(
+                server.url_for("/v1/embeddings"),
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "base64",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
             )
 
-        check_embeddings_close(
-            embeddings_0_lst=float_data,
-            embeddings_1_lst=base64_data,
-            name_0="float_data",
-            name_1="base64_data",
-            tol=1e-2,
-        )
+            base64_data = []
+            for data in responses_base64.json()["data"]:
+                binary = base64.b64decode(data["embedding"])
+                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
+                base64_data.append(tensor.to(torch.float32).tolist())
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=base64_data,
+                name_0="float_data",
+                name_1="base64_data",
+                tol=1e-2,
+            )
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_base64_embed_dtype_not_supported(
-    hf_model, server: RemoteOpenAIServer, model_name: str
+async def test_bytes_embed_dtype_and_endianness(
+    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
 ):
     input_texts = [
         "The best thing about vLLM is that it supports many different models",
     ]
 
-    bad_embed_dtype = "bad_embed_dtype"
+    responses_float = await client.embeddings.create(
+        input=input_texts, model=model_name, encoding_format="float"
+    )
+    float_data = [d.embedding for d in responses_float.data]
+
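+    # Same closeness check as the base64 test above, but exercising the
+    # raw-bytes transport for every supported dtype/endianness combination.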
+    for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()):
+        for endianness in ENDIANNESS:
+            responses_bytes = requests.post(
+                server.url_for("/v1/embeddings"),
+                json={
+                    "model": model_name,
+                    "input": input_texts,
+                    "encoding_format": "bytes",
+                    "embed_dtype": embed_dtype,
+                    "endianness": endianness,
+                },
+            )
+
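+            # With encoding_format="bytes", the tensors arrive in the raw HTTP
+            # body while per-tensor layout metadata travels JSON-encoded in the
+            # "metadata" response header; decode_pooling_output reassembles the
+            # tensors from the two. (MetadataItem's fields are assumed from usage.)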
+            metadata = json.loads(responses_bytes.headers["metadata"])
+            body = responses_bytes.content
+            items = [MetadataItem(**x) for x in metadata["data"]]
+
+            bytes_data = decode_pooling_output(items=items, body=body)
+            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]
+
+            check_embeddings_close(
+                embeddings_0_lst=float_data,
+                embeddings_1_lst=bytes_data,
+                name_0="float_data",
+                name_1="bytes_data",
+                tol=1e-2,
+            )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
+async def test_params_not_supported(
+    server: RemoteOpenAIServer, model_name: str, param_name: str
+):
+    input_texts = [
+        "The best thing about vLLM is that it supports many different models",
+    ]
 
     responses_base64 = requests.post(
         server.url_for("/v1/embeddings"),
         json={
             "model": model_name,
             "input": input_texts,
             "encoding_format": "base64",
-            "embed_dtype": bad_embed_dtype,
+            param_name: f"bad_{param_name}",
         },
     )
 
     assert responses_base64.status_code == 400
-    assert responses_base64.json()["error"]["message"].startswith(
-        f"embed_dtype={bad_embed_dtype!r} is not supported."
-    )
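+    # "literal_error" is pydantic's error type for a value outside a Literal,
+    # so the message should name the error kind and echo the offending value.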
+    assert "literal_error" in responses_base64.json()["error"]["message"]
+    assert f"bad_{param_name}" in responses_base64.json()["error"]["message"]
 
 
 @pytest.mark.asyncio