 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODELS = {
+    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
+}
 PREV_MINOR_VERSION = version._prev_minor_version()
 
 
+@pytest.fixture(scope="module", params=list(MODELS.keys()))
+def model_key(request):
+    yield request.param
+
+
 @pytest.fixture(scope="module")
 def default_server_args():
     return [
@@ -45,11 +53,12 @@ def default_server_args():
4553 f"--show-hidden-metrics-for-version={ PREV_MINOR_VERSION } " ,
4654 ],
4755)
48- def server (default_server_args , request ):
56+ def server (model_key , default_server_args , request ):
4957 if request .param :
5058 default_server_args .append (request .param )
5159
52- with RemoteOpenAIServer (MODEL_NAME , default_server_args ) as remote_server :
60+ model_name = MODELS [model_key ]
61+ with RemoteOpenAIServer (model_name , default_server_args ) as remote_server :
5362 yield remote_server
5463
5564
@@ -60,73 +69,80 @@ async def client(server):
 
 
 _PROMPT = "Hello my name is Robert and I love magic"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
-
-_NUM_REQUESTS = 10
-_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
-_NUM_GENERATION_TOKENS_PER_REQUEST = 10
-
-# {metric_family: [(suffix, expected_value)]}
-EXPECTED_VALUES = {
-    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:time_per_output_token_seconds": [
-        ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
-    ],
-    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prompt_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_generation_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_max_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:iteration_tokens_total": [
-        (
-            "_sum",
-            _NUM_REQUESTS
-            * (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ),
-        ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-    ],
-    "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens": [
-        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
-    ],
-    "vllm:request_success": [("_total", _NUM_REQUESTS)],
-}
+_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+
+def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
+    num_prompt_tokens = len(prompt_ids)
+
+    # {metric_family: [(suffix, expected_value)]}
+    return {
+        "vllm:time_to_first_token_seconds": [("_count", num_requests)],
+        "vllm:time_per_output_token_seconds": [
+            ("_count", num_requests * (max_tokens - 1))
+        ],
+        "vllm:e2e_request_latency_seconds": [("_count", num_requests)],
+        "vllm:request_queue_time_seconds": [("_count", num_requests)],
+        "vllm:request_inference_time_seconds": [("_count", num_requests)],
+        "vllm:request_prefill_time_seconds": [("_count", num_requests)],
+        "vllm:request_decode_time_seconds": [("_count", num_requests)],
+        "vllm:request_prompt_tokens": [
+            ("_sum", num_requests * num_prompt_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:request_generation_tokens": [
+            ("_sum", num_requests * max_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:request_params_n": [("_count", num_requests)],
+        "vllm:request_params_max_tokens": [
+            ("_sum", num_requests * max_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:iteration_tokens_total": [
+            (
+                "_sum",
+                num_requests * (num_prompt_tokens + max_tokens),
+            ),
+            ("_count", num_requests * max_tokens),
+        ],
+        "vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
+        "vllm:generation_tokens": [("_total", num_requests * max_tokens)],
+        "vllm:request_success": [("_total", num_requests)],
+    }
 
 
 @pytest.mark.asyncio
 async def test_metrics_counts(
     server: RemoteOpenAIServer,
     client: openai.AsyncClient,
+    model_key: str,
 ):
-    for _ in range(_NUM_REQUESTS):
+    if model_key == "multimodal":
+        pytest.skip("Unnecessary test")
+
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+    num_requests = 10
+    max_tokens = 10
+
+    for _ in range(num_requests):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
-            model=MODEL_NAME,
-            prompt=_TOKENIZED_PROMPT,
-            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST,
+            model=model_name,
+            prompt=prompt_ids,
+            max_tokens=max_tokens,
         )
 
     response = requests.get(server.url_for("metrics"))
     print(response.text)
     assert response.status_code == HTTPStatus.OK
 
     # Loop over all expected metric_families
-    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (metric_family not in EXPECTED_METRICS_V1) or (
+    expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
+    for metric_family, suffix_values_list in expected_values.items():
+        if metric_family not in EXPECTED_METRICS_V1 or (
             not server.show_hidden_metrics
             and metric_family in HIDDEN_DEPRECATED_METRICS
         ):
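The hunk above replaces the module-level EXPECTED_VALUES table with _get_expected_values(), which returns a {metric_family: [(suffix, expected_value)]} mapping built from the request count, the tokenized prompt, and max_tokens. The assertion loop that consumes this mapping is elided from the diff; the snippet below is a minimal sketch, not code from this commit, of how such a mapping can be checked against the Prometheus text exposition served at /metrics. It assumes the prometheus_client package is available, and check_metric_values is an illustrative name.

# Illustrative sketch only -- not part of this commit.
from prometheus_client.parser import text_string_to_metric_families


def check_metric_values(metrics_text, expected):
    # Index parsed metric families by name, e.g. "vllm:request_success".
    families = {m.name: m for m in text_string_to_metric_families(metrics_text)}
    for family, suffix_values in expected.items():
        assert family in families, f"missing metric family: {family}"
        # Each sample is named <family><suffix>, e.g. "..._count" or "..._sum".
        samples = {s.name: s.value for s in families[family].samples}
        for suffix, value in suffix_values:
            assert samples.get(family + suffix) == value, (family, suffix)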
@@ -217,6 +233,11 @@ async def test_metrics_counts(
217233 "vllm:request_decode_time_seconds_count" ,
218234]
219235
236+ EXPECTED_METRICS_MM = [
237+ "vllm:mm_cache_queries" ,
238+ "vllm:mm_cache_hits" ,
239+ ]
240+
220241HIDDEN_DEPRECATED_METRICS : list [str ] = [
221242 "vllm:gpu_cache_usage_perc" ,
222243 "vllm:gpu_prefix_cache_queries" ,
@@ -231,19 +252,43 @@ async def test_metrics_counts(
 async def test_metrics_exist(
     server: RemoteOpenAIServer,
     client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+
     # sending a request triggers the metrics to be logged.
-    await client.completions.create(
-        model=MODEL_NAME,
-        prompt="Hello, my name is",
-        max_tokens=5,
-        temperature=0.0,
-    )
+    if model_key == "text":
+        await client.completions.create(
+            model=model_name,
+            prompt="Hello, my name is",
+            max_tokens=5,
+            temperature=0.0,
+        )
+    else:
+        await client.chat.completions.create(
+            model=model_name,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": _IMAGE_URL}},
+                        {"type": "text", "text": "What's in this image?"},
+                    ],
+                }
+            ],
+            max_tokens=5,
+            temperature=0.0,
+        )
 
     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK
 
-    for metric in EXPECTED_METRICS_V1:
+    expected_metrics = EXPECTED_METRICS_V1
+    if model_key == "multimodal":
+        # NOTE: Don't use in-place assignment
+        expected_metrics = expected_metrics + EXPECTED_METRICS_MM
+
+    for metric in expected_metrics:
         if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
@@ -253,9 +298,14 @@ async def test_metrics_exist(
 async def test_abort_metrics_reset(
     server: RemoteOpenAIServer,
     client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server
+        server,
     )
 
     # Expect no running requests or kvcache usage
@@ -268,8 +318,8 @@ async def test_abort_metrics_reset(
     for _ in range(3):
         task = asyncio.create_task(
             client.completions.create(
-                model=MODEL_NAME,
-                prompt=_TOKENIZED_PROMPT,
+                model=model_name,
+                prompt=prompt_ids,
                 max_tokens=100,  # Long generation to give time to abort
                 temperature=0.0,
             )
@@ -281,7 +331,7 @@ async def test_abort_metrics_reset(
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server
+        server,
     )
 
     # Expect running requests and kvcache usage
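Because the new model_key fixture is itself parametrized, it composes with the already-parametrized server fixture: pytest runs every server-argument variant once per entry in MODELS, and the server now restarts whenever the model parameter changes. The standalone sketch below is not part of this diff and uses placeholder flag and model values; it only illustrates how two module-scoped parametrized fixtures multiply into a test matrix.

# Standalone illustration of the fixture cross product; placeholder values.
import pytest

MODELS = {"text": "model-a", "multimodal": "model-b"}


@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
    yield request.param


@pytest.fixture(scope="module", params=["", "--placeholder-flag"])
def server_variant(request):
    return request.param


def test_matrix(model_key, server_variant):
    # Collected 2 x 2 = 4 times; a module-scoped fixture is torn down and
    # rebuilt only when its own parameter changes, so an expensive resource
    # is reused across tests sharing the same (model_key, server_variant).
    assert model_key in MODELS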