@@ -411,6 +411,19 @@ def __init__(self,
         self.histogram_inter_token_latency = make_per_engine(
             histogram_inter_token_latency, engine_indexes, model_name)
 
+        histogram_request_time_per_output_token = self._histogram_cls(
+            name="vllm:request_time_per_output_token_seconds",
+            documentation=
+            "Histogram of time_per_output_token_seconds per request.",
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ],
+            labelnames=labelnames)
+        self.histogram_request_time_per_output_token = make_per_engine(
+            histogram_request_time_per_output_token, engine_indexes,
+            model_name)
+
         request_latency_buckets = [
             0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
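
For context, here is a minimal standalone sketch of the metric this hunk registers, using `prometheus_client` directly rather than vLLM's `self._histogram_cls` / `make_per_engine` wrappers; the label names and values below are assumptions for illustration only:

```python
from prometheus_client import Histogram, generate_latest

# Same metric name, documentation, and buckets as the hunk above;
# labelnames are assumed here, not taken from vLLM's source.
histogram_request_time_per_output_token = Histogram(
    name="vllm:request_time_per_output_token_seconds",
    documentation="Histogram of time_per_output_token_seconds per request.",
    buckets=[
        0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
        1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
    ],
    labelnames=["model_name", "engine"])

# Record one finished request whose mean time per output token was 42 ms.
histogram_request_time_per_output_token.labels(
    model_name="my-model", engine="0").observe(0.042)

print(generate_latest().decode())
```

Prometheus histogram buckets are cumulative, so the 0.042 s observation increments the `le="0.05"` bucket and every larger one, along with the `_sum` and `_count` series.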
@@ -583,6 +596,8 @@ def record(self,
             finished_request.num_prompt_tokens)
         self.histogram_num_generation_tokens_request[engine_idx].observe(
             finished_request.num_generation_tokens)
+        self.histogram_request_time_per_output_token[engine_idx].observe(
+            finished_request.mean_time_per_output_token)
         if finished_request.max_tokens_param:
             self.histogram_max_tokens_request[engine_idx].observe(
                 finished_request.max_tokens_param)
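
The value observed here is `finished_request.mean_time_per_output_token`, computed elsewhere in vLLM's stats plumbing. As a hedged sketch of the quantity being recorded (the helper below is hypothetical, not vLLM's actual implementation), mean TPOT excludes prefill by dividing decode time over the request's decode intervals:

```python
# Hypothetical helper illustrating the quantity observed above.
def mean_time_per_output_token(first_token_ts: float, last_token_ts: float,
                               num_generation_tokens: int) -> float:
    # The first token is attributed to prefill (TTFT), so a request with
    # N output tokens spans N - 1 decode intervals; guard against
    # single-token requests to avoid division by zero.
    decode_steps = max(num_generation_tokens - 1, 1)
    return (last_token_ts - first_token_ts) / decode_steps
```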