@@ -29,6 +29,7 @@ class RequestStateStats:
     num_generation_tokens: int = 0
     arrival_time: float = 0.0
     first_scheduled_time: float = 0.0
+    first_token_time: float = 0.0
     last_token_time: float = 0.0
 
 
@@ -40,6 +41,8 @@ class FinishedRequestStats:
     e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
+    inference_time: float = 0.0
+    decode_time: float = 0.0
 
 
 class IterationStats:
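For orientation: the four timestamps tracked in `RequestStateStats` partition a request's lifetime, and the new `FinishedRequestStats` fields are the derived intervals. A sketch of the intended identities (names from this diff; `finish_time` stands in for the `time.time()` taken in `update_from_finished_request`):

```python
# arrival ─queue─▶ first_scheduled ─prefill─▶ first_token ─decode─▶ finish
#
# queue_time     = first_scheduled_time - arrival_time
# prefill_time   = first_token_time     - first_scheduled_time
# decode_time    = finish_time          - first_token_time
# inference_time = finish_time          - first_scheduled_time
#                = prefill_time + decode_time
# e2e_latency    = finish_time          - arrival_time
```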
@@ -53,6 +56,7 @@ def __init__(self, log_stats: bool):
         self.time_to_first_tokens_iter: List[float] = []
         self.time_per_output_tokens_iter: List[float] = []
         self.queue_times_iter: List[float] = []
+        self.prefill_times_iter: List[float] = []
 
     def update_from_output(self, output: "EngineCoreOutput",
                            is_prefilling: bool, prompt_len: int,
@@ -72,8 +76,12 @@ def update_from_output(self, output: "EngineCoreOutput",
             # iff num_computed_tokens == num_tokens).
             assert (num_new_generation_tokens > 0)
             self.num_prompt_tokens += prompt_len
+            request_state_stats.first_token_time = now
 
             self.time_to_first_tokens_iter.append(last_token_latency)
+
+            prefill_time = now - request_state_stats.first_scheduled_time
+            self.prefill_times_iter.append(prefill_time)
         else:
             self.time_per_output_tokens_iter.append(last_token_latency)
 
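Note that `first_token_time` is recorded on the per-request `RequestStateStats` rather than on `IterationStats`: an `IterationStats` is per-iteration, while `decode_time` is only computed when the request finishes. The prefill branch also runs exactly once per request (the assert above relies on EngineCore not streaming outputs for partially completed prefills), so each request contributes a single `prefill_time` sample. A minimal, self-contained sketch of that behavior, using simplified stand-ins rather than vLLM's actual classes:

```python
import time
from dataclasses import dataclass, field
from typing import List

@dataclass
class ReqState:                      # stand-in for RequestStateStats
    first_scheduled_time: float = 0.0
    first_token_time: float = 0.0

@dataclass
class IterStats:                     # stand-in for IterationStats
    prefill_times_iter: List[float] = field(default_factory=list)

    def on_output(self, is_prefilling: bool, req: ReqState) -> None:
        now = time.time()
        if is_prefilling:
            # First token: remember the timestamp on the *request* state
            # (it must outlive this iteration) and record prefill latency.
            req.first_token_time = now
            self.prefill_times_iter.append(now - req.first_scheduled_time)

req = ReqState(first_scheduled_time=time.time())
stats = IterStats()
stats.on_output(is_prefilling=True, req=req)   # one prefill sample
stats.on_output(is_prefilling=False, req=req)  # decode step: no new sample
assert len(stats.prefill_times_iter) == 1
```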
@@ -92,8 +100,14 @@ def update_from_finished_request(self, finish_reason: "FinishReason",
                                      request_state_stats: RequestStateStats):
         now = time.time()
         e2e_latency = now - request_state_stats.arrival_time
-
-        self.finished_requests.append(
-            FinishedRequestStats(finish_reason, e2e_latency,
-                                 len(request_output.prompt_token_ids),
-                                 request_state_stats.num_generation_tokens))
+        inference_time = now - request_state_stats.first_scheduled_time
+        decode_time = now - request_state_stats.first_token_time
+
+        finished_req = \
+            FinishedRequestStats(finish_reason=finish_reason,
+                                 e2e_latency=e2e_latency,
+                                 num_prompt_tokens=len(request_output.prompt_token_ids),
+                                 num_generation_tokens=request_state_stats.num_generation_tokens,
+                                 inference_time=inference_time,
+                                 decode_time=decode_time)
+        self.finished_requests.append(finished_req)
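To sanity-check the arithmetic, a worked example with invented timestamps showing how the finished-request fields decompose (values are purely illustrative):

```python
arrival_time         = 1000.0   # request received
first_scheduled_time = 1000.5   # left the queue
first_token_time     = 1001.0   # prefill done, first token emitted
now                  = 1003.0   # request finished

e2e_latency    = now - arrival_time                       # 3.0
inference_time = now - first_scheduled_time               # 2.5
decode_time    = now - first_token_time                   # 2.0
prefill_time   = first_token_time - first_scheduled_time  # 0.5
queue_time     = first_scheduled_time - arrival_time      # 0.5

assert inference_time == prefill_time + decode_time
assert e2e_latency == queue_time + inference_time
```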