@@ -131,7 +131,7 @@ impl Metrics {
131131 "Input sequence length in tokens" ,
132132 )
133133 . buckets ( vec ! [
134- 50.0 , 100.0 , 500.0 , 1000.0 , 2000.0 , 4000.0 , 8000.0 , 16000.0 , 32000.0 , 64000.0 ,
134+ 0.0 , 50.0 , 100.0 , 500.0 , 1000.0 , 2000.0 , 4000.0 , 8000.0 , 16000.0 , 32000.0 , 64000.0 ,
135135 128000.0 ,
136136 ] ) ,
137137 & [ "model" ] ,
@@ -144,7 +144,7 @@ impl Metrics {
144144 "Output sequence length in tokens" ,
145145 )
146146 . buckets ( vec ! [
147- 50.0 , 100.0 , 500.0 , 1000.0 , 2000.0 , 4000.0 , 8000.0 , 16000.0 , 32000.0 ,
147+ 0.0 , 50.0 , 100.0 , 500.0 , 1000.0 , 2000.0 , 4000.0 , 8000.0 , 16000.0 , 32000.0 ,
148148 ] ) ,
149149 & [ "model" ] ,
150150 )
@@ -156,8 +156,8 @@ impl Metrics {
156156 "Time to first token in seconds" ,
157157 )
158158 . buckets ( vec ! [
159- 0.001 , 0.005 , 0.01 , 0.025 , 0.05 , 0.1 , 0.25 , 0.5 , 1.0 , 2.0 , 5.0 , 10.0 , 30.0 , 60.0 ,
160- 120.0 , 240.0 , 480.0 ,
159+ 0.0 , 0.001 , 0.005 , 0.01 , 0.025 , 0.05 , 0.1 , 0.25 , 0.5 , 1.0 , 2.0 , 5.0 , 10.0 , 30.0 ,
160+ 60.0 , 120.0 , 240.0 , 480.0 ,
161161 ] ) ,
162162 & [ "model" ] ,
163163 )
@@ -169,7 +169,7 @@ impl Metrics {
169169 "Inter-token latency in seconds" ,
170170 )
171171 . buckets ( vec ! [
172- 0.001 , 0.005 , 0.01 , 0.015 , 0.02 , 0.025 , 0.05 , 0.1 , 0.25 , 0.5 , 1.0 , 2.0 ,
172+ 0.0 , 0.001 , 0.005 , 0.01 , 0.015 , 0.02 , 0.025 , 0.05 , 0.1 , 0.25 , 0.5 , 1.0 , 2.0 ,
173173 ] ) ,
174174 & [ "model" ] ,
175175 )
@@ -316,7 +316,7 @@ impl InflightGuard {
316316
317317 pub ( crate ) fn observe_response ( & mut self , isl : usize , num_tokens : usize ) {
318318 if self . first_token {
319- // NOTE: when there are multiple tokens in the first response,
319+ // NOTE: when there are multiple tokens in the first response,
320320 // we use the full response time as TTFT and ignore the ITL
321321 self . first_token = false ;
322322
0 commit comments