Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
uint64_t n_tokens_predicted_total = 0;
uint64_t t_tokens_generation_total = 0;

uint64_t n_past_max = 0;

uint64_t n_prompt_tokens_processed = 0;
uint64_t t_prompt_processing = 0;

Expand All @@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
{ "n_tokens_predicted_total", n_tokens_predicted_total },
{ "t_prompt_processing_total", t_prompt_processing_total },

{ "n_past_max", n_past_max },

{ "n_prompt_tokens_processed", n_prompt_tokens_processed },
{ "t_prompt_processing", t_prompt_processing },
{ "n_tokens_predicted", n_tokens_predicted },
Expand Down Expand Up @@ -1587,6 +1591,8 @@ struct server_metrics {
uint64_t n_tokens_predicted_total = 0;
uint64_t t_tokens_generation_total = 0;

uint64_t n_past_max = 0;

uint64_t n_prompt_tokens_processed = 0;
uint64_t t_prompt_processing = 0;

Expand All @@ -1605,6 +1611,10 @@ struct server_metrics {
n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
t_prompt_processing += slot.t_prompt_processing;
t_prompt_processing_total += slot.t_prompt_processing;

if (slot.n_past > 0) {
n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
}
}

void on_prediction(const server_slot & slot) {
Expand All @@ -1620,6 +1630,9 @@ struct server_metrics {
if (slot.is_processing()) {
n_busy_slots_total++;
}
if (slot.n_past > 0) {
n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
}
}
}

Expand Down Expand Up @@ -2875,6 +2888,8 @@ struct server_context {
res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;
res->t_tokens_generation_total = metrics.t_tokens_generation_total;

res->n_past_max = metrics.n_past_max;

res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
res->t_prompt_processing = metrics.t_prompt_processing;
res->n_tokens_predicted = metrics.n_tokens_predicted;
Expand Down Expand Up @@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
{"name", "n_decode_total"},
{"help", "Total number of llama_decode() calls"},
{"value", res_metrics->n_decode_total}
}, {
{"name", "n_past_max"},
{"help", "Largest observed n_past."},
{"value", res_metrics->n_past_max}
}, {
{"name", "n_busy_slots_per_decode"},
{"help", "Average number of busy slots per llama_decode() call"},
Expand Down
Loading