[Metrics] Add ttft and itl to server metrics (#2510)
* Add ttft and itl to server metrics

* Fix ITL

* Fix clang-format

* Keep mobile and interface.chat untouched
yongwww authored Jun 5, 2024
1 parent 64e33c5 commit 3bdc8f6
Showing 13 changed files with 89 additions and 74 deletions.
22 changes: 11 additions & 11 deletions cpp/serve/engine.cc
@@ -116,17 +116,17 @@ class MockEchoEngineImpl : public Engine {
// precompute the stream back results and store them in the request_map
request = Request::FromUntokenized(request, tokenizer_);
std::vector<RequestStreamOutput> outputs;
- int64_t num_output_tokens = 0;
- int64_t num_input_tokens = 0;
+ int64_t completion_tokens = 0;
+ int64_t prompt_tokens = 0;

for (Data input : request->inputs) {
// only stream back token data
if (auto* token_data = input.as<TokenDataNode>()) {
for (int64_t token_id : token_data->token_ids) {
- num_input_tokens += 1;
- num_output_tokens += 1;
+ prompt_tokens += 1;
+ completion_tokens += 1;
if (request->generation_cfg->max_tokens == -1 ||
- num_output_tokens <= request->generation_cfg->max_tokens) {
+ completion_tokens <= request->generation_cfg->max_tokens) {
outputs.push_back(RequestStreamOutput(
request->id,
std::vector<IntTuple>(request->generation_cfg->n, IntTuple({token_id})),
@@ -140,7 +140,7 @@ class MockEchoEngineImpl : public Engine {
// output go beyond max tokens
String finish_reason = "stop";
if (request->generation_cfg->max_tokens != -1 &&
- num_input_tokens > request->generation_cfg->max_tokens) {
+ prompt_tokens > request->generation_cfg->max_tokens) {
finish_reason = "length";
}
Array<IntTuple> group_delta_token_ids;
@@ -156,11 +156,11 @@ class MockEchoEngineImpl : public Engine {

// attach usage and config
picojson::object usage;
usage["prompt_tokens"] = picojson::value(static_cast<int64_t>(num_input_tokens));
usage["prompt_tokens"] = picojson::value(static_cast<int64_t>(prompt_tokens));
usage["completion_tokens"] =
- picojson::value(static_cast<int64_t>(num_output_tokens * request->generation_cfg->n));
+ picojson::value(static_cast<int64_t>(completion_tokens * request->generation_cfg->n));
usage["total_tokens"] = picojson::value(
- static_cast<int64_t>(num_input_tokens + num_output_tokens * request->generation_cfg->n));
+ static_cast<int64_t>(prompt_tokens + completion_tokens * request->generation_cfg->n));
usage["extra"] = picojson::value(request->generation_cfg->AsJSON());
// NOTE: Invariant requirement
// always stream back final usage
@@ -504,9 +504,9 @@ class EngineImpl : public Engine {

// Get a request copy where all text inputs are tokenized.
request = Request::FromUntokenized(request, tokenizer_);
- ICHECK_NE(request->num_input_tokens, -1);
+ ICHECK_NE(request->prompt_tokens, -1);

- if (request->num_input_tokens >= engine_config_->max_single_sequence_length &&
+ if (request->prompt_tokens >= engine_config_->max_single_sequence_length &&
request_stream_callback_ != nullptr) {
this->StreamBackError(request, "length");
return;
2 changes: 1 addition & 1 deletion cpp/serve/engine_actions/action_commons.cc
@@ -150,7 +150,7 @@ void ActionStepPostProcess(Array<Request> requests, EngineState estate, Array<Mo
for (const RequestStateEntry& rsentry : rstate->entries) {
for (Data data : rsentry->mstates[0]->prefilled_inputs) {
// note that we are counting prefill tokens across all branches
- rstate->metrics.num_prefill_tokens += data->GetLength();
+ rstate->metrics.prefill_tokens += data->GetLength();
}
}
}
2 changes: 1 addition & 1 deletion cpp/serve/engine_actions/batch_decode.cc
@@ -133,7 +133,7 @@ class BatchDecodeActionObj : public EngineActionObj {
mstates[i]->CommitToken(sample_results[i]);
// Metrics update
// live update the output metrics
- running_rsentries[i]->rstate->metrics.num_output_tokens += 1;
+ running_rsentries[i]->rstate->metrics.completion_tokens += 1;
}

auto tend = std::chrono::high_resolution_clock::now();
2 changes: 1 addition & 1 deletion cpp/serve/engine_actions/batch_verify.cc
@@ -163,7 +163,7 @@ class BatchVerifyActionObj : public EngineActionObj {
}
// Metrics update
// live update the output metrics
- rsentries[i]->rstate->metrics.num_output_tokens += accept_length;
+ rsentries[i]->rstate->metrics.completion_tokens += accept_length;
estate->metrics.spec_decode.Update(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
accept_length);
int rollback_length =
2 changes: 1 addition & 1 deletion cpp/serve/engine_actions/eagle_batch_verify.cc
@@ -169,7 +169,7 @@ class EagleBatchVerifyActionObj : public EngineActionObj {
}
// Metrics update
// live update the output metrics
- rsentries[i]->rstate->metrics.num_output_tokens += accept_length;
+ rsentries[i]->rstate->metrics.completion_tokens += accept_length;
estate->metrics.spec_decode.Update(cum_verify_lengths[i + 1] - cum_verify_lengths[i],
accept_length);
// - Minus one because the last draft token has no kv cache entry
34 changes: 18 additions & 16 deletions cpp/serve/metrics.cc
@@ -72,25 +72,27 @@ picojson::object SpecDecodeMetrics::AsJSON() const {

picojson::object RequestMetrics::AsJSON() const {
picojson::object metrics;
metrics["num_input_tokens"] = picojson::value(num_input_tokens);
metrics["num_prefill_tokens"] = picojson::value(num_prefill_tokens);
metrics["num_output_tokens"] = picojson::value(num_output_tokens);
metrics["prompt_tokens"] = picojson::value(prompt_tokens);
metrics["prefill_tokens"] = picojson::value(prefill_tokens);
metrics["completion_tokens"] = picojson::value(completion_tokens);

- if (num_output_tokens != 0) {
- metrics["decode_tokens_per_s"] = picojson::value(num_output_tokens / this->GetDecodeTime());
+ if (completion_tokens != 0) {
+ metrics["decode_tokens_per_s"] = picojson::value(completion_tokens / this->GetDecodeTime());
}
- if (num_prefill_tokens != 0) {
- metrics["prefill_tokens_per_s"] = picojson::value(num_prefill_tokens / this->GetPrefillTime());
+ if (prefill_tokens != 0) {
+ metrics["prefill_tokens_per_s"] = picojson::value(prefill_tokens / this->GetPrefillTime());
}
metrics["end_to_end_latency_s"] = picojson::value(this->GetTotalTime());
metrics["ttft_s"] = picojson::value(this->GetTTFT());
metrics["inter_token_latency_s"] = picojson::value(this->GetInterTokenLatency());
return metrics;
}

std::string RequestMetrics::AsUsageJSONStr(bool include_extra) const {
picojson::object usage;
usage["completion_tokens"] = picojson::value(num_output_tokens);
usage["prompt_tokens"] = picojson::value(num_input_tokens);
usage["total_tokens"] = picojson::value(num_input_tokens + num_output_tokens);
usage["completion_tokens"] = picojson::value(completion_tokens);
usage["prompt_tokens"] = picojson::value(prompt_tokens);
usage["total_tokens"] = picojson::value(prompt_tokens + completion_tokens);
if (include_extra) {
usage["extra"] = picojson::value(this->AsJSON());
}
@@ -101,9 +103,9 @@ picojson::object EngineMetrics::AsJSON() const {
picojson::object metrics;
metrics["engine_prefill_time_sum"] = picojson::value(engine_prefill_time_sum);
metrics["engine_decode_time_sum"] = picojson::value(engine_decode_time_sum);
metrics["num_input_tokens_sum"] = picojson::value(num_input_tokens_sum);
metrics["num_prefill_tokens_sum"] = picojson::value(num_prefill_tokens_sum);
metrics["num_output_tokens_sum"] = picojson::value(num_output_tokens_sum);
metrics["prompt_tokens_sum"] = picojson::value(prompt_tokens_sum);
metrics["prefill_tokens_sum"] = picojson::value(prefill_tokens_sum);
metrics["completion_tokens_sum"] = picojson::value(completion_tokens_sum);
metrics["last_finished_request"] = picojson::value(last_finished_request.AsJSON());
if (!spec_decode.IsEmpty()) {
metrics["spec_decode"] = picojson::value(spec_decode.AsJSON());
@@ -146,9 +148,9 @@ std::string EngineMetrics::AsUsageJSONStr() const {
void EngineMetrics::Reset() {
engine_prefill_time_sum = 0.0;
engine_decode_time_sum = 0.0;
- num_input_tokens_sum = 0;
- num_prefill_tokens_sum = 0;
- num_output_tokens_sum = 0;
+ prompt_tokens_sum = 0;
+ prefill_tokens_sum = 0;
+ completion_tokens_sum = 0;
last_finished_request.Reset();
spec_decode.Reset();
decode_time_by_batch_size.clear();
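
With these additions, the `extra` field of the per-request usage payload produced by `RequestMetrics::AsUsageJSONStr` carries the two new latency metrics alongside the renamed token counters. A rough sketch of the resulting shape, with invented example values rather than output captured from the server:

```python
# Approximate shape of the usage JSON emitted with include_extra = true.
# All numbers below are invented for illustration.
example_usage = {
    "prompt_tokens": 42,
    "completion_tokens": 128,
    "total_tokens": 170,
    "extra": {
        "prompt_tokens": 42,
        "prefill_tokens": 42,
        "completion_tokens": 128,
        "prefill_tokens_per_s": 840.0,
        "decode_tokens_per_s": 94.8,
        "end_to_end_latency_s": 1.40,
        "ttft_s": 0.05,                   # new: time to first token
        "inter_token_latency_s": 0.0109,  # new: end-to-end latency per completion token
    },
}
```
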
34 changes: 21 additions & 13 deletions cpp/serve/metrics.h
@@ -97,11 +97,11 @@ struct SpecDecodeMetrics {
*/
struct RequestMetrics {
/*! \brief Request input tokens. */
- int64_t num_input_tokens = 0;
+ int64_t prompt_tokens = 0;
/*! \brief Total number of output tokens. */
- int64_t num_output_tokens = 0;
+ int64_t completion_tokens = 0;
/*! \brief Total number of tokens that needs to be prefilled */
- int64_t num_prefill_tokens = 0;
+ int64_t prefill_tokens = 0;

/*! \brief The time of adding the request to engine. */
std::chrono::high_resolution_clock::time_point add_time_point;
@@ -111,7 +111,7 @@ struct RequestMetrics {
std::chrono::high_resolution_clock::time_point finish_time_point;

/*! \brief check whether the request metrics is a completed request */
- bool IsComplete() const { return num_input_tokens != 0 && num_output_tokens != 0; }
+ bool IsComplete() const { return prompt_tokens != 0 && completion_tokens != 0; }

/*! \return the prefill time in seconds */
double GetPrefillTime() const {
@@ -123,16 +123,24 @@ struct RequestMetrics {
return static_cast<double>((finish_time_point - prefill_end_time_point).count()) / 1e9;
}

+ /*! \return the time to first token (TTFT) in seconds */
+ double GetTTFT() const {
+ return static_cast<double>((prefill_end_time_point - add_time_point).count()) / 1e9;
+ }

/*! \return the total end-to-end time in seconds */
double GetTotalTime() const {
return static_cast<double>((finish_time_point - add_time_point).count()) / 1e9;
}

+ /*! \return the inter token latency (ITL) in seconds */
+ double GetInterTokenLatency() const { return GetTotalTime() / completion_tokens; }

/*! \brief Reset the metric. */
void Reset() {
- this->num_input_tokens = 0;
- this->num_prefill_tokens = 0;
- this->num_output_tokens = 0;
+ this->prompt_tokens = 0;
+ this->prefill_tokens = 0;
+ this->completion_tokens = 0;
}
/*!
* \brief Return the request metrics in JSON.
@@ -155,11 +163,11 @@ struct EngineMetrics {
/*! \brief The total engine time on decode/draft/verify, including warmup */
double engine_decode_time_sum = 0;
/*! \brief The total number of request input tokens. */
- int64_t num_input_tokens_sum = 0;
+ int64_t prompt_tokens_sum = 0;
/*! \brief The total number of processed tokens (excluding the prefix-cached length) in prefill */
- int64_t num_prefill_tokens_sum = 0;
+ int64_t prefill_tokens_sum = 0;
/*! \brief The total number of request output tokens */
- int64_t num_output_tokens_sum = 0;
+ int64_t completion_tokens_sum = 0;
/*! \brief metrics from last finished request. */
RequestMetrics last_finished_request;
/*! \brief speculative decoding metrics */
@@ -213,9 +221,9 @@ struct EngineMetrics {
* by including the information from the finished request.
*/
void RequestFinishUpdate(const RequestMetrics& request_metrics) {
- num_input_tokens_sum += request_metrics.num_input_tokens;
- num_prefill_tokens_sum += request_metrics.num_prefill_tokens;
- num_output_tokens_sum += request_metrics.num_output_tokens;
+ prompt_tokens_sum += request_metrics.prompt_tokens;
+ prefill_tokens_sum += request_metrics.prefill_tokens;
+ completion_tokens_sum += request_metrics.completion_tokens;
last_finished_request = request_metrics;
}
/*!
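
For reference, the two new getters are simple arithmetic over timestamps the struct already records; the inter-token latency divides the full end-to-end time (TTFT included) by the number of completion tokens. A minimal Python sketch of the same computation, with invented timestamps, not part of the commit:

```python
# Sketch of the arithmetic behind GetTTFT and GetInterTokenLatency.
# Timestamps are seconds since request admission; values are invented.
def ttft_s(add_time: float, prefill_end_time: float) -> float:
    """Time to first token: request admission until prefill finishes."""
    return prefill_end_time - add_time


def inter_token_latency_s(add_time: float, finish_time: float, completion_tokens: int) -> float:
    """End-to-end latency averaged over all completion tokens."""
    return (finish_time - add_time) / completion_tokens


print(ttft_s(0.0, 0.05))                      # 0.05
print(inter_token_latency_s(0.0, 1.40, 128))  # ~0.0109
```
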
12 changes: 6 additions & 6 deletions cpp/serve/request.cc
@@ -24,22 +24,22 @@ Request::Request(String id, Array<Data> inputs, GenerationConfig generation_cfg)
}
// Compute the total input length, or fall back to "-1" which means
// unknown due to the existence of untokenized data.
- int num_input_tokens = 0;
+ int prompt_tokens = 0;
for (Data input : inputs) {
if (const auto* token_data = input.as<TokenDataNode>()) {
- num_input_tokens += token_data->token_ids.size();
+ prompt_tokens += token_data->token_ids.size();
} else if (const auto* image_data = input.as<ImageDataNode>()) {
- num_input_tokens += image_data->GetLength();
+ prompt_tokens += image_data->GetLength();
} else {
- num_input_tokens = -1;
+ prompt_tokens = -1;
break;
}
}

ObjectPtr<RequestNode> n = make_object<RequestNode>();
n->id = std::move(id);
n->inputs = std::move(inputs);
- n->num_input_tokens = num_input_tokens;
+ n->prompt_tokens = prompt_tokens;
n->generation_cfg = std::move(generation_cfg);
data_ = std::move(n);
}
@@ -61,7 +61,7 @@ Request Request::FromUntokenized(const Request& request, const Tokenizer& tokeni

// If there is no untokenized input, we don't need to create a new request.
if (!has_untokenized_input) {
- ICHECK_NE(request->num_input_tokens, -1);
+ ICHECK_NE(request->prompt_tokens, -1);
return request;
} else {
return Request(request->id, std::move(inputs), request->generation_cfg);
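
The renamed `prompt_tokens` keeps its fallback semantics: it sums the lengths of token and image inputs and falls back to -1 when any input is still untokenized text. A small Python sketch of that rule, using hypothetical stand-ins for the data node types:

```python
# Hypothetical stand-ins for TokenDataNode / ImageDataNode / TextDataNode,
# illustrating the prompt_tokens fallback used by the Request constructor.
from typing import List, Union

TokenData = List[int]  # tokenized input
ImageData = dict       # image input with a known "length"
TextData = str         # raw text that has not been tokenized yet


def count_prompt_tokens(inputs: List[Union[TokenData, ImageData, TextData]]) -> int:
    total = 0
    for inp in inputs:
        if isinstance(inp, list):
            total += len(inp)
        elif isinstance(inp, dict):
            total += inp["length"]
        else:
            return -1  # length unknown until tokenization
    return total


print(count_prompt_tokens([[1, 2, 3], {"length": 576}]))  # 579
print(count_prompt_tokens([[1, 2, 3], "raw text"]))       # -1
```
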
2 changes: 1 addition & 1 deletion cpp/serve/request.h
@@ -47,7 +47,7 @@ class RequestNode : public Object {
* "-1" means the input length is unknown due to the existence
* of untokenized text data.
*/
- int num_input_tokens = -1;
+ int prompt_tokens = -1;
/*!
* \brief The sampling configuration which may contain temperature,
* top_p, repetition_penalty, max_gen_len, etc.
4 changes: 2 additions & 2 deletions cpp/serve/request_state.cc
@@ -186,7 +186,7 @@ DeltaRequestReturn RequestStateEntryNode::GetReturnTokenIds(const Tokenizer& tok
return {return_token_ids, logprob_json_strs, String("length")};
}
// Case 6. Total length of the request reaches the maximum single sequence length ==> Finished
- if (request->num_input_tokens + num_committed_tokens >= max_single_sequence_length) {
+ if (request->prompt_tokens + num_committed_tokens >= max_single_sequence_length) {
std::vector<int32_t> remaining = stop_str_handler->Finish();
return_token_ids.insert(return_token_ids.end(), remaining.begin(), remaining.end());
return {return_token_ids, logprob_json_strs, String("length")};
@@ -203,7 +203,7 @@ RequestState::RequestState(std::vector<RequestStateEntry> entries,
ICHECK(!entries.empty());
ObjectPtr<RequestStateNode> n = make_object<RequestStateNode>();
n->entries = std::move(entries);
- n->metrics.num_input_tokens = n->entries[0]->request->num_input_tokens;
+ n->metrics.prompt_tokens = n->entries[0]->request->prompt_tokens;
n->metrics.add_time_point = add_time_point;
data_ = std::move(n);
}
17 changes: 8 additions & 9 deletions python/mlc_llm/bench/metrics.py
@@ -17,9 +17,9 @@ class Metrics(BaseModel):

prompt_tokens: int
completion_tokens: int
- end_to_end_latency: float
- inter_token_latency: float
- decode_token_latency: float
+ end_to_end_latency_s: float
+ inter_token_latency_s: float
+ decode_tokens_per_s: float
ttft: Optional[float] = None


@@ -85,14 +85,13 @@ def extract_metrics_from_request_records(
prompt_tokens = self.count_tokens(metric.input)
completion_tokens = self.count_tokens(metric.output)
assert prompt_tokens > 0 and completion_tokens >= 0, "Invalid prompt tokens"
- end_to_end_latency = metric.end_to_end_latency
- if metric.ttft is None:
-     ttft = 0
+ end_to_end_latency_s = metric.end_to_end_latency_s
+ ttft = metric.ttft if metric.ttft is not None else 0
refined_metric = Metrics(
- inter_token_latency=end_to_end_latency / completion_tokens,
- decode_token_latency=(end_to_end_latency - ttft) / completion_tokens,
+ inter_token_latency_s=end_to_end_latency_s / completion_tokens,
+ decode_tokens_per_s=completion_tokens / (end_to_end_latency_s - ttft),
ttft=metric.ttft,
- end_to_end_latency=end_to_end_latency,
+ end_to_end_latency_s=end_to_end_latency_s,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
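
The renamed benchmark fields make the client-side derivation explicit: inter-token latency is end-to-end latency per completion token, and decode throughput is completion tokens over the time after the first token. A worked example with invented measurements:

```python
# Worked example of the derivation above; all measurements are invented.
end_to_end_latency_s = 2.0  # wall-clock time for the whole request
ttft = 0.25                 # time to first token (streaming)
completion_tokens = 100

inter_token_latency_s = end_to_end_latency_s / completion_tokens         # 0.02 s/token
decode_tokens_per_s = completion_tokens / (end_to_end_latency_s - ttft)  # ~57.1 tokens/s
print(inter_token_latency_s, decode_tokens_per_s)
```
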
12 changes: 9 additions & 3 deletions python/mlc_llm/bench/request.py
@@ -1,4 +1,5 @@
"""MLC LLM Bench Request"""
+ import json
import time
from typing import Any, Dict, List, Optional

@@ -21,7 +22,7 @@ class RequestRecords(BaseModel):

input: str
output: str
- end_to_end_latency: float
+ end_to_end_latency_s: float
ttft: Optional[float] = None


@@ -109,11 +110,16 @@ async def __call__(self, params: Dict[str, Any] = None) -> None:
generated_text = ""
ttft = None
start_time = time.monotonic()
# chat_params["stream_options"] = {"include_usage": True}
response = await self.client.chat.completions.create(**chat_params)

if chat_params["stream"]:
async for chunk in response:
- if chunk.choices[0].delta.content is not None:
+ if chunk.usage:
+     logger.info(
+         "Server Metrics:\n%s", json.dumps(chunk.usage.extra, indent=4, default=str)
+     )
+ elif chunk.choices[0].delta.content is not None:
if not ttft:
ttft = time.monotonic() - start_time # type: ignore
generated_text += chunk.choices[0].delta.content
@@ -124,7 +130,7 @@ async def __call__(self, params: Dict[str, Any] = None) -> None:
req_rec = RequestRecords(
input=prompt,
output=generated_text,
- end_to_end_latency=total_request_time,
+ end_to_end_latency_s=total_request_time,
ttft=ttft,
)
self.request_records.append(req_rec)
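
The new `chunk.usage` branch is what surfaces the server-side metrics to the benchmark client: when a streaming request opts into usage reporting (the commented-out `stream_options` line above), the final chunk carries the usage payload, and its `extra` field holds the `RequestMetrics` JSON shown earlier. A minimal standalone sketch of that pattern, assuming an OpenAI-compatible MLC serve endpoint; the base URL, API key, and model name below are placeholders:

```python
# Minimal sketch (not part of the commit) of reading server metrics from a
# streaming chat completion. Endpoint, key, and model id are placeholders.
import asyncio
import json

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI(base_url="http://127.0.0.1:8000/v1", api_key="none")
    response = await client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "Hello"}],
        stream=True,
        stream_options={"include_usage": True},  # ask the server for a final usage chunk
    )
    async for chunk in response:
        if chunk.usage:  # final chunk: per-request server metrics
            print(json.dumps(chunk.usage.extra, indent=2, default=str))
        elif chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)


asyncio.run(main())
```
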