Skip to content

Commit

Permalink
Add prefill tps in oga-bench (#250)
Browse files Browse the repository at this point in the history
* Add prefill tps in oga-bench

Signed-off-by: David Fan <jiafa@microsoft.com>

* link fix

---------

Signed-off-by: David Fan <jiafa@microsoft.com>
Co-authored-by: ramkrishna2910 <ramkrishna2910@gmail.com>
  • Loading branch information
jiafatom and ramkrishna2910 authored Dec 13, 2024
1 parent 8c46f6b commit 9dea250
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 8 deletions.
3 changes: 2 additions & 1 deletion src/turnkeyml/llm/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ class Keys:
     PER_ITERATION_LATENCY = "per_iteration_latency"
     MEAN_LATENCY = "mean_latency"
     STD_DEV_LATENCY = "std_dev_latency"
-    MEAN_TOKENS_PER_SECOND = "mean_tokens_per_second"
+    TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
Expand Down
11 changes: 8 additions & 3 deletions src/turnkeyml/llm/tools/huggingface_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ class HuggingfaceBench(Tool):
     def __init__(self):
         super().__init__(monitor_message="Benchmarking Huggingface LLM")

-        self.status_stats = [Keys.SECONDS_TO_FIRST_TOKEN, Keys.MEAN_TOKENS_PER_SECOND]
+        self.status_stats = [
+            Keys.SECONDS_TO_FIRST_TOKEN,
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+        ]

     @staticmethod
     def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
Expand Down Expand Up @@ -283,11 +286,13 @@ def run(
             [token_len for _, token_len in decode_per_iteration_result]
         )
         # Subtract 1 so that we don't count the prefill token
-        mean_tokens_per_second = (mean_token_len - 1) / mean_decode_latency
+        token_generation_tokens_per_second = (mean_token_len - 1) / mean_decode_latency

         # Save performance data to stats
         state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token)
-        state.save_stat(Keys.MEAN_TOKENS_PER_SECOND, mean_tokens_per_second)
+        state.save_stat(
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second
+        )
         state.save_stat(Keys.PROMPT_TOKENS, input_ids.shape[1])

         return state
13 changes: 10 additions & 3 deletions src/turnkeyml/llm/tools/ort_genai/oga_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def __init__(self):

         self.status_stats = [
             Keys.SECONDS_TO_FIRST_TOKEN,
-            Keys.MEAN_TOKENS_PER_SECOND,
+            Keys.PREFILL_TOKENS_PER_SECOND,
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             Keys.PROMPT_TOKENS,
         ]

Expand Down Expand Up @@ -144,10 +145,16 @@ def run(
             per_iteration_tokens_per_second.append(model.tokens_per_second)

         mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
-        mean_tokens_per_second = statistics.mean(per_iteration_tokens_per_second)
+        prefill_tokens_per_second = input_ids_len / mean_time_to_first_token
+        token_generation_tokens_per_second = statistics.mean(
+            per_iteration_tokens_per_second
+        )

         state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token)
-        state.save_stat(Keys.MEAN_TOKENS_PER_SECOND, mean_tokens_per_second)
+        state.save_stat(Keys.PREFILL_TOKENS_PER_SECOND, prefill_tokens_per_second)
+        state.save_stat(
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second
+        )
         state.save_stat(Keys.PROMPT_TOKENS, input_ids_len)

         return state
2 changes: 1 addition & 1 deletion test/llm_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_001_huggingface_bench(self):

         stats = fs.Stats(state.cache_dir, state.build_name).stats

-        assert stats[Keys.MEAN_TOKENS_PER_SECOND] > 0
+        assert stats[Keys.TOKEN_GENERATION_TOKENS_PER_SECOND] > 0


 if __name__ == "__main__":
Expand Down

0 comments on commit 9dea250

Please sign in to comment.