diff --git a/src/turnkeyml/llm/cache.py b/src/turnkeyml/llm/cache.py index 6bf90bc..996908e 100644 --- a/src/turnkeyml/llm/cache.py +++ b/src/turnkeyml/llm/cache.py @@ -21,9 +21,10 @@ class Keys: PER_ITERATION_LATENCY = "per_iteration_latency" MEAN_LATENCY = "mean_latency" STD_DEV_LATENCY = "std_dev_latency" - MEAN_TOKENS_PER_SECOND = "mean_tokens_per_second" + TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second" STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second" SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token" + PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second" STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token" CHECKPOINT = "checkpoint" DTYPE = "dtype" diff --git a/src/turnkeyml/llm/tools/huggingface_bench.py b/src/turnkeyml/llm/tools/huggingface_bench.py index f3a65fc..4cee854 100644 --- a/src/turnkeyml/llm/tools/huggingface_bench.py +++ b/src/turnkeyml/llm/tools/huggingface_bench.py @@ -110,7 +110,10 @@ class HuggingfaceBench(Tool): def __init__(self): super().__init__(monitor_message="Benchmarking Huggingface LLM") - self.status_stats = [Keys.SECONDS_TO_FIRST_TOKEN, Keys.MEAN_TOKENS_PER_SECOND] + self.status_stats = [ + Keys.SECONDS_TO_FIRST_TOKEN, + Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, + ] @staticmethod def parser(parser: argparse.ArgumentParser = None, add_help: bool = True): @@ -283,11 +286,13 @@ def run( [token_len for _, token_len in decode_per_iteration_result] ) # Subtract 1 so that we don't count the prefill token - mean_tokens_per_second = (mean_token_len - 1) / mean_decode_latency + token_generation_tokens_per_second = (mean_token_len - 1) / mean_decode_latency # Save performance data to stats state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token) - state.save_stat(Keys.MEAN_TOKENS_PER_SECOND, mean_tokens_per_second) + state.save_stat( + Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second + ) state.save_stat(Keys.PROMPT_TOKENS, input_ids.shape[1]) return state diff --git a/src/turnkeyml/llm/tools/ort_genai/oga_bench.py b/src/turnkeyml/llm/tools/ort_genai/oga_bench.py index c704803..fed7f8c 100644 --- a/src/turnkeyml/llm/tools/ort_genai/oga_bench.py +++ b/src/turnkeyml/llm/tools/ort_genai/oga_bench.py @@ -32,7 +32,8 @@ def __init__(self): self.status_stats = [ Keys.SECONDS_TO_FIRST_TOKEN, - Keys.MEAN_TOKENS_PER_SECOND, + Keys.PREFILL_TOKENS_PER_SECOND, + Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, Keys.PROMPT_TOKENS, ] @@ -144,10 +145,16 @@ def run( per_iteration_tokens_per_second.append(model.tokens_per_second) mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token) - mean_tokens_per_second = statistics.mean(per_iteration_tokens_per_second) + prefill_tokens_per_second = input_ids_len / mean_time_to_first_token + token_generation_tokens_per_second = statistics.mean( + per_iteration_tokens_per_second + ) state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token) - state.save_stat(Keys.MEAN_TOKENS_PER_SECOND, mean_tokens_per_second) + state.save_stat(Keys.PREFILL_TOKENS_PER_SECOND, prefill_tokens_per_second) + state.save_stat( + Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second + ) state.save_stat(Keys.PROMPT_TOKENS, input_ids_len) return state diff --git a/test/llm_api.py b/test/llm_api.py index 59dcde1..fe7cdbe 100644 --- a/test/llm_api.py +++ b/test/llm_api.py @@ -78,7 +78,7 @@ def test_001_huggingface_bench(self): stats = fs.Stats(state.cache_dir, state.build_name).stats - assert stats[Keys.MEAN_TOKENS_PER_SECOND] > 0 + assert stats[Keys.TOKEN_GENERATION_TOKENS_PER_SECOND] > 0 if __name__ == "__main__":