From 8c46f6bf9c19fbd2dcd9c715e84625479415365e Mon Sep 17 00:00:00 2001
From: David Fan <30608893+jiafatom@users.noreply.github.com>
Date: Fri, 13 Dec 2024 11:27:20 -0800
Subject: [PATCH 1/2] Add cuda support when loading local onnx model (#249)

---
 setup.py                                 | 18 ++++++++++++++++--
 src/turnkeyml/llm/tools/ort_genai/oga.py |  5 +++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 1d233fa..dff2065 100644
--- a/setup.py
+++ b/setup.py
@@ -33,8 +33,6 @@
         "invoke>=2.0.0",
         "onnx>=1.11.0",
         "onnxmltools==1.10.0",
-        "onnxruntime >=1.10.1;platform_system=='Linux'",
-        "onnxruntime-directml>=1.19.0;platform_system=='Windows'",
         "torch>=1.12.1",
         "pyyaml>=5.4",
         "typeguard>=2.3.13",
@@ -46,6 +44,10 @@
         "fasteners",
         "GitPython>=3.1.40",
         "psutil",
+        # Conditional dependencies for ONNXRuntime backends
+        "onnxruntime >=1.10.1;platform_system=='Linux' and extra != 'llm-oga-cuda'",
+        "onnxruntime-directml >=1.19.0;platform_system=='Windows' and extra != 'llm-oga-cuda'",
+        "onnxruntime-gpu >=1.19.1;extra == 'llm-oga-cuda'",
     ],
     extras_require={
         "llm": [
@@ -71,6 +73,18 @@
             "fastapi",
             "uvicorn[standard]",
         ],
+        "llm-oga-cuda": [
+            "onnxruntime-genai-cuda==0.4.0",
+            "tqdm",
+            "torch>=2.0.0,<2.4",
+            "transformers<4.45.0",
+            "accelerate",
+            "py-cpuinfo",
+            "sentencepiece",
+            "datasets",
+            "fastapi",
+            "uvicorn[standard]",
+        ],
         "llm-oga-npu": [
             "transformers",
             "torch",
diff --git a/src/turnkeyml/llm/tools/ort_genai/oga.py b/src/turnkeyml/llm/tools/ort_genai/oga.py
index de5a14a..acd3c49 100644
--- a/src/turnkeyml/llm/tools/ort_genai/oga.py
+++ b/src/turnkeyml/llm/tools/ort_genai/oga.py
@@ -35,7 +35,7 @@
 oga_model_builder_cache_path = "model_builder"
 
 # Mapping from processor to executiion provider, used in pathnames and by model_builder
-execution_providers = {"cpu": "cpu", "npu": "npu", "igpu": "dml"}
+execution_providers = {"cpu": "cpu", "npu": "npu", "igpu": "dml", "cuda": "cuda"}
 
 
 class OrtGenaiTokenizer(TokenizerAdapter):
@@ -248,7 +248,7 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser.add_argument(
             "-d",
             "--device",
-            choices=["igpu", "npu", "cpu"],
+            choices=["igpu", "npu", "cpu", "cuda"],
             default="igpu",
             help="Which device to load the model on to (default: igpu)",
         )
@@ -312,6 +312,7 @@ def run(
             "cpu": {"int4": "*/*", "fp32": "*/*"},
             "igpu": {"int4": "*/*", "fp16": "*/*"},
             "npu": {"int4": "amd/**-onnx-ryzen-strix"},
+            "cuda": {"int4": "*/*", "fp16": "*/*"},
         }
         hf_supported = (
             device in hf_supported_models

From 9dea250699ad19f11dcee725141ac081a9fcb980 Mon Sep 17 00:00:00 2001
From: David Fan <30608893+jiafatom@users.noreply.github.com>
Date: Fri, 13 Dec 2024 14:03:22 -0800
Subject: [PATCH 2/2] Add prefill tps in oga-bench (#250)

* Add prefill tps in oga-bench

Signed-off-by: David Fan

* link fix

---------

Signed-off-by: David Fan
Co-authored-by: ramkrishna2910
---
 src/turnkeyml/llm/cache.py                     |  3 ++-
 src/turnkeyml/llm/tools/huggingface_bench.py   | 11 ++++++++---
 src/turnkeyml/llm/tools/ort_genai/oga_bench.py | 13 ++++++++++---
 test/llm_api.py                                |  2 +-
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/turnkeyml/llm/cache.py b/src/turnkeyml/llm/cache.py
index 6bf90bc..996908e 100644
--- a/src/turnkeyml/llm/cache.py
+++ b/src/turnkeyml/llm/cache.py
@@ -21,9 +21,10 @@ class Keys:
     PER_ITERATION_LATENCY = "per_iteration_latency"
     MEAN_LATENCY = "mean_latency"
     STD_DEV_LATENCY = "std_dev_latency"
-    MEAN_TOKENS_PER_SECOND = "mean_tokens_per_second"
+    TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
diff --git a/src/turnkeyml/llm/tools/huggingface_bench.py b/src/turnkeyml/llm/tools/huggingface_bench.py
index f3a65fc..4cee854 100644
--- a/src/turnkeyml/llm/tools/huggingface_bench.py
+++ b/src/turnkeyml/llm/tools/huggingface_bench.py
@@ -110,7 +110,10 @@ class HuggingfaceBench(Tool):
     def __init__(self):
         super().__init__(monitor_message="Benchmarking Huggingface LLM")
 
-        self.status_stats = [Keys.SECONDS_TO_FIRST_TOKEN, Keys.MEAN_TOKENS_PER_SECOND]
+        self.status_stats = [
+            Keys.SECONDS_TO_FIRST_TOKEN,
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+        ]
 
     @staticmethod
     def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
@@ -283,11 +286,13 @@ def run(
             [token_len for _, token_len in decode_per_iteration_result]
         )
         # Subtract 1 so that we don't count the prefill token
-        mean_tokens_per_second = (mean_token_len - 1) / mean_decode_latency
+        token_generation_tokens_per_second = (mean_token_len - 1) / mean_decode_latency
 
         # Save performance data to stats
         state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token)
-        state.save_stat(Keys.MEAN_TOKENS_PER_SECOND, mean_tokens_per_second)
+        state.save_stat(
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second
+        )
         state.save_stat(Keys.PROMPT_TOKENS, input_ids.shape[1])
 
         return state
diff --git a/src/turnkeyml/llm/tools/ort_genai/oga_bench.py b/src/turnkeyml/llm/tools/ort_genai/oga_bench.py
index c704803..fed7f8c 100644
--- a/src/turnkeyml/llm/tools/ort_genai/oga_bench.py
+++ b/src/turnkeyml/llm/tools/ort_genai/oga_bench.py
@@ -32,7 +32,8 @@ def __init__(self):
 
         self.status_stats = [
             Keys.SECONDS_TO_FIRST_TOKEN,
-            Keys.MEAN_TOKENS_PER_SECOND,
+            Keys.PREFILL_TOKENS_PER_SECOND,
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             Keys.PROMPT_TOKENS,
         ]
 
@@ -144,10 +145,16 @@ def run(
             per_iteration_tokens_per_second.append(model.tokens_per_second)
 
         mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
-        mean_tokens_per_second = statistics.mean(per_iteration_tokens_per_second)
+        prefill_tokens_per_second = input_ids_len / mean_time_to_first_token
+        token_generation_tokens_per_second = statistics.mean(
+            per_iteration_tokens_per_second
+        )
 
         state.save_stat(Keys.SECONDS_TO_FIRST_TOKEN, mean_time_to_first_token)
-        state.save_stat(Keys.MEAN_TOKENS_PER_SECOND, mean_tokens_per_second)
+        state.save_stat(Keys.PREFILL_TOKENS_PER_SECOND, prefill_tokens_per_second)
+        state.save_stat(
+            Keys.TOKEN_GENERATION_TOKENS_PER_SECOND, token_generation_tokens_per_second
+        )
         state.save_stat(Keys.PROMPT_TOKENS, input_ids_len)
 
         return state
diff --git a/test/llm_api.py b/test/llm_api.py
index 59dcde1..fe7cdbe 100644
--- a/test/llm_api.py
+++ b/test/llm_api.py
@@ -78,7 +78,7 @@ def test_001_huggingface_bench(self):
 
         stats = fs.Stats(state.cache_dir, state.build_name).stats
 
-        assert stats[Keys.MEAN_TOKENS_PER_SECOND] > 0
+        assert stats[Keys.TOKEN_GENERATION_TOKENS_PER_SECOND] > 0
 
 
 if __name__ == "__main__":