From 077a6616682002cf44e004bc03be87f6def9424f Mon Sep 17 00:00:00 2001
From: Anmol Agarwal
Date: Wed, 17 Jul 2024 15:31:03 -0400
Subject: [PATCH] Fixing requests and concurrency (prefill_profiler)

---
 README.md                           |  6 ------
 docs/tutorials/prefill_profiler.rst |  6 ------
 metron/prefill_profiler.py          | 12 +++++++++---
 3 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index b20f502..c532d72 100644
--- a/README.md
+++ b/README.md
@@ -115,10 +115,7 @@ Launch any open source system and setup API keys and URL as shown for [vLLM](#ru
 ```bash
 python -m metron.prefill_profiler \
 --model "meta-llama/Meta-Llama-3-8B-Instruct" \
---max-num-completed-requests 1 \
 --timeout 600 \
---num-ray-clients 1 \
---num-concurrent-requests-per-client 1 \
 --fixed-request-generator-decode-tokens 16 \
 --output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
 --should-use-given-dir true
@@ -128,10 +125,7 @@ To modify range of prompt tokens for which prefill times get profiled, use the f
 ```bash
 python -m metron.prefill_profiler \
 --model "meta-llama/Meta-Llama-3-8B-Instruct" \
---max-num-completed-requests 1 \
 --timeout 600 \
---num-ray-clients 1 \
---num-concurrent-requests-per-client 1 \
 --output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
 --should-use-given-dir true \
 --prefill-lengths 256 512 1024 2048 4096 8192 16384 32768 65536
diff --git a/docs/tutorials/prefill_profiler.rst b/docs/tutorials/prefill_profiler.rst
index 93cffc0..c6d51fc 100644
--- a/docs/tutorials/prefill_profiler.rst
+++ b/docs/tutorials/prefill_profiler.rst
@@ -17,10 +17,7 @@ And, then run the following command:
 
     python -m metron.prefill_profiler \
         --model "meta-llama/Meta-Llama-3-8B-Instruct" \
-        --max-num-completed-requests 1 \
         --timeout 600 \
-        --num-ray-clients 1 \
-        --num-concurrent-requests-per-client 1 \
         --fixed-request-generator-decode-tokens 16 \
         --output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b"
 
@@ -39,10 +36,7 @@ To profile a custom range of prompt lengths, use the flag ``--prefill-lengths``
 
    python -m metron.prefill_profiler \
        --model "meta-llama/Meta-Llama-3-8B-Instruct" \
-        --max-num-completed-requests 1 \
        --timeout 600 \
-        --num-ray-clients 1 \
-        --num-concurrent-requests-per-client 1 \
        --fixed-request-generator-decode-tokens 16 \
        --output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
        --prefill-lengths 256 512 1024 2048 4096 8192 16384 32768 65536
diff --git a/metron/prefill_profiler.py b/metron/prefill_profiler.py
index ad43743..a79c14b 100644
--- a/metron/prefill_profiler.py
+++ b/metron/prefill_profiler.py
@@ -31,6 +31,12 @@ PREFILL_POLYNOMIAL_DEGREE = 2
 
 # RMSE threshold for the prefill time predictor
 PREFILL_RMSE_THRESHOLD = 0.05
+# Number of Ray clients to use for prefill profiling
+PREFILL_NUM_RAY_CLIENTS = 1
+# Number of concurrent requests per client for prefill profiling
+PREFILL_NUM_CONCURRENT_REQUESTS_PER_CLIENT = 1
+# Number of completed requests to wait for before stopping the prefill profiling for a prompt length
+PREFILL_MAX_NUM_COMPLETED_REQUESTS = 1
 
 
 class PrefillProfiler:
@@ -77,9 +83,9 @@ def run(self):
             model=self.args.model,
             output_dir=run_dir,
             additional_sampling_params=self.args.additional_sampling_params,
-            num_ray_clients=self.args.num_ray_clients,
-            num_concurrent_requests_per_client=self.args.num_concurrent_requests_per_client,
-            max_num_completed_requests=self.args.max_num_completed_requests,
+            num_ray_clients=PREFILL_NUM_RAY_CLIENTS,
+            num_concurrent_requests_per_client=PREFILL_NUM_CONCURRENT_REQUESTS_PER_CLIENT,
+            max_num_completed_requests=PREFILL_MAX_NUM_COMPLETED_REQUESTS,
             timeout=self.args.timeout,
             llm_api=self.args.llm_api,
             request_generator_config=request_generator_config,
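As a quick sanity check of the change, the README invocation after this patch reduces to the command below. This is a sketch taken from the updated README hunk above; it assumes a vLLM endpoint is already running and its API key/URL are exported as the README describes.

```bash
# Post-patch invocation: one Ray client, one concurrent request, and one
# completed request per prompt length are now fixed internally by the new
# PREFILL_* constants, so the corresponding flags are no longer passed here.
python -m metron.prefill_profiler \
    --model "meta-llama/Meta-Llama-3-8B-Instruct" \
    --timeout 600 \
    --fixed-request-generator-decode-tokens 16 \
    --output-dir "prefill_experiments/prefill_profiler_vllm_llama-3-8b" \
    --should-use-given-dir true
```

Note that the diff only pins the values at the call site in PrefillProfiler.run(); whether the CLI still accepts (and silently ignores) the old flags depends on the argument parser, which this patch leaves untouched.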