Skip to content

Commit 8585c30

Browse files
docs: add concurrency choice to the perf.sh (#1497)
1 parent 73e0f8c commit 8585c30

File tree

2 files changed

+82
-13
lines changed

2 files changed

+82
-13
lines changed

benchmarks/llm/perf.sh

Lines changed: 65 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
18-
# Parse command line arguments
17+
# Default Values
1918
model="neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic"
2019
url="http://localhost:8000"
2120
mode="aggregated"
2221
artifacts_root_dir="artifacts_root"
2322
deployment_kind="dynamo"
23+
concurrency_list="1,2,4,8,16,32,64,128,256"
2424

2525
# Input Sequence Length (isl) 3000 and Output Sequence Length (osl) 150 are
2626
# selected for chat use case. Note that for other use cases, the results and
@@ -35,49 +35,77 @@ prefill_dp=0
3535
decode_tp=0
3636
decode_dp=0
3737

38+
# Print the usage summary and terminate the script successfully.
# Each "(default: ...)" field shows the value the corresponding variable holds
# at the moment this function is called.
# Globals (read): tp, dp, prefill_tp, prefill_dp, decode_tp, decode_dp, model,
#   isl, osl, url, concurrency_list, mode, artifacts_root_dir, deployment_kind
# Outputs: usage text on stdout
# Exits:   always, with status 0
print_help() {
  # Unquoted delimiter on purpose: $0 and the option variables must expand.
  cat <<EOF
Usage: $0 [OPTIONS]

Options:
 --tensor-parallelism, --tp <int> Tensor parallelism (default: $tp)
 --data-parallelism, --dp <int> Data parallelism (default: $dp)
 --prefill-tensor-parallelism, --prefill-tp <int> Prefill tensor parallelism (default: $prefill_tp)
 --prefill-data-parallelism, --prefill-dp <int> Prefill data parallelism (default: $prefill_dp)
 --decode-tensor-parallelism, --decode-tp <int> Decode tensor parallelism (default: $decode_tp)
 --decode-data-parallelism, --decode-dp <int> Decode data parallelism (default: $decode_dp)
 --model <model_id> Hugging Face model ID to benchmark (default: $model)
 --input-sequence-length, --isl <int> Input sequence length (default: $isl)
 --output-sequence-length, --osl <int> Output sequence length (default: $osl)
 --url <http://host:port> Target URL for inference requests (default: $url)
 --concurrency <list> Comma-separated concurrency levels (default: $concurrency_list)
 --mode <aggregated|disaggregated> Serving mode (default: $mode)
 --artifacts-root-dir <path> Root directory to store benchmark results (default: $artifacts_root_dir)
 --deployment-kind <type> Deployment tag used for pareto chart labels (default: $deployment_kind)
 --help Show this help message and exit

EOF
  exit 0
}
60+
61+
# Parse command line arguments
3862
# The defaults can be overridden by command line arguments.
3963
while [[ $# -gt 0 ]]; do
4064
case $1 in
41-
--tensor-parallelism)
65+
--tensor-parallelism|--tp)
4266
tp="$2"
4367
shift 2
4468
;;
45-
--data-parallelism)
69+
--data-parallelism|--dp)
4670
dp="$2"
4771
shift 2
4872
;;
49-
--prefill-tensor-parallelism)
73+
--prefill-tensor-parallelism|--prefill-tp)
5074
prefill_tp="$2"
5175
shift 2
5276
;;
53-
--prefill-data-parallelism)
77+
--prefill-data-parallelism|--prefill-dp)
5478
prefill_dp="$2"
5579
shift 2
5680
;;
57-
--decode-tensor-parallelism)
81+
--decode-tensor-parallelism|--decode-tp)
5882
decode_tp="$2"
5983
shift 2
6084
;;
61-
--decode-data-parallelism)
85+
--decode-data-parallelism|--decode-dp)
6286
decode_dp="$2"
6387
shift 2
6488
;;
65-
--model)
89+
--model)
6690
model="$2"
6791
shift 2
6892
;;
69-
--input-sequence-length)
93+
--input-sequence-length|--isl)
7094
isl="$2"
7195
shift 2
7296
;;
73-
--output-sequence-length)
97+
--output-sequence-length|--osl)
7498
osl="$2"
7599
shift 2
76100
;;
77101
--url)
78102
url="$2"
79103
shift 2
80104
;;
105+
--concurrency)
106+
concurrency_list="$2"
107+
shift 2
108+
;;
81109
--mode)
82110
mode="$2"
83111
shift 2
@@ -90,13 +118,30 @@ while [[ $# -gt 0 ]]; do
90118
deployment_kind="$2"
91119
shift 2
92120
;;
121+
--help)
122+
print_help
123+
;;
93124
*)
94125
echo "Unknown option: $1"
95126
exit 1
96127
;;
97128
esac
98129
done
99130

131+
# Function to validate if concurrency values are positive integers
132+
validate_concurrency() {
  # Verify that the global array `concurrency_array` is non-empty and that
  # every element is a positive decimal integer.
  # Globals:  concurrency_array (read)
  # Outputs:  an error message on stderr for the first problem found
  # Exits:    status 1 on the first invalid value or when the array is empty;
  #           otherwise returns 0 to the caller
  local val
  # Digits only, with at least one non-zero digit: rejects "", "0", "000",
  # signs and whitespace, and needs no arithmetic, so arbitrarily long digit
  # strings cannot overflow the comparison.
  local -r positive_int_re='^[0-9]*[1-9][0-9]*$'

  # An empty list would otherwise pass validation and run zero benchmarks.
  if [[ ${#concurrency_array[@]} -eq 0 ]]; then
    echo "Error: No concurrency values provided. Must be a positive integer." >&2
    exit 1
  fi

  for val in "${concurrency_array[@]}"; do
    if ! [[ "$val" =~ $positive_int_re ]]; then
      echo "Error: Invalid concurrency value '$val'. Must be a positive integer." >&2
      exit 1
    fi
  done
}
140+
141+
# Split the comma-separated concurrency_list into the concurrency_array array.
# IFS is assigned only for this read command, so the shell's IFS is unchanged.
IFS=',' read -r -a concurrency_array <<< "$concurrency_list"
142+
# Validate concurrency values
143+
validate_concurrency
144+
100145
if [ "${mode}" == "aggregated" ]; then
101146
if [ "${tp}" == "0" ] && [ "${dp}" == "0" ]; then
102147
echo "--tensor-parallelism and --data-parallelism must be set for aggregated mode."
@@ -157,8 +202,15 @@ if [ $index -gt 0 ]; then
157202
echo "--------------------------------"
158203
fi
159204

205+
# Summarize the benchmark configuration before the concurrency sweep starts.
echo "Running genai-perf with:"
echo "Model: $model"
echo "ISL: $isl"
echo "OSL: $osl"
# [*] expands the array to one word joined by the first IFS character (a space
# by default); [@] inside a longer string splits it into several arguments.
echo "Concurrency levels: ${concurrency_array[*]}"
210+
160211
# Concurrency levels to test
161-
for concurrency in 1 2 4 8 16 32 64 128 256; do
212+
for concurrency in "${concurrency_array[@]}"; do
213+
echo "Run concurrency: $concurrency"
162214

163215
# NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
164216
# `ignore_eos` since they are not in the official OpenAI spec.
@@ -185,7 +237,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
185237
--artifact-dir ${artifact_dir} \
186238
-- \
187239
-v \
188-
--max-threads 256 \
240+
--max-threads ${concurrency} \
189241
-H 'Authorization: Bearer NOT USED' \
190242
-H 'Accept: text/event-stream'
191243

examples/llm/benchmarks/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,23 @@ Single-Node
242242
bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 8 --data-parallelism 2
243243
```
244244

245+
You can also run the benchmarking script with an explicit model, input sequence length, output sequence length, and set of concurrency levels to benchmark:
246+
247+
```bash
248+
bash -x /workspace/benchmarks/llm/perf.sh \
249+
--mode aggregated \
250+
--deployment-kind vllm_serve \
251+
--tensor-parallelism 1 \
252+
--data-parallelism 1 \
253+
--model neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
254+
--input-sequence-length 3000 \
255+
--output-sequence-length 150 \
256+
--url http://localhost:8000 \
257+
--concurrency 1,2,4,8,16,32,64,128,256
258+
259+
# The `--concurrency` option accepts either a single value (e.g., 64) or a comma-separated list (e.g., 1,2,4,8) to specify multiple concurrency levels for benchmarking.
260+
```
261+
245262
> [!Important]
246263
> We should be careful when specifying these options to the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using aggregated serving in `vllm serve`. We have also accurately described that we have 2 workers with TP=4 (or TP=8 for two nodes).
247264

0 commit comments

Comments
 (0)