1414# See the License for the specific language governing permissions and
1515# limitations under the License.
1616
17-
18- # Parse command line arguments
17+ # Default Values
# These are the values shown as "(default: ...)" by print_help; the argument
# parsing loop further down replaces them when the matching option is given.
# Hugging Face model ID to benchmark (--model).
1918model=" neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic"
# Target URL for inference requests (--url).
2019url=" http://localhost:8000"
# Serving mode, aggregated or disaggregated (--mode).
2120mode=" aggregated"
# Root directory to store benchmark results (--artifacts-root-dir).
2221artifacts_root_dir=" artifacts_root"
# Deployment tag used for pareto chart labels (--deployment-kind).
2322deployment_kind=" dynamo"
# Comma-separated concurrency levels (--concurrency); split into
# concurrency_array and validated after argument parsing.
23+ concurrency_list=" 1,2,4,8,16,32,64,128,256"
2424
2525# Input Sequence Length (isl) 3000 and Output Sequence Length (osl) 150 are
2626# selected for chat use case. Note that for other use cases, the results and
@@ -35,49 +35,77 @@ prefill_dp=0
3535decode_tp=0
3636decode_dp=0
3737
# Print the usage message for this script to stdout and terminate.
# Globals (read): tp, dp, prefill_tp, prefill_dp, decode_tp, decode_dp, model,
#                 isl, osl, url, concurrency_list, mode, artifacts_root_dir,
#                 deployment_kind
# Outputs: one line per supported option, each followed by "(default: <value>)".
# Returns: never returns; the script exits with status 0.
# NOTE(review): the "(default: ...)" values are expanded when the function is
# called, so options parsed before --help on the same command line are shown
# with their overridden values rather than the built-in defaults.
38+ print_help () {
39+ echo " Usage: $0 [OPTIONS]"
40+ echo
41+ echo " Options:"
42+ echo " --tensor-parallelism, --tp <int> Tensor parallelism (default: $tp )"
43+ echo " --data-parallelism, --dp <int> Data parallelism (default: $dp )"
44+ echo " --prefill-tensor-parallelism, --prefill-tp <int> Prefill tensor parallelism (default: $prefill_tp )"
45+ echo " --prefill-data-parallelism, --prefill-dp <int> Prefill data parallelism (default: $prefill_dp )"
46+ echo " --decode-tensor-parallelism, --decode-tp <int> Decode tensor parallelism (default: $decode_tp )"
47+ echo " --decode-data-parallelism, --decode-dp <int> Decode data parallelism (default: $decode_dp )"
48+ echo " --model <model_id> Hugging Face model ID to benchmark (default: $model )"
49+ echo " --input-sequence-length, --isl <int> Input sequence length (default: $isl )"
50+ echo " --output-sequence-length, --osl <int> Output sequence length (default: $osl )"
51+ echo " --url <http://host:port> Target URL for inference requests (default: $url )"
52+ echo " --concurrency <list> Comma-separated concurrency levels (default: $concurrency_list )"
53+ echo " --mode <aggregated|disaggregated> Serving mode (default: $mode )"
54+ echo " --artifacts-root-dir <path> Root directory to store benchmark results (default: $artifacts_root_dir )"
55+ echo " --deployment-kind <type> Deployment tag used for pareto chart labels (default: $deployment_kind )"
56+ echo " --help Show this help message and exit"
57+ echo
# Asking for help is a successful run, so exit 0 instead of returning to the
# argument parser.
58+ exit 0
59+ }
60+
61+ # Parse command line arguments
3862# The defaults can be overridden by command line arguments.
3963while [[ $# -gt 0 ]]; do
4064 case $1 in
41- --tensor-parallelism)
65+ --tensor-parallelism|--tp )
4266 tp=" $2 "
4367 shift 2
4468 ;;
45- --data-parallelism)
69+ --data-parallelism|--dp )
4670 dp=" $2 "
4771 shift 2
4872 ;;
49- --prefill-tensor-parallelism)
73+ --prefill-tensor-parallelism|--prefill-tp )
5074 prefill_tp=" $2 "
5175 shift 2
5276 ;;
53- --prefill-data-parallelism)
77+ --prefill-data-parallelism|--prefill-dp )
5478 prefill_dp=" $2 "
5579 shift 2
5680 ;;
57- --decode-tensor-parallelism)
81+ --decode-tensor-parallelism|--decode-tp )
5882 decode_tp=" $2 "
5983 shift 2
6084 ;;
61- --decode-data-parallelism)
85+ --decode-data-parallelism|--decode-dp )
6286 decode_dp=" $2 "
6387 shift 2
6488 ;;
65- --model)
89+ --model)
6690 model=" $2 "
6791 shift 2
6892 ;;
69- --input-sequence-length)
93+ --input-sequence-length|--isl )
7094 isl=" $2 "
7195 shift 2
7296 ;;
73- --output-sequence-length)
97+ --output-sequence-length|--osl )
7498 osl=" $2 "
7599 shift 2
76100 ;;
77101 --url)
78102 url=" $2 "
79103 shift 2
80104 ;;
105+ --concurrency)
106+ concurrency_list=" $2 "
107+ shift 2
108+ ;;
81109 --mode)
82110 mode=" $2 "
83111 shift 2
@@ -90,13 +118,30 @@ while [[ $# -gt 0 ]]; do
90118 deployment_kind=" $2 "
91119 shift 2
92120 ;;
121+ --help)
122+ print_help
123+ ;;
93124 * )
94125 echo " Unknown option: $1 "
95126 exit 1
96127 ;;
97128 esac
98129done
99130
#######################################
# Validate that the requested concurrency levels are usable.
# Globals:   concurrency_array (read)
# Arguments: none
# Outputs:   an error message on stderr for an empty list or for the first
#            invalid value
# Returns:   0 when every value is a positive integer; otherwise the script
#            exits with status 1
#######################################
validate_concurrency() {
  local val

  # An empty list would pass a per-value check and the benchmark loop would
  # then run zero iterations without any diagnostic, so reject it explicitly.
  if [[ ${#concurrency_array[@]} -eq 0 ]]; then
    echo "Error: No concurrency values provided. Must be a comma-separated list of positive integers." >&2
    exit 1
  fi

  for val in "${concurrency_array[@]}"; do
    # Digits only, with at least one non-zero digit. Checking positivity in
    # the regex avoids the arithmetic test, which errors out on digit strings
    # too large for the shell's integer type.
    if ! [[ "$val" =~ ^0*[1-9][0-9]*$ ]]; then
      echo "Error: Invalid concurrency value '$val'. Must be a positive integer." >&2
      exit 1
    fi
  done
}
140+
# Split concurrency_list into the concurrency_array array. IFS is given as a
# prefix assignment, so it applies to this read only; -r keeps backslashes
# literal and -a stores the resulting fields as array elements.
141+ IFS=' ,' read -r -a concurrency_array <<< " $concurrency_list"
142+ # Validate concurrency values
# validate_concurrency exits the script with status 1 on the first bad value.
143+ validate_concurrency
144+
100145if [ " ${mode} " == " aggregated" ]; then
101146 if [ " ${tp} " == " 0" ] && [ " ${dp} " == " 0" ]; then
102147 echo " --tensor-parallelism and --data-parallelism must be set for aggregated mode."
@@ -157,8 +202,15 @@ if [ $index -gt 0 ]; then
157202 echo " --------------------------------"
158203fi
159204
# Print a summary of the benchmark configuration before the concurrency sweep.
205+ echo " Running genai-perf with:"
206+ echo " Model: $model "
207+ echo " ISL: $isl "
208+ echo " OSL: $osl "
# NOTE(review): "${concurrency_array[@]}" inside a larger quoted string expands
# to several words (ShellCheck SC2145). echo joins its arguments with spaces so
# the output looks the same, but "${concurrency_array[*]}" would state the
# intent directly - consider switching.
209+ echo " Concurrency levels: ${concurrency_array[@]} "
210+
160211# Concurrency levels to test
161- for concurrency in 1 2 4 8 16 32 64 128 256; do
212+ for concurrency in " ${concurrency_array[@]} " ; do
213+ echo " Run concurrency: $concurrency "
162214
163215 # NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
164216 # `ignore_eos` since they are not in the official OpenAI spec.
@@ -185,7 +237,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
185237 --artifact-dir ${artifact_dir} \
186238 -- \
187239 -v \
188- --max-threads 256 \
240+ --max-threads ${concurrency} \
189241 -H ' Authorization: Bearer NOT USED' \
190242 -H ' Accept: text/event-stream'
191243
0 commit comments