    --enable-lora \
    --lora-path yard1/llama-2-7b-sql-lora-test
```

---
## Example - Structured Output Benchmark

Benchmark the performance of structured output generation (JSON schema, grammar, regex, and choice).

### Server Setup

```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```
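
Before benchmarking, it can be worth confirming the server is up (this assumes the default port 8000):

```bash
# Lists the served models; a JSON response means the server is ready
curl http://localhost:8000/v1/models
```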

### JSON Schema Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset json \
    --structured-output-ratio 1.0 \
    --request-rate 10 \
    --num-prompts 1000
```
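
To see the kind of request this benchmark issues, a single schema-constrained completion can be sent by hand. The sketch below assumes the server's guided-decoding extension parameter `guided_json` and uses a toy schema as an illustrative stand-in; the benchmark generates its own prompts and schemas.

```bash
# Hedged sketch: one JSON-schema-constrained request (toy schema, not the benchmark's)
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "NousResearch/Hermes-3-Llama-3.1-8B",
        "prompt": "Generate a JSON object describing a person:",
        "max_tokens": 100,
        "guided_json": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "age": {"type": "integer"}
            },
            "required": ["name", "age"]
        }
    }'
```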

### Grammar-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset grammar \
    --structure-type grammar \
    --request-rate 10 \
    --num-prompts 1000
```

### Regex-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset regex \
    --request-rate 10 \
    --num-prompts 1000
```
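
A hand-rolled equivalent of one regex-constrained request, again assuming the `guided_regex` extension parameter; the IPv4 pattern here is an illustrative stand-in:

```bash
# Hedged sketch: output is constrained to match the regex
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "NousResearch/Hermes-3-Llama-3.1-8B",
        "prompt": "The IPv4 address of localhost is ",
        "max_tokens": 20,
        "guided_regex": "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
    }'
```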

### Choice-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset choice \
    --request-rate 10 \
    --num-prompts 1000
```
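
The choice dataset constrains each response to a fixed set of options. A single such request, assuming the `guided_choice` extension parameter:

```bash
# Hedged sketch: output is constrained to one of the listed choices
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "NousResearch/Hermes-3-Llama-3.1-8B",
        "prompt": "Is the following review positive or negative? I love vLLM.",
        "max_tokens": 5,
        "guided_choice": ["positive", "negative"]
    }'
```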

### XGrammar Benchmark Dataset

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset xgrammar_bench \
    --request-rate 10 \
    --num-prompts 1000
```

---
## Example - Long Document QA Throughput Benchmark

Benchmark the performance of long-document question answering with prefix caching.

### Basic Long Document QA Test

```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-documents 16 \
    --document-length 2000 \
    --output-len 50 \
    --repeat-count 5
```
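
To quantify the benefit of prefix caching, the same workload can be run with caching left off for comparison (this assumes caching is disabled when `--enable-prefix-caching` is omitted):

```bash
# Same workload, prefix caching disabled, as a baseline
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --num-documents 16 \
    --document-length 2000 \
    --output-len 50 \
    --repeat-count 5
```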

### Different Repeat Modes

```bash
# Random mode (default): repeat the prompt list, then shuffle it
# (e.g. with prompts A, B and repeat count 3: B A A B A B)
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-documents 8 \
    --document-length 3000 \
    --repeat-count 3 \
    --repeat-mode random

# Tile mode: repeat the entire prompt list in sequence
# (e.g. A B A B A B)
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-documents 8 \
    --document-length 3000 \
    --repeat-count 3 \
    --repeat-mode tile

# Interleave mode: repeat each prompt consecutively
# (e.g. A A A B B B)
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-documents 8 \
    --document-length 3000 \
    --repeat-count 3 \
    --repeat-mode interleave
```

---
## Example - Prefix Caching Benchmark

Benchmark the efficiency of automatic prefix caching.

### Fixed Prompt with Prefix Caching

```bash
python3 benchmarks/benchmark_prefix_caching.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-prompts 1 \
    --repeat-count 100 \
    --input-length-range 128:256
```
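
Because one prompt is repeated 100 times, every request after the first should be served almost entirely from the cache. A comparison run without `--enable-prefix-caching` (again assuming caching is off when the flag is omitted) isolates the speedup:

```bash
# Same workload, prefix caching disabled, as a baseline
python3 benchmarks/benchmark_prefix_caching.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --num-prompts 1 \
    --repeat-count 100 \
    --input-length-range 128:256
```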

### ShareGPT Dataset with Prefix Caching

```bash
# Download the ShareGPT dataset first:
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

python3 benchmarks/benchmark_prefix_caching.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
    --enable-prefix-caching \
    --num-prompts 20 \
    --repeat-count 5 \
    --input-length-range 128:256
```

---
## Example - Request Prioritization Benchmark

Benchmark the performance of request prioritization in vLLM.

### Basic Prioritization Test

```bash
python3 benchmarks/benchmark_prioritization.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --input-len 128 \
    --output-len 64 \
    --num-prompts 100 \
    --scheduling-policy priority
```

### Multiple Sequences per Prompt

```bash
python3 benchmarks/benchmark_prioritization.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --input-len 128 \
    --output-len 64 \
    --num-prompts 100 \
    --scheduling-policy priority \
    --n 2
```
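
For a baseline to compare against, the same workload can be run under first-come-first-served scheduling (assuming the script also accepts `fcfs`, vLLM's default scheduling policy, for `--scheduling-policy`):

```bash
# Same workload under the default FCFS policy, as a baseline
python3 benchmarks/benchmark_prioritization.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --input-len 128 \
    --output-len 64 \
    --num-prompts 100 \
    --scheduling-policy fcfs
```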