
Commit 8231fd9

Merge branch 'main' into extendmediaconnector
2 parents: 923a964 + a9fe079

78 files changed (+1750, -478 lines)


.buildkite/test-amd.yaml

Lines changed: 3 additions & 3 deletions
@@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
   # if this test fails, it means the nightly torch version is not compatible with some
   # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
@@ -286,7 +286,7 @@ steps:

 - label: Engine Test # 25min
   timeout_in_minutes: 40
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   #grade: Blocking
   source_file_dependencies:
@@ -908,7 +908,7 @@ steps:

 - label: Quantized Models Test # 45 min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 1 deletion
@@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
   # if this test fails, it means the nightly torch version is not compatible with some
   # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
   soft_fail: true
   source_file_dependencies:
   - requirements/nightly_torch_test.txt
@@ -498,6 +498,8 @@ steps:
   - tests/kernels/moe
   - vllm/model_executor/layers/fused_moe/
   - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
   commands:
   - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
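The `--shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT` pair above splits the moe kernel tests across the `parallelism: 2` agents. A minimal sketch of the underlying idea, round-robin sharding by index (our illustration, not the vLLM or pytest-shard implementation):

```python
def shard(items, shard_id, num_shards):
    """Assign every item to exactly one shard, round-robin by index."""
    if not 0 <= shard_id < num_shards:
        raise ValueError("shard_id must be in [0, num_shards)")
    return [item for i, item in enumerate(items) if i % num_shards == shard_id]

# With parallelism: 2, each agent runs a disjoint half of the tests.
tests = [f"test_moe_{i}" for i in range(5)]
agent0 = shard(tests, 0, 2)  # indices 0, 2, 4
agent1 = shard(tests, 1, 2)  # indices 1, 3
assert sorted(agent0 + agent1) == sorted(tests)  # full coverage, no overlap
```

Because the assignment depends only on the index, every agent computes the same partition independently, with no coordination needed beyond the two environment variables.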

.pre-commit-config.yaml

Lines changed: 9 additions & 9 deletions
@@ -45,7 +45,7 @@ repos:
   - id: format-torch-nightly-test
     name: reformat nightly_torch_test.txt to be in sync with test.in
     language: python
-    entry: python tools/generate_nightly_torch_test.py
+    entry: python tools/pre_commit/generate_nightly_torch_test.py
     files: ^requirements/test\.(in|txt)$
   - id: mypy-local
     name: Run mypy locally for lowest supported Python version
@@ -78,12 +78,12 @@ repos:
     stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts
-    entry: tools/shellcheck.sh
+    entry: tools/pre_commit/shellcheck.sh
     language: script
     types: [shell]
   - id: png-lint
     name: Lint PNG exports from excalidraw
-    entry: tools/png-lint.sh
+    entry: tools/pre_commit/png-lint.sh
     language: script
     types: [png]
   - id: signoff-commit
@@ -100,12 +100,12 @@ repos:
     stages: [commit-msg]
   - id: check-spdx-header
     name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
+    entry: python tools/pre_commit/check_spdx_header.py
     language: python
     types: [python]
   - id: check-root-lazy-imports
     name: Check root lazy imports
-    entry: python tools/check_init_lazy_imports.py
+    entry: python tools/pre_commit/check_init_lazy_imports.py
     language: python
     types: [python]
   - id: check-filenames
@@ -119,19 +119,19 @@ repos:
     pass_filenames: false
   - id: update-dockerfile-graph
     name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
+    entry: tools/pre_commit/update-dockerfile-graph.sh
     language: script
   - id: enforce-import-regex-instead-of-re
     name: Enforce import regex as re
-    entry: python tools/enforce_regex_import.py
+    entry: python tools/pre_commit/enforce_regex_import.py
     language: python
     types: [python]
     pass_filenames: false
     additional_dependencies: [regex]
   # forbid directly import triton
   - id: forbid-direct-triton-import
     name: "Forbid direct 'import triton'"
-    entry: python tools/check_triton_import.py
+    entry: python tools/pre_commit/check_triton_import.py
     language: python
     types: [python]
     pass_filenames: false
@@ -144,7 +144,7 @@ repos:
     additional_dependencies: [regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
-    entry: python tools/validate_config.py
+    entry: python tools/pre_commit/validate_config.py
     language: python
     additional_dependencies: [regex]
   # Keep `suggestion` last

docs/cli/.nav.yml

Lines changed: 1 addition & 1 deletion
@@ -5,4 +5,4 @@ nav:
   - complete.md
   - run-batch.md
   - vllm bench:
-    - bench/*.md
+    - bench/**/*.md
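This one-character-class change is what lets the nav pick up the new nested `bench/sweep/` pages: a single `*` does not cross directory separators, while `**` matches any number of nested directories, including zero. Python's `pathlib` globbing has the analogous semantics, which makes the difference easy to demonstrate:

```python
import tempfile
from pathlib import Path

# Recreate the docs layout from this commit: one page directly under
# bench/ plus the new nested sweep/ pages.
root = Path(tempfile.mkdtemp())
(root / "bench" / "sweep").mkdir(parents=True)
(root / "bench" / "latency.md").touch()
(root / "bench" / "sweep" / "plot.md").touch()

flat = sorted(p.relative_to(root).as_posix() for p in root.glob("bench/*.md"))
deep = sorted(p.relative_to(root).as_posix() for p in root.glob("bench/**/*.md"))

# The old pattern misses the nested page; the new one finds both:
# flat == ["bench/latency.md"]
# deep == ["bench/latency.md", "bench/sweep/plot.md"]
```

(The nav file is interpreted by the docs tooling rather than `pathlib`, but the `*` vs `**` distinction is the same.)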

docs/cli/bench/sweep/plot.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# vllm bench sweep plot
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_sweep_plot.md"

docs/cli/bench/sweep/serve.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# vllm bench sweep serve
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_sweep_serve.md"

docs/cli/bench/sweep/serve_sla.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# vllm bench sweep serve_sla
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Options
+
+--8<-- "docs/argparse/bench_sweep_serve_sla.md"

docs/contributing/benchmarks.md

Lines changed: 3 additions & 3 deletions
@@ -1061,7 +1061,7 @@ Follow these steps to run the script:
 Example command:

 ```bash
-python -m vllm.benchmarks.sweep.serve \
+vllm bench sweep serve \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
     --serve-params benchmarks/serve_hparams.json \
@@ -1109,7 +1109,7 @@ For example, to ensure E2E latency within different target values for 99% of req
 Example command:

 ```bash
-python -m vllm.benchmarks.sweep.serve_sla \
+vllm bench sweep serve_sla \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
     --serve-params benchmarks/serve_hparams.json \
@@ -1138,7 +1138,7 @@ The algorithm for adjusting the SLA variable is as follows:
 Example command:

 ```bash
-python -m vllm.benchmarks.sweep.plot benchmarks/results/<timestamp> \
+vllm bench sweep plot benchmarks/results/<timestamp> \
     --var-x max_concurrency \
     --row-by random_input_len \
     --col-by random_output_len \

docs/mkdocs/hooks/generate_argparse.py

Lines changed: 24 additions & 10 deletions
@@ -56,15 +56,20 @@ def auto_mock(module, attr, max_mocks=50):
     )


-latency = auto_mock("vllm.benchmarks", "latency")
-serve = auto_mock("vllm.benchmarks", "serve")
-throughput = auto_mock("vllm.benchmarks", "throughput")
+bench_latency = auto_mock("vllm.benchmarks", "latency")
+bench_serve = auto_mock("vllm.benchmarks", "serve")
+bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
+bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs")
+bench_sweep_serve_sla = auto_mock(
+    "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
+)
+bench_throughput = auto_mock("vllm.benchmarks", "throughput")
 AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
 EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs")
 ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
 CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
-cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
-run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
+openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
+openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
 FlexibleArgumentParser = auto_mock(
     "vllm.utils.argparse_utils", "FlexibleArgumentParser"
 )
@@ -114,6 +119,9 @@ def add_arguments(self, actions):
             self._markdown_output.append(f"{action.help}\n\n")

             if (default := action.default) != SUPPRESS:
+                # Make empty string defaults visible
+                if default == "":
+                    default = '""'
                 self._markdown_output.append(f"Default: `{default}`\n\n")

     def format_help(self):
@@ -150,17 +158,23 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):

 # Create parsers to document
 parsers = {
+    # Engine args
     "engine_args": create_parser(EngineArgs.add_cli_args),
     "async_engine_args": create_parser(
         AsyncEngineArgs.add_cli_args, async_args_only=True
     ),
-    "serve": create_parser(cli_args.make_arg_parser),
+    # CLI
+    "serve": create_parser(openai_cli_args.make_arg_parser),
     "chat": create_parser(ChatCommand.add_cli_args),
     "complete": create_parser(CompleteCommand.add_cli_args),
-    "bench_latency": create_parser(latency.add_cli_args),
-    "bench_throughput": create_parser(throughput.add_cli_args),
-    "bench_serve": create_parser(serve.add_cli_args),
-    "run-batch": create_parser(run_batch.make_arg_parser),
+    "run-batch": create_parser(openai_run_batch.make_arg_parser),
+    # Benchmark CLI
+    "bench_latency": create_parser(bench_latency.add_cli_args),
+    "bench_serve": create_parser(bench_serve.add_cli_args),
+    "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
+    "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args),
+    "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
+    "bench_throughput": create_parser(bench_throughput.add_cli_args),
 }

 # Generate documentation for each parser
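The new branch in `add_arguments` above exists because an argument whose default is the empty string would otherwise render as a blank `Default:` value in the generated Markdown. A standalone sketch of that rendering rule (the `render_default` helper is ours, for illustration):

```python
from argparse import SUPPRESS

def render_default(default):
    """Format an argparse default for Markdown docs, mirroring the hook above."""
    if default == SUPPRESS:
        return None  # suppressed defaults are not documented at all
    if default == "":
        default = '""'  # make empty-string defaults visible
    return f"Default: `{default}`"

# An option declared with default="" now documents as `""` instead of
# an empty code span:
# render_default("")   == 'Default: `""`'
# render_default(None) == 'Default: `None`'
```

Note that only the exact empty string is rewritten; falsy defaults like `0`, `None`, or `False` still render literally.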

requirements/cuda.txt

Lines changed: 0 additions & 2 deletions
@@ -13,5 +13,3 @@ torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytor
 # xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.4.1
-# Triton Kernels are needed for mxfp4 fused moe. (Should be updated alongside torch)
-triton_kernels @ git+https://github.com/triton-lang/triton.git@v3.5.0#subdirectory=python/triton_kernels
